diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 00823951dcc..f265a42f9d5 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,21 +1,6 @@ -# Document 1: Buildkite loads only this block on first parse. The next step resolves docs-only skip-ci -# from git diff, then uploads document 2. When docs-only skip applies, image-build still runs if nightly-test -# / main NIGHTLY so upload-nightly is not skipped together with test-ready/test-merge. -# -# Document 2: appended after `---`; same file, read by upload_pipeline_with_skip_ci.sh (not evaluated as a second pipeline by Buildkite). -steps: - - label: ":github: Resolve skip-ci & upload pipeline" - key: upload-ci-pipeline - commands: - - "bash .buildkite/scripts/upload_pipeline_with_skip_ci.sh" - agents: - queue: "cpu_queue_premerge" - ---- steps: - label: ":docker: Build image" key: image-build - if: __IMAGE_BUILD_IF__ commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker build --progress=plain --file docker/Dockerfile.ci -t vllm-omni-ci ." @@ -28,7 +13,7 @@ steps: - label: "Upload Ready Pipeline" depends_on: image-build key: upload-ready-pipeline - if: __UPLOAD_READY_IF__ + if: build.branch != "main" && build.pull_request.labels includes "ready" commands: - buildkite-agent pipeline upload .buildkite/test-ready.yml agents: @@ -38,25 +23,17 @@ steps: - label: "Upload Merge Pipeline" depends_on: image-build key: upload-merge-pipeline - if: __UPLOAD_MERGE_IF__ + if: build.branch == "main" && build.env("NIGHTLY") != "1" commands: - buildkite-agent pipeline upload .buildkite/test-merge.yml agents: queue: "cpu_queue_premerge" - # L4 Test — main+NIGHTLY=1 (scheduled), or PR with specific label (e.g. add label then Rebuild) + # L4 Test — main+NIGHTLY=1 (scheduled), or PR with label nightly-test (e.g. 
add label then Rebuild) - label: "Upload Nightly Pipeline" depends_on: image-build key: upload-nightly-pipeline - if: >- - (build.branch == "main" && build.env("NIGHTLY") == "1") || - (build.branch != "main" && ( - build.pull_request.labels includes "nightly-test" || - build.pull_request.labels includes "omni-test" || - build.pull_request.labels includes "tts-test" || - build.pull_request.labels includes "diffusion-x2iat-test" || - build.pull_request.labels includes "diffusion-x2v-test" - )) + if: '(build.branch == "main" && build.env("NIGHTLY") == "1") || (build.branch != "main" && build.pull_request.labels includes "nightly-test")' commands: - buildkite-agent pipeline upload .buildkite/test-nightly.yml agents: diff --git a/.buildkite/scripts/generate-and-upload-nightly-index.sh b/.buildkite/scripts/generate-and-upload-nightly-index.sh index b09c13f5cf9..6624af32303 100755 --- a/.buildkite/scripts/generate-and-upload-nightly-index.sh +++ b/.buildkite/scripts/generate-and-upload-nightly-index.sh @@ -19,7 +19,7 @@ has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) if [[ "$has_new_python" -eq 0 ]]; then # use new python from docker docker pull python:3-slim - PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3" + PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" fi echo "Using python interpreter: $PYTHON" @@ -36,7 +36,7 @@ mkdir -p "$INDICES_OUTPUT_DIR" # HACK: we do not need regex module here, but it is required by pre-commit hook # To avoid any external dependency, we simply replace it back to the stdlib re module -sed -i.bak 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py && rm -f .buildkite/scripts/generate-nightly-index.py.bak +sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py # Generate indices -- the version is just the commit hash (not omni/{commit}) # because relative paths are computed between the 
index and wheel directories, @@ -73,16 +73,15 @@ echo "Pure version (without variant): $pure_version" # re-generate and copy to /omni/{version}/ only if it does not have "dev" in the version if [[ "$version" != *"dev"* ]]; then - s3_version="v$pure_version" - echo "Re-generating indices for /omni/$s3_version/" + echo "Re-generating indices for /omni/$pure_version/" rm -rf "${INDICES_OUTPUT_DIR:?}" mkdir -p "$INDICES_OUTPUT_DIR" # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path $PYTHON .buildkite/scripts/generate-nightly-index.py \ - --version "$s3_version" \ + --version "$pure_version" \ --wheel-dir "$BUILDKITE_COMMIT" \ --current-objects "$obj_json" \ --output-dir "$INDICES_OUTPUT_DIR" \ --comment "version $pure_version" - aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/omni/$s3_version/" + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/omni/$pure_version/" fi diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index bb4a74a7044..c616c446b09 100755 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -4,7 +4,6 @@ import argparse import json -import re import sys from dataclasses import asdict, dataclass from datetime import datetime @@ -12,6 +11,8 @@ from typing import Any from urllib.parse import quote +import regex as re + def normalize_package_name(name: str) -> str: """Normalize package name per PEP 503.""" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 96c139c8f7b..f56f23b5deb 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -11,6 +11,15 @@ set -o pipefail export PYTHONPATH=".." 
# Print ROCm version +echo "--- Confirming Clean Initial State" +while true; do + sleep 3 + if grep -q clean /opt/amdgpu/etc/gpu_state; then + echo "GPUs state is \"clean\"" + break + fi +done + echo "--- ROCm info" rocminfo @@ -42,14 +51,25 @@ cleanup_docker() { # Call the cleanup docker function cleanup_docker +echo "--- Resetting GPUs" + +echo "reset" > /opt/amdgpu/etc/gpu_state + +while true; do + sleep 3 + if grep -q clean /opt/amdgpu/etc/gpu_state; then + echo "GPUs state is \"clean\"" + break + fi +done + echo "--- Pulling container" -## Temporary change to use AMD Docker Hub to store the vllm-omni image +## Temporary change to use AMD Docker Hub to store the vllm-ci image # to bypass the rate limit issue with ECR Public Gallery. -# Images are now stored in a separate repository for vllm-omni, instead of vllm-ci. # TODO: @tjtanaa point back to ECR Public Gallery # once the amd agents are configured to use ECR Public Gallery. # image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni" -image_name="rocm/vllm-omni:${BUILDKITE_COMMIT}" +image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}-rocm-omni" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" # TODO: @tjtanaa uncomment this once the amd agents are configured to use ECR Public Gallery. diff --git a/.buildkite/scripts/upload_pipeline_with_skip_ci.sh b/.buildkite/scripts/upload_pipeline_with_skip_ci.sh deleted file mode 100644 index 6259d39b290..00000000000 --- a/.buildkite/scripts/upload_pipeline_with_skip_ci.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env bash -# Evaluate docs-only skip-ci and upload continuation steps from the same `.buildkite/pipeline.yml` -# (YAML document after the first `---`). Buildkite `if` is evaluated at upload time. -set -euo pipefail - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -PIPELINE_YML="${ROOT}/.buildkite/pipeline.yml" - -# Prints a single digit to stdout: 1 = skip image CI, 0 = run. 
Logs go to stderr. -is_docs_only_change() { - local file_path - local has_any=0 - - while IFS= read -r file_path; do - [[ -z "${file_path}" ]] && continue - has_any=1 - - if [[ "${file_path}" == docs/* ]]; then - continue - fi - if [[ "${file_path}" == *.md ]]; then - continue - fi - if [[ "${file_path}" == "mkdocs.yaml" ]]; then - continue - fi - return 1 - done - - [[ "${has_any}" -eq 1 ]] -} - -resolve_skip_ci() { - local is_pr_build=0 - local files - local base_branch base_ref - - if [[ "${BUILDKITE_PULL_REQUEST:-false}" != "false" && -n "${BUILDKITE_PULL_REQUEST:-}" ]]; then - is_pr_build=1 - fi - - if [[ "${is_pr_build}" -eq 1 ]]; then - base_branch="${BUILDKITE_PULL_REQUEST_BASE_BRANCH:-main}" - if ! git rev-parse --verify "origin/${base_branch}" >/dev/null 2>&1; then - echo "resolve_skip_ci: origin/${base_branch} not found locally; trying fetch" >&2 - git fetch --depth=200 origin "${base_branch}" >/dev/null 2>&1 || true - fi - - base_ref="" - if git rev-parse --verify "origin/${base_branch}" >/dev/null 2>&1; then - base_ref="origin/${base_branch}" - elif git rev-parse --verify "${base_branch}" >/dev/null 2>&1; then - base_ref="${base_branch}" - else - echo "resolve_skip_ci: cannot resolve PR base ${base_branch}; skip-ci=0" >&2 - echo -n 0 - return 0 - fi - - if ! files="$(git diff --name-only "${base_ref}...${BUILDKITE_COMMIT}" 2>/dev/null)"; then - echo "resolve_skip_ci: failed to compute PR changed files; skip-ci=0" >&2 - echo -n 0 - return 0 - fi - elif [[ "${BUILDKITE_BRANCH:-}" == "main" ]]; then - if ! git rev-parse --verify "${BUILDKITE_COMMIT}^" >/dev/null 2>&1; then - echo "resolve_skip_ci: commit has no parent on main; skip-ci=0" >&2 - echo -n 0 - return 0 - fi - if ! 
files="$(git diff --name-only "${BUILDKITE_COMMIT}^..${BUILDKITE_COMMIT}" 2>/dev/null)"; then - echo "resolve_skip_ci: failed to compute main changed files; skip-ci=0" >&2 - echo -n 0 - return 0 - fi - else - echo "resolve_skip_ci: not PR/main build; skip-ci=0" >&2 - echo -n 0 - return 0 - fi - - if is_docs_only_change <<< "${files}"; then - echo "resolve_skip_ci: docs-only change detected; skip-ci=1" >&2 - echo -n 1 - return 0 - fi - - echo "resolve_skip_ci: non-doc changes detected; skip-ci=0" >&2 - echo -n 0 -} - -SKIP_CI="$(resolve_skip_ci)" - -if [[ ! -f "${PIPELINE_YML}" ]]; then - echo "upload_pipeline_with_skip_ci: missing ${PIPELINE_YML}" >&2 - exit 1 -fi - -export ROOT SKIP_CI PIPELINE_YML -python3 <<'PY' | buildkite-agent pipeline upload -import os -import pathlib - -path = pathlib.Path(os.environ["PIPELINE_YML"]) -text = path.read_text(encoding="utf-8") -sep = "\n---\n" -if sep not in text: - raise SystemExit( - "upload_pipeline_with_skip_ci: .buildkite/pipeline.yml must contain a '\\n---\\n' separator " - "(document 1 = bootstrap, document 2 = uploaded steps)" - ) -_, continuation = text.split(sep, 1) - -skip = os.environ.get("SKIP_CI") == "1" -# When docs-only skip-ci: skip default CI image, but still build for L4 nightly (PR label nightly-test or -# main NIGHTLY=1), otherwise upload-nightly (depends_on image-build) would be skipped too. -nightly_only = ( - '(build.pull_request.labels includes "nightly-test") ' - '|| (build.branch == "main" && build.env("NIGHTLY") == "1")' -) -# Placeholder in pipeline.yml is `if: __IMAGE_BUILD_IF__` (valid YAML); replace value only. 
-if skip: - rep = f"'{nightly_only}'" - ready_rep = "'false'" - merge_rep = "'false'" -else: - rep = "'true'" - ready_rep = "'build.branch != \"main\" && build.pull_request.labels includes \"ready\"'" - merge_rep = "'(build.branch == \"main\" && build.env(\"NIGHTLY\") != \"1\") || (build.branch != \"main\" && build.pull_request.labels includes \"merge-test\")'" -rendered = ( - continuation - .replace("__IMAGE_BUILD_IF__", rep) - .replace("__UPLOAD_READY_IF__", ready_rep) - .replace("__UPLOAD_MERGE_IF__", merge_rep) -) -print(rendered, end="") -PY diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml index ac52f60b35b..60ba0d9d416 100644 --- a/.buildkite/test-amd-merge.yml +++ b/.buildkite/test-amd-merge.yml @@ -32,6 +32,7 @@ steps: mirror_hardwares: [amdproduction] grade: Blocking commands: + - export GPU_ARCHS=gfx942 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - | @@ -54,7 +55,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 @@ -62,12 +63,13 @@ steps: mirror_hardwares: [amdproduction] grade: Blocking commands: + - export GPU_ARCHS=gfx942 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - timeout 15m pytest -s -v -m "core_model and cache and diffusion and not distributed_cuda and L4" -- label: "Diffusion Sequence Parallelism Test (Need 4 GPUs)" - agent_pool: mi325_4 +- label: "Diffusion Sequence Parallelism Test" + agent_pool: mi325_2 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking @@ -75,7 +77,6 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - 
timeout 20m pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py - - timeout 20m pytest -s -v tests/diffusion/distributed/test_ulysses_uaa_perf.py # merge-only tests - label: "Diffusion Tensor Parallelism Test" @@ -94,14 +95,22 @@ steps: commands: - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py -- label: "Engine Test" - agent_pool: mi325_1 +- label: "Benchmark & Engine Test" + agent_pool: mi325_2 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - timeout 20m pytest -s -v tests/engine/test_async_omni_engine_abort.py + - | + timeout 20m bash -c ' + set +e + pytest -s -v tests/benchmarks/test_serve_cli.py + EXIT1=\$? + pytest -s -v tests/engine/test_async_omni_engine_abort.py + EXIT2=\$? + exit \$((EXIT1 | EXIT2)) + ' - label: "Omni Model Test Qwen2-5-Omni" agent_pool: mi325_2 @@ -112,7 +121,6 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py - - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model" - label: "Omni Model Test Qwen3-Omni" agent_pool: mi325_2 @@ -123,10 +131,11 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - timeout 30m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model" + - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py + - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" - label: "Qwen3-TTS CustomVoice E2E Test" - agent_pool: mi325_1 + agent_pool: mi325_2 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking @@ -136,21 +145,21 @@ 
steps: export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py tests/e2e/offline_inference/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" + pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_customvoice.py ' - label: "Qwen3-TTS Base E2E Test" - agent_pool: mi325_1 + agent_pool: mi325_2 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - | - timeout 30m bash -c ' + timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py tests/e2e/offline_inference/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" + pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_base.py ' - label: "Diffusion Image Edit Test" @@ -164,58 +173,43 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py -# TODO: Bagel test on ROCm is very unstable. 
@tjtanaa -# Need to debug before reneable numerical changes across large PRs -# # split Bagel Model Test with H100 (Real Weights) into three tests -# - label: "Bagel Text2Img Model Test (1/3)" -# agent_pool: mi325_1 -# depends_on: amd-build -# mirror_hardwares: [amdproduction] -# grade: Blocking -# commands: -# - export GPU_ARCHS=gfx942 -# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 -# - export VLLM_LOGGING_LEVEL=DEBUG -# - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - export VLLM_ROCM_USE_AITER_RMSNORM=0 -# - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" -k "rocm" - -# - label: "Bagel Img2Img Model Test (2/3)" -# agent_pool: mi325_1 -# depends_on: amd-build -# mirror_hardwares: [amdproduction] -# grade: Blocking -# commands: -# - export GPU_ARCHS=gfx942 -# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 -# - export VLLM_LOGGING_LEVEL=DEBUG -# - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - export VLLM_ROCM_USE_AITER_RMSNORM=0 -# - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm" +# split Bagel Model Test with H100 (Real Weights) into three tests +- label: "Bagel Text2Img Model Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" -k "rocm" -# - label: "Bagel Online Serving Test (3/3)" -# agent_pool: mi325_1 -# depends_on: amd-build -# mirror_hardwares: [amdproduction] -# grade: Blocking -# commands: -# - export GPU_ARCHS=gfx942 -# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 -# - export 
VLLM_IMAGE_FETCH_TIMEOUT=60 -# - export VLLM_LOGGING_LEVEL=DEBUG -# - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - export VLLM_ROCM_USE_AITER_RMSNORM=0 -# - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm" +- label: "Bagel Img2Img Model Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm" -- label: "Voxtral-TTS E2E Test" +- label: "Bagel Online Serving Test" agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - | - timeout 20m bash -c ' - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" - ' + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_IMAGE_FETCH_TIMEOUT=60 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm" diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml index 30bbc769412..6e31163accb 100644 --- a/.buildkite/test-amd-ready.yaml +++ b/.buildkite/test-amd-ready.yaml @@ -9,37 +9,13 @@ steps: - export VLLM_ROCM_USE_AITER=0 - "timeout 20m pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml" -- label: "Voxtral TTS CUDA Unit Test" 
- agent_pool: mi325_1 - depends_on: amd-build - mirror_hardwares: [amdproduction] - grade: Blocking - commands: - - timeout 10m pytest -s -v tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py - - label: "Diffusion Model Test" - agent_pool: mi325_1 - depends_on: amd-build - mirror_hardwares: [amdproduction] - grade: Blocking - commands: - - timeout 30m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model" - -- label: "Diffusion Batching Test" - agent_pool: mi325_1 - depends_on: amd-build - mirror_hardwares: [amdproduction] - grade: Blocking - commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py -m "core_model and diffusion" --run-level "core_model" - -- label: "Custom Pipeline Test" - agent_pool: mi325_1 + agent_pool: mi325_2 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/custom_pipeline/ -m "core_model" + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model" - label: "Diffusion Model CPU offloading Test" agent_pool: mi325_1 @@ -47,6 +23,7 @@ steps: mirror_hardwares: [amdproduction] grade: Blocking commands: + - export GPU_ARCHS=gfx942 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - | @@ -69,7 +46,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 @@ -100,58 +77,47 @@ steps: commands: - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py -- label: 
"Engine Test" - agent_pool: mi325_1 +- label: "Benchmark & Engine Test" + agent_pool: mi325_2 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - | - timeout 15m bash -c ' - pytest -s -v tests/engine/test_async_omni_engine_abort.py + timeout 30m bash -c ' + set +e + pytest -s -v tests/benchmarks/test_serve_cli.py + EXIT1=\$? + pytest -s -v tests/engine/test_async_omni_engine_abort.py + EXIT2=\$? + exit \$((EXIT1 | EXIT2)) ' +- label: "Omni Model Test Qwen2-5-Omni" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 17m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py -# NOTE: This test is not running any thing. It is skipped and deselected. -# Currently it is = 1 skipped, 1 deselected, 17 warnings in 0.03s ====== -# - label: "Omni Model Test Qwen2-5-Omni" -# agent_pool: mi325_2 -# depends_on: amd-build -# mirror_hardwares: [amdproduction] -# grade: Blocking -# commands: -# - export VLLM_LOGGING_LEVEL=DEBUG -# - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py -m "core_model" --run-level "core_model" - -# - label: "Omni Model Test Qwen3-Omni" -# agent_pool: mi325_2 -# depends_on: amd-build -# mirror_hardwares: [amdproduction] -# grade: Blocking -# commands: -# - export VLLM_LOGGING_LEVEL=DEBUG -# - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 -# - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py -# - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model" - -- label: "MiMo-Audio E2E Test with H100" - agent_pool: mi325_1 +- label: "Omni Model Test Qwen3-Omni" + agent_pool: mi325_2 depends_on: amd-build mirror_hardwares: [amdproduction] grade: 
Blocking commands: - - | - timeout 30m bash -c ' - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "core_model" --run-level "core_model" - ' + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py + - timeout 10m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model" - label: "Qwen3-TTS E2E Test" - agent_pool: mi325_1 + agent_pool: mi325_2 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking @@ -159,82 +125,55 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - timeout 30m pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model" + - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model" -- label: "Voxtral-TTS E2E Test" +- label: "Diffusion Image Edit Test" agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - | - timeout 20m bash -c ' - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" - pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" - ' + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py -- label: "Diffusion Image Edit Test" +- label: "Bagel Text2Img Model Test" agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - export GPU_ARCHS=gfx942 + - export 
VLLM_TEST_CLEAN_GPU_MEMORY=1 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py - -# TODO: Bagel test on ROCm is very unstable. @tjtanaa -# Need to debug before reneable numerical changes across large PRs -# - label: "Bagel Text2Img Model Test" -# agent_pool: mi325_1 -# depends_on: amd-build -# mirror_hardwares: [amdproduction] -# grade: Blocking -# commands: -# - export GPU_ARCHS=gfx942 -# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 -# - export VLLM_LOGGING_LEVEL=DEBUG -# - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - export VLLM_ROCM_USE_AITER_RMSNORM=0 -# - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" -k "rocm" - -# - label: "Bagel Img2Img Model Test" -# agent_pool: mi325_1 -# depends_on: amd-build -# mirror_hardwares: [amdproduction] -# grade: Blocking -# commands: -# - export GPU_ARCHS=gfx942 -# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 -# - export VLLM_LOGGING_LEVEL=DEBUG -# - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - export VLLM_ROCM_USE_AITER_RMSNORM=0 -# - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" -k "rocm" + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" -k "rocm" -# - label: "Bagel Online Serving Test" -# agent_pool: mi325_1 -# depends_on: amd-build -# mirror_hardwares: [amdproduction] -# grade: Blocking -# commands: -# - export GPU_ARCHS=gfx942 -# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 -# - export VLLM_IMAGE_FETCH_TIMEOUT=60 -# - export VLLM_LOGGING_LEVEL=DEBUG -# - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - export VLLM_ROCM_USE_AITER_RMSNORM=0 -# - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" -k "rocm" +- label: "Bagel Img2Img 
Model Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" -k "rocm" -- label: "CosyVoice3-TTS E2E Test" +- label: "Bagel Online Serving Test" agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - | - timeout 20m bash -c ' - pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model" - ' + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_IMAGE_FETCH_TIMEOUT=60 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" -k "rocm" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0b9a3f47aba..e175385ff0d 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -117,17 +117,3 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py - - -- label: "Omni Sleep Mode Test" - timeout_in_minutes: 40 - agent_pool: mi325_2 - depends_on: amd-build - mirror_hardwares: [amdproduction] - grade: Blocking - commands: - - export GPU_ARCHS=gfx942 - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - - pytest -s -v tests/e2e/offline_inference/test_omni_sleep_mode.py -m "advanced_model and omni and MI325" --run-level "advanced_model" diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 
691f3f8764d..b0b5a639618 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -1,8 +1,3 @@ -env: - VLLM_WORKER_MULTIPROC_METHOD: spawn - HF_HUB_DOWNLOAD_TIMEOUT: 300 - HF_HUB_ETAG_TIMEOUT: 60 - steps: - label: "Simple Unit Test" depends_on: upload-merge-pipeline @@ -76,6 +71,24 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Audio Generation Model Test" + timeout_in_minutes: 20 + depends_on: upload-merge-pipeline + commands: + - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py + agents: + queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Diffusion Cache Backend Test" timeout_in_minutes: 15 depends_on: upload-merge-pipeline @@ -95,7 +108,7 @@ steps: - "/fsx/hf_cache:/fsx/hf_cache" - label: "Diffusion Sequence Parallelism Test" - timeout_in_minutes: 25 + timeout_in_minutes: 20 depends_on: upload-merge-pipeline commands: - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py tests/diffusion/distributed/test_ulysses_uaa_perf.py @@ -156,6 +169,7 @@ steps: commands: - | timeout 15m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/engine/test_async_omni_engine_abort.py ' agents: @@ -177,6 +191,7 @@ steps: depends_on: upload-merge-pipeline commands: - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU @@ -197,6 +212,7 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn export 
VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py tests/e2e/offline_inference/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" ' @@ -219,6 +235,7 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py tests/e2e/offline_inference/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" ' @@ -239,6 +256,7 @@ steps: timeout_in_minutes: 30 depends_on: upload-merge-pipeline commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model" agents: @@ -275,50 +293,11 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: "Audio Streaming Input Test with H100" - timeout_in_minutes: 30 - depends_on: upload-merge-pipeline - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - - pytest -s -v tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py -m "advanced_model" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - 
type: DirectoryOrCreate - - label: "Diffusion Image Edit Test with H100 (1 GPU)" timeout_in_minutes: 20 depends_on: upload-merge-pipeline commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py agents: queue: "mithril-h100-pool" @@ -361,6 +340,7 @@ steps: - | timeout 55m bash -c ' set -e + export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_TEST_CLEAN_GPU_MEMORY=1 export VLLM_IMAGE_FETCH_TIMEOUT=60 pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" @@ -400,46 +380,6 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: "Omni Sleep Mode Test with H100" - timeout_in_minutes: 30 - depends_on: upload-merge-pipeline - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - - pytest -s -v tests/e2e/offline_inference/test_omni_sleep_mode.py -m "advanced_model and H100 and omni" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - label: "Voxtral-TTS E2E Test" timeout_in_minutes: 20 depends_on: upload-merge-pipeline @@ -447,8 +387,19 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py 
tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" ' + + - label: "CosyVoice3-TTS E2E Test" + timeout_in_minutes: 20 + depends_on: upload-merge-pipeline + commands: + - | + timeout 20m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "advanced_model" --run-level "advanced_model" + ' agents: queue: "mithril-h100-pool" plugins: diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index ce67b76d921..9dc88850618 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -1,693 +1,407 @@ -env: - VLLM_WORKER_MULTIPROC_METHOD: spawn - HF_HUB_DOWNLOAD_TIMEOUT: 300 - HF_HUB_ETAG_TIMEOUT: 60 - steps: - # Group: collapses under one heading in the Buildkite UI; child steps still run in parallel. - - group: ":card_index_dividers: Omni Model Test" - key: nightly-omni-test-group + - label: ":full_moon: Omni Model Test with H100" + timeout_in_minutes: 90 depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" || build.pull_request.labels includes "omni-test" - steps: - - label: ":full_moon: Omni · Function Test" - timeout_in_minutes: 90 - commands: - - pytest -s -v tests/e2e/ -m "full_model and H100 and omni" --run-level "full_model" --ignore=tests/e2e/accuracy - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - 
emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Omni · Doc Test with L4" - timeout_in_minutes: 90 - commands: - - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/examples/ -m "full_model and omni and L4" --run-level "full_model" - agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - - label: ":full_moon: Omni · Doc Test with H100" - timeout_in_minutes: 90 - commands: - - pytest -s -v tests/examples/ -m "full_model and omni and H100" --run-level "full_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Omni · Accuracy Test" - timeout_in_minutes: 180 - commands: - - export SEED_TTS_WER_EVAL=1 - - export SEED_TTS_EVAL_DEVICE=cuda:1 - - | - set +e - pytest -s -v tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py -m "full_model" --run-level full_model - EXIT=$$? 
- buildkite-agent artifact upload "tests/e2e/accuracy/qwen3_omni/results/qwen_omni_acc/*.json" - exit $$EXIT - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Omni · Perf Test" - key: nightly-omni-performance - timeout_in_minutes: 180 - commands: - - export BENCHMARK_DIR=tests/dfx/perf/results - - | - set +e - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_omni.json - EXIT=$$? 
- buildkite-agent artifact upload "tests/dfx/perf/results/*.json" - exit $$EXIT - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - | + pytest -s -v \ + tests/examples/ \ + tests/e2e/online_serving/test_*_expansion.py \ + -m "advanced_model and H100 and omni" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - - group: ":card_index_dividers: TTS Model Test" - key: nightly-tts-test-group + - label: ":full_moon: Omni Model Test with L4" + timeout_in_minutes: 90 depends_on: upload-nightly-pipeline - if: 
build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" || build.pull_request.labels includes "tts-test" - steps: - - label: ":full_moon: TTS · Function Test" - timeout_in_minutes: 90 - commands: - - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/e2e/ -m "full_model and L4 and omni" --run-level "full_model" --ignore=tests/e2e/accuracy - agents: - queue: "gpu_1_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - - label: ":full_moon: TTS · Perf Test" - key: nightly-tts-performance - timeout_in_minutes: 180 - commands: - - export BENCHMARK_DIR=tests/dfx/perf/results - - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - | - set +e - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py --test-config-file tests/dfx/perf/tests/test_tts.json - EXIT=$$? 
- buildkite-agent artifact upload "tests/dfx/perf/results/*.json" - exit $$EXIT - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" + - pytest -s -v tests/examples/ tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" - # Diffusion X2I suite: x2i / x2a / x2t and related non-video paths; x2v is only in "Diffusion X2V Model Test" below. 
- - group: ":card_index_dividers: Diffusion X2I(&A&T) Model Test" - key: nightly-diffusion-x2iat-group + - label: ":full_moon: Diffusion Model Test with H100" + timeout_in_minutes: 120 depends_on: upload-nightly-pipeline - if: >- - build.env("NIGHTLY") == "1" || - build.pull_request.labels includes "nightly-test" || - build.pull_request.labels includes "diffusion-x2iat-test" - steps: - - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100" - timeout_in_minutes: 120 - commands: - - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion" -m "advanced_model and diffusion and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: 
Diffusion X2I(&A&T) · Function Test with L4" - timeout_in_minutes: 60 - commands: - - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy - agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - - label: ":full_moon: Diffusion X2I(&A&T) · Doc Test" - timeout_in_minutes: 60 - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - - pytest -s -v tests/examples/*/test_text_to_image.py -m "full_model and example and H100" --run-level "full_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + - label: ":full_moon: Diffusion Model (Wan2.2) Test with H100" + timeout_in_minutes: 90 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels 
includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py -m "advanced_model" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test" - timeout_in_minutes: 60 - commands: - - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - 
node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + - label: ":full_moon: Diffusion Model Test" + timeout_in_minutes: 60 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" - - label: ":full_moon: Diffusion X2I(&A&T) · GEdit-Bench Accuracy Test" - timeout_in_minutes: 60 - commands: - - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level full_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: 
/root/.cache/huggingface - - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE - value: "120" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - label: ":full_moon: Diffusion X2I(&A&T) · Accuracy Test" - timeout_in_minutes: 180 - commands: - - pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level full_model - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE - value: "120" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + - label: ":full_moon: Doc Example Code Test with H100" + timeout_in_minutes: 60 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_TEST_CLEAN_GPU_MEMORY="1" + - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: - name: devshm - emptyDir: - 
medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - - label: ":full_moon: Diffusion X2I(&A&T) · Perf Test" - key: nightly-diffusion-x2iat-performance - timeout_in_minutes: 180 - commands: - - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results - - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN - - export CACHE_DIT_VERSION=1.3.0 - # [HACK]: run upload in the same command block as pytest. - # Because `exit` aborts the entire commands list. - - | - set +e - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json - EXIT1=$$? - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json - EXIT2=$$? - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json - EXIT3=$$? - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json - EXIT4=$$? 
- buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" - exit $$((EXIT1 | EXIT2 | EXIT3 | EXIT4)) - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + - label: ":full_moon: Omni Model Perf Test & Testcase Statistics with H100" + key: nightly-omni-performance + timeout_in_minutes: 180 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export BENCHMARK_DIR=tests/dfx/perf/results + - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" + - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py + - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" + - python tools/nightly/buildkite_testcase_statistics.py -o tests/dfx/perf/results/buildkite_testcase_statistics.html + - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + 
value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - # Diffusion x2v only (Wan, HunyuanVideo, …). x2i/x2a/x2t live in the X2I group above, not here. - - group: ":card_index_dividers: Diffusion X2V Model Test" - key: nightly-diffusion-x2v-group + - label: ":full_moon: GEBench Accuracy Test with H100" + key: nightly-gebench-accuracy + timeout_in_minutes: 60 depends_on: upload-nightly-pipeline - if: >- - build.env("NIGHTLY") == "1" || - build.pull_request.labels includes "nightly-test" || - build.pull_request.labels includes "diffusion-x2v-test" - steps: - - label: ":full_moon: Diffusion X2V · Function Test" - timeout_in_minutes: 90 - commands: - - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py --run-level "full_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 
--accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - - label: ":full_moon: Diffusion X2V · Accuracy Test" - timeout_in_minutes: 180 - commands: - - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py -m full_model --run-level full_model - - pytest -s -v tests/e2e/accuracy/test_ltx2_3_video_similarity.py -m advanced_model --run-level advanced_model - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + - label: ":full_moon: GEdit-Bench Accuracy Test 
with H100" + key: nightly-gedit-bench-accuracy + timeout_in_minutes: 60 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE + value: "120" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - - label: ":full_moon: Diffusion X2V · Perf Test" - key: nightly-diffusion-x2v-performance - timeout_in_minutes: 180 - commands: - - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results - - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN - - | - set +e - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file 
tests/dfx/perf/tests/test_wan22_i2v_vllm_omni.json - EXIT1=$$? - buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" - exit $$EXIT1 - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + - label: ":full_moon: Wan22 I2V Accuracy Test with H100" + key: nightly-wan22-i2v-accuracy + timeout_in_minutes: 180 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: 
/mnt/hf-cache + type: DirectoryOrCreate - - label: ":bar_chart: Testcase Statistics" - key: nightly-testcase-statistics - timeout_in_minutes: 120 + - label: ":full_moon: Diffusion Perf Test with H100" + key: nightly-qwen-image-performance + timeout_in_minutes: 180 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - python tools/nightly/buildkite_testcase_statistics.py -o tests/dfx/perf/results/buildkite_testcase_statistics.html - - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results + - export CACHE_DIT_VERSION=1.3.0 + - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json + - buildkite-agent artifact upload "tests/dfx/perf/results/benchmark_results_*.json" + - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" agents: queue: "mithril-h100-pool" plugins: @@ -697,7 +411,7 @@ steps: - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT resources: limits: - nvidia.com/gpu: 2 + nvidia.com/gpu: 4 volumeMounts: - name: devshm mountPath: /dev/shm @@ -727,20 +441,15 @@ steps: key: nightly-perf-distribution depends_on: - nightly-omni-performance - - nightly-tts-performance - - nightly-diffusion-x2iat-performance - - nightly-diffusion-x2v-performance - - nightly-testcase-statistics + - nightly-qwen-image-performance if: build.env("NIGHTLY") == "1" commands: - pip install openpyxl - export DEFAULT_INPUT_DIR=tests/dfx/perf/results - export DEFAULT_OUTPUT_DIR=tests/dfx/perf/results - - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-tts-performance - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . 
--step nightly-omni-performance - - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-diffusion-x2iat-performance - - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-diffusion-x2v-performance - - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-testcase-statistics + - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-qwen-image-performance + - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-omni-performance - python tools/nightly/generate_nightly_perf_excel.py - python tools/nightly/generate_nightly_perf_html.py - python tools/nightly/send_nightly_email.py --report-file "tests/dfx/perf/results/*.xlsx, tests/dfx/perf/results/*.html" diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 080f18885ef..be528b316cd 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -1,8 +1,3 @@ -env: - VLLM_WORKER_MULTIPROC_METHOD: spawn - HF_HUB_DOWNLOAD_TIMEOUT: 300 - HF_HUB_ETAG_TIMEOUT: 60 - steps: - label: "Simple Unit Test" depends_on: upload-ready-pipeline @@ -21,10 +16,11 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "CUDA Unit Test with single card" + - label: "Voxtral TTS CUDA Unit Test" + timeout_in_minutes: 10 depends_on: upload-ready-pipeline commands: - - timeout 10m pytest -v -s -m 'core_model and cuda and L4 and not distributed_cuda' --ignore=tests/e2e --ignore=tests/engine/test_async_omni_engine_abort.py --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml + - "timeout 10m pytest -s -v tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py" agents: queue: "gpu_1_queue" plugins: @@ -37,12 +33,12 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "CUDA Unit Test with multi cards" + - label: "Diffusion Model Test" depends_on: upload-ready-pipeline commands: - - timeout 10m 
pytest -v -s -m 'core_model and cuda and L4 and distributed_cuda' --ignore=tests/e2e --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model" agents: - queue: "gpu_4_queue" + queue: "gpu_1_queue" plugins: - docker#v5.2.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT @@ -50,15 +46,16 @@ steps: propagate-environment: true environment: - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Diffusion Model Test" + - label: "Diffusion Batching Test" depends_on: upload-ready-pipeline commands: - - timeout 30m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model" + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py -m "core_model and diffusion" --run-level "core_model" agents: - queue: "gpu_1_queue" + queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: - docker#v5.2.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT @@ -70,12 +67,12 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Diffusion Batching Test" + - label: "Custom Pipeline Test" depends_on: upload-ready-pipeline commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py -m "core_model and diffusion" --run-level "core_model" + - timeout 20m pytest -s -v tests/e2e/offline_inference/custom_pipeline/ -m "core_model" agents: - queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU + queue: "gpu_1_queue" plugins: - docker#v5.2.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT @@ -83,16 +80,15 @@ steps: propagate-environment: true environment: - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Custom Pipeline Test" + - label: 
"Diffusion Model CPU offloading Test" depends_on: upload-ready-pipeline commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/custom_pipeline/ -m "core_model" + - timeout 10m pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py agents: - queue: "gpu_1_queue" + queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: - docker#v5.2.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT @@ -100,13 +96,14 @@ steps: propagate-environment: true environment: - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Diffusion Model CPU offloading Test" + - label: "Audio Generation Model Test" depends_on: upload-ready-pipeline commands: - - timeout 10m pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: @@ -155,12 +152,31 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Diffusion GPU Worker Test" + depends_on: upload-ready-pipeline + commands: + - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Engine Test" depends_on: upload-ready-pipeline commands: - | timeout 15m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/engine/test_async_omni_engine_abort.py ' agents: @@ -177,11 +193,35 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + + - label: "Omni Model Test" + depends_on: upload-ready-pipeline + commands: + - | + timeout 17m bash -c ' + export VLLM_LOGGING_LEVEL=DEBUG + export 
VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "core_model" --run-level "core_model" + ' + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Omni Model Test with H100" depends_on: upload-ready-pipeline commands: - | timeout 20m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model" ' agents: @@ -219,6 +259,7 @@ steps: - | timeout 30m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "core_model" --run-level "core_model" ' agents: @@ -261,6 +302,7 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model" ' @@ -278,90 +320,15 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "VoxCPM E2E Test" - timeout_in_minutes: 20 - depends_on: upload-ready-pipeline - commands: - - | - timeout 20m bash -c ' - pip install voxcpm - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/offline_inference/test_voxcpm.py -m "core_model" --run-level "core_model" - ' - agents: - queue: "gpu_1_queue" - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - - label: "VoxCPM2 Native AR E2E Test" - 
timeout_in_minutes: 20 - depends_on: upload-ready-pipeline - commands: - - | - timeout 20m bash -c ' - pip install voxcpm - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/offline_inference/test_voxcpm2.py -m "core_model" --run-level "core_model" - ' - agents: - queue: "gpu_1_queue" - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - label: "OmniVoice E2E Test" timeout_in_minutes: 20 depends_on: upload-ready-pipeline - commands: - - | - timeout 20m bash -c ' - export VLLM_LOGGING_LEVEL=DEBUG - pytest -s -v tests/e2e/online_serving/test_omnivoice.py -m "core_model" --run-level "core_model" - ' - agents: - queue: "gpu_1_queue" - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - - label: "Qwen3-TTS Base E2E Test (ModelRunner V2)" - depends_on: upload-ready-pipeline - soft_fail: - - exit_status: 1 commands: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - export VLLM_OMNI_USE_V2_RUNNER="1" - pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "core_model" --run-level "core_model" + pytest -s -v tests/e2e/online_serving/test_omnivoice.py -m "core_model" --run-level "core_model" ' agents: queue: "gpu_1_queue" @@ -373,7 +340,6 @@ steps: shm-size: "8gb" environment: - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" volumes: - "/fsx/hf_cache:/fsx/hf_cache" @@ -384,6 +350,7 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v 
tests/e2e/online_serving/test_voxtral_tts.py -m "core_model" --run-level "core_model" ' agents: @@ -420,6 +387,7 @@ steps: # commands: # - | # timeout 20m bash -c ' + # export VLLM_WORKER_MULTIPROC_METHOD=spawn # pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py # ' # agents: @@ -456,6 +424,7 @@ steps: commands: - | timeout 30m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" ' @@ -498,6 +467,7 @@ steps: commands: - | timeout 30m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" ' @@ -540,6 +510,7 @@ steps: commands: - | timeout 40m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_TEST_CLEAN_GPU_MEMORY=1 export VLLM_IMAGE_FETCH_TIMEOUT=60 pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" @@ -584,6 +555,7 @@ steps: commands: - | timeout 20m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model" ' agents: diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2 index 78f47d1aec0..8dc91a11727 100644 --- a/.buildkite/test-template-amd-omni.j2 +++ b/.buildkite/test-template-amd-omni.j2 @@ -3,7 +3,7 @@ Last synced: 2025-12-15 Modifications: Removed unused CUDA/NVIDIA logic, keeping only AMD tests #} -{% set docker_image_amd = "rocm/vllm-omni:$BUILDKITE_COMMIT" %} +{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT-rocm-omni" %} {% set default_working_dir = "/app/vllm-omni" %} - group: "AMD Tests" @@ -48,9 +48,6 @@ DOCKER_BUILDKIT: "1" TEST_COMMAND: |- (command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }} -{% if "mi250" in step.agent_pool 
%} - python3 -m pip uninstall -y amd-aiter -{% endif %} {{ indented_cmd | safe }} priority: 100 {% if step.grade and step.grade == "Blocking" %} diff --git a/.claude/skills/add-diffusion-model/SKILL.md b/.claude/skills/add-diffusion-model/SKILL.md deleted file mode 100644 index 0b979e1a984..00000000000 --- a/.claude/skills/add-diffusion-model/SKILL.md +++ /dev/null @@ -1,566 +0,0 @@ ---- -name: add-diffusion-model -description: Add a new diffusion model (text-to-image, text-to-video, image-to-video, text-to-audio, image editing) to vLLM-Omni, including Cache-DiT acceleration and parallelism support (TP, SP/USP, CFG-Parallel, HSDP). Use when integrating a new diffusion model, porting a diffusers pipeline or a custom model repo to vllm-omni, creating a new DiT transformer adapter, adding diffusion model support, or enabling multi-GPU parallelism and cache acceleration for an existing model. ---- - -# Adding a Diffusion Model to vLLM-Omni - -## Overview - -This skill guides you through adding a new diffusion model to vLLM-Omni. The model may come from HuggingFace Diffusers (structured pipeline) or from a private/custom repo. The workflow differs significantly depending on the source. - -## Prerequisites - -Before starting, determine: - -1. **Model category**: Text-to-Image, Text-to-Video, Image-to-Video, Image Editing, Text-to-Audio, or Omni -2. **Reference source**: Diffusers pipeline, custom repo, or a combination -3. **Model HuggingFace ID** or local checkpoint path -4. **Architecture**: Scheduler, text encoder, VAE, transformer/backbone - -## Step 0: Classify the Migration Path - -Check the model's HF repo for `model_index.json`. 
This determines your path: - -| Scenario | How to identify | Migration path | -|----------|----------------|----------------| -| **Already supported** | `_class_name` in `model_index.json` matches a key in `_DIFFUSION_MODELS` in `registry.py` | Skip to Step 5 (test) and Step 7 (docs) | -| **Diffusers-based** | Has standard `model_index.json` with `_diffusers_version`, subfolders for `transformer/`, `vae/`, etc. | Follow **Path A** below | -| **Custom/private repo** | No diffusers `model_index.json`, weights in non-standard format, custom model code in a separate git repo | Follow **Path B** below | -| **Hybrid** | Has some diffusers components (VAE) but custom transformer/fusion | Mix of Path A and Path B | - -## Path A: Diffusers-Based Model - -For models with a standard diffusers layout. See [references/transformer-adaptation.md](references/transformer-adaptation.md) for detailed code patterns. - -### A1. Analyze `model_index.json` - -Identify components: `transformer`, `scheduler`, `vae`, `text_encoder`, `tokenizer`. - -### A2. Create model directory - -``` -vllm_omni/diffusion/models/your_model_name/ -├── __init__.py -├── pipeline_your_model.py -└── your_model_transformer.py -``` - -### A3. Adapt transformer - -1. Copy from diffusers source. Remove mixins (`ModelMixin`, `ConfigMixin`, `AttentionModuleMixin`). -2. Replace attention with `vllm_omni.diffusion.attention.layer.Attention` (QKV shape: `[B, seq, heads, head_dim]`). -3. Add `od_config: OmniDiffusionConfig | None = None` to `__init__`. -4. Add `load_weights()` method mapping diffusers weight names to vllm-omni names. -5. Add class attributes: `_repeated_blocks`, `_layerwise_offload_blocks_attr`. - -### A4. Adapt pipeline - -Inherit from `nn.Module`. 
The key contract: - -```python -class YourPipeline(nn.Module): - def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""): - # Load VAE, text encoder, tokenizer via from_pretrained() - # Instantiate transformer (weights loaded later via weights_sources) - self.weights_sources = [ - DiffusersPipelineLoader.ComponentSource( - model_or_path=od_config.model, subfolder="transformer", - prefix="transformer.", fall_back_to_pt=True)] - - def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: - # Encode prompt → prepare latents → denoise loop → VAE decode - return DiffusionOutput(output=output) - - def load_weights(self, weights): - return AutoWeightsLoader(self).load_weights(weights) -``` - -Add post/pre-process functions in the same pipeline file. Register them in `registry.py`. - -### A5. Register, test, docs → continue at Step 4 below. - ---- - -## Path B: Custom/Private Repo Model - -For models without a diffusers pipeline — weights in custom format, model code in a private repo. Real examples: DreamID-Omni, BAGEL, HunyuanImage3. - -### B1. Understand the reference repo - -Study the original model's code to identify: -- **Model architecture files** (transformers, fusion modules, embeddings) -- **Weight format** (safetensors, `.pth`, custom checkpoint structure) -- **Weight loading helpers** (custom init functions, checkpoint loaders) -- **Pre/post-processing** (image/audio transforms, tokenization, VAE encode/decode) -- **External dependencies** (packages not on PyPI) -- **Config format** (JSON config files, hardcoded dicts) - -### B2. Decide what lives WHERE - -This is the key design decision for custom models. 
Follow these placement rules: - -| Code type | Where to place | Example | -|-----------|---------------|---------| -| **Pipeline orchestration** (init, forward, denoise loop) | `vllm_omni/diffusion/models//pipeline_.py` | Always required | -| **Custom transformer/backbone** (ported and adapted to vllm-omni) | `vllm_omni/diffusion/models//_transformer.py` or similar | `wan2_2.py`, `fusion.py`, `bagel_transformer.py` | -| **Custom sub-models** (VAE, fusion, autoencoder) | `vllm_omni/diffusion/models//` as separate files | `autoencoder.py`, `fusion.py` | -| **External dependency code** (original repo utilities) | **External repo**, installed via download script or pip | `dreamid_omni` package via git clone | -| **Hardcoded model configs** | Module-level dicts in pipeline file | `VIDEO_CONFIG`, `AUDIO_CONFIG` dicts | -| **Download/setup script** | `examples/offline_inference//download_.py` | `download_dreamid_omni.py` | -| **Custom `model_index.json`** | Generated by download script, placed at model root | Minimal: `{"_class_name": "YourPipeline", ...}` | - -### B3. Handle external dependencies - -If the model's code lives in a separate git repo: - -**Option 1: Import with graceful fallback** (recommended for models with external utils) - -```python -try: - from external_model.utils import init_vae, load_checkpoint -except ImportError: - raise ImportError( - "Failed to import from dependency 'external_model'. " - "Please run the download script first." - ) -``` - -**Option 2: Port the code directly** (preferred when feasible) - -Copy the essential model files into `vllm_omni/diffusion/models//` and adapt them. This avoids external dependencies. BAGEL does this — `autoencoder.py` and `bagel_transformer.py` are ported directly. - -**Decision criteria**: Port if the code is self-contained and won't diverge. Use external deps if the model repo is actively maintained and the code is complex. - -### B4. 
Handle custom weight loading - -Custom models have two patterns for weight loading: - -**Pattern 1: Bypass standard loader** (DreamID-Omni style) - -When the original model has complex custom init functions that load weights in `__init__`: - -```python -class CustomPipeline(nn.Module): - def __init__(self, *, od_config, prefix=""): - super().__init__() - model = od_config.model - # Load everything eagerly in __init__ using custom helpers - self.vae = custom_init_vae(model, device=self.device) - self.text_encoder = custom_init_text_encoder(model, device=self.device) - self.transformer = CustomFusionModel(CONFIG) - load_custom_checkpoint(self.transformer, - checkpoint_path=os.path.join(model, "model.safetensors")) - # NO weights_sources defined — bypasses standard loader - - def load_weights(self, weights): - pass # No-op — all weights loaded in __init__ -``` - -**Pattern 2: Use standard loader with custom `load_weights`** (BAGEL style) - -When weights are in safetensors format but need name remapping: - -```python -class CustomPipeline(nn.Module): - def __init__(self, *, od_config, prefix=""): - super().__init__() - # Instantiate model architecture without weights - self.bagel = BagelModel(config) - self.vae = AutoEncoder(ae_params) - - # Point loader at the safetensors in the model root - self.weights_sources = [ - DiffusersPipelineLoader.ComponentSource( - model_or_path=od_config.model, - subfolder=None, # weights at root, not in subfolder - prefix="", - fall_back_to_pt=False, - ) - ] - - def load_weights(self, weights): - # Custom name remapping for non-diffusers weight names - params = dict(self.named_parameters()) - loaded = set() - for name, tensor in weights: - # Remap original weight names to vllm-omni module names - name = self._remap_weight_name(name) - if name in params: - default_weight_loader(params[name], tensor) - loaded.add(name) - return loaded -``` - -### B5. 
Create the `model_index.json` - -Custom models need a `model_index.json` at the model root for vllm-omni to discover them. For custom models, this is minimal: - -```json -{ - "_class_name": "YourModelPipeline", - "custom_key": "path/to/custom_weights.safetensors" -} -``` - -The `_class_name` must match a key in `_DIFFUSION_MODELS` in `registry.py`. Additional keys are model-specific (accessed via `od_config.model_config`). - -If the model's weights come from multiple HF repos, write a **download script** that: -1. Downloads from each repo -2. Assembles into a single directory -3. Generates `model_index.json` -4. Installs any external dependencies (git clone + `.pth` file) - -Place at: `examples/offline_inference/<name>/download_<name>.py` - -### B6. Handle multi-modal inputs - -If the model accepts images, audio, or other multi-modal inputs, implement the protocol classes from `vllm_omni/diffusion/models/interface.py`: - -```python -from vllm_omni.diffusion.models.interface import SupportImageInput, SupportAudioInput - -class MyPipeline(nn.Module, SupportImageInput, SupportAudioInput): - # Protocol markers — the engine uses these to enable proper input routing - pass -``` - -Preprocessing for custom models is typically done **inside `forward()`** rather than via registered pre-process functions, since the logic is often tightly coupled to the model. - -### B7. Continue at Step 4 below. - ---- - -## Common Steps (Both Paths) - -### Step 4: Register Model in registry.py - -Edit `vllm_omni/diffusion/registry.py`: - -```python -_DIFFUSION_MODELS = { - "YourModelPipeline": ("your_model_name", "pipeline_your_model", "YourModelPipeline"), -} -_DIFFUSION_POST_PROCESS_FUNCS = { - "YourModelPipeline": "get_your_model_post_process_func", # if applicable -} -_DIFFUSION_PRE_PROCESS_FUNCS = { - "YourModelPipeline": "get_your_model_pre_process_func", # if applicable -} -``` - -The registry key is the `_class_name` from `model_index.json`.
The tuple is `(folder_name, module_file, class_name)`. - -Create `__init__.py` exporting the pipeline class and any factory functions. - -### Step 5: Run, Test, Debug - -Use the appropriate existing example script: - -| Category | Script | -|----------|--------| -| Text-to-Image | `examples/offline_inference/text_to_image/text_to_image.py` | -| Text-to-Video | `examples/offline_inference/text_to_video/text_to_video.py` | -| Image-to-Video | `examples/offline_inference/image_to_video/image_to_video.py` | -| Image-to-Image | `examples/offline_inference/image_to_image/image_edit.py` | -| Text-to-Audio | `examples/offline_inference/text_to_audio/text_to_audio.py` | - -For custom/Omni models that don't fit these categories, create a dedicated example script. - -**Validation**: No errors, output is meaningful, quality matches reference implementation. - -See [references/troubleshooting.md](references/troubleshooting.md) for common errors. - -### Step 6: Add Example Scripts - -For Omni or custom models, create: -- `examples/offline_inference/your_model_name/` — offline script + README -- `examples/online_serving/your_model_name/` — server script + client -- Download script if weights require assembly from multiple sources - -### Step 7: Update Documentation - -Required updates: -1. `docs/user_guide/diffusion/parallelism_acceleration.md` — parallelism support table -2. `docs/user_guide/diffusion/cpu_offload_diffusion.md` — if CPU offload supported (add to supported models table) -3. `docs/user_guide/diffusion/teacache.md` — if TeaCache supported -4. `docs/user_guide/diffusion/cache_dit_acceleration.md` — if Cache-DiT supported -5. `examples/offline_inference/xxx/README.md` — offline example docs -6. `examples/online_serving/xxx/README.md` — online serving docs - -### Step 8: Add E2E Tests (Recommended) - -Create `tests/e2e/online_serving/test_your_model_expansion.py`.
- -### Step 9: Add Cache-DiT Acceleration - -Cache-DiT accelerates inference by caching intermediate computation results across denoising steps. After your model is working correctly on a single GPU, add cache-dit support. - -See [references/cache-dit-patterns.md](references/cache-dit-patterns.md) for detailed code patterns. - -#### 9a. Determine your model type - -| Model Type | Description | Action | -|------------|-------------|--------| -| **Standard single-transformer** | One transformer with one `ModuleList` of blocks | No code needed — `CacheDiTBackend` auto-detects via `enable_cache_for_dit()` | -| **Multi-block-list** | One transformer with multiple block lists (e.g., `transformer_blocks` + `single_transformer_blocks`) | Write custom enabler with `BlockAdapter` | -| **Dual-transformer** | Two transformers (e.g., high-noise + low-noise) | Write custom enabler with `BlockAdapter` wrapping both | - -#### 9b. Standard models — verify automatic support - -For standard single-transformer models, test directly: - -```python -omni = Omni( - model="your-model-name", - cache_backend="cache_dit", - cache_config={ - "Fn_compute_blocks": 1, - "Bn_compute_blocks": 0, - "max_warmup_steps": 4, - } -) -``` - -Check logs for "Cache-dit enabled successfully on xxx". If it works, skip to Step 9e. - -#### 9c. 
Custom architectures — write a custom enabler - -For multi-block-list or dual-transformer models, write a custom enabler function: - -```python -from cache_dit import BlockAdapter, ForwardPattern, ParamsModifier, DBCacheConfig - -def enable_cache_for_your_model(pipeline, cache_config): - db_cache_config = DBCacheConfig( - num_inference_steps=None, - Fn_compute_blocks=cache_config.Fn_compute_blocks, - Bn_compute_blocks=cache_config.Bn_compute_blocks, - max_warmup_steps=cache_config.max_warmup_steps, - max_cached_steps=cache_config.max_cached_steps, - max_continuous_cached_steps=cache_config.max_continuous_cached_steps, - residual_diff_threshold=cache_config.residual_diff_threshold, - ) - - cache_dit.enable_cache( - BlockAdapter( - transformer=pipeline.transformer, - blocks=[ - pipeline.transformer.transformer_blocks, - pipeline.transformer.single_transformer_blocks, - ], - forward_pattern=[ForwardPattern.Pattern_1, ForwardPattern.Pattern_1], - params_modifiers=[ParamsModifier(...)], - ), - cache_config=db_cache_config, - ) - - def refresh_cache_context(pipeline, num_inference_steps, verbose=True): - cache_dit.refresh_context( - pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose - ) - return refresh_cache_context -``` - -#### 9d. Register the custom enabler - -Add your enabler to `CUSTOM_DIT_ENABLERS` in `vllm_omni/diffusion/cache/cache_dit_backend.py`: - -```python -CUSTOM_DIT_ENABLERS = { - "Wan22Pipeline": enable_cache_for_wan22, - "LongCatImagePipeline": enable_cache_for_longcat_image, - "YourModelPipeline": enable_cache_for_your_model, # Add here -} -``` - -#### 9e. 
Test Cache-DiT - -```python -omni = Omni( - model="your-model-name", - cache_backend="cache_dit", - cache_config={ - "Fn_compute_blocks": 1, "Bn_compute_blocks": 0, - "max_warmup_steps": 4, "residual_diff_threshold": 0.24, - } -) -images = omni.generate("a beautiful landscape", - OmniDiffusionSamplingParams(num_inference_steps=50)) -``` - -**Verify**: 1) logs show cache enabled, 2) 1.5-2x speedup, 3) output quality acceptable vs baseline. - -If quality degrades, lower `residual_diff_threshold` (try 0.12-0.18) or increase `max_warmup_steps` (try 6-8). - ---- - -### Step 10: Add Parallelism Support - -After the model works on a single GPU, add multi-GPU parallelism. Add each type incrementally, testing after each addition. - -See [references/parallelism-patterns.md](references/parallelism-patterns.md) for detailed code patterns and API reference. - -**Recommended order**: TP → SP/USP → CFG Parallel → HSDP - -#### 10a. Tensor Parallelism (TP) - -Shards DiT linear layers across GPUs. Requires code changes in the transformer. - -**What to change in the transformer**: -1. Replace `nn.Linear` with `ColumnParallelLinear` / `RowParallelLinear` / `QKVParallelLinear` -2. Update `load_weights()` to handle QKV fusion with `stacked_params_mapping` -3. Use `self.to_qkv.num_heads` (local heads) instead of total heads for split sizes - -```python -from vllm.model_executor.layers.linear import ( - QKVParallelLinear, RowParallelLinear, ColumnParallelLinear, -) - -# Attention: QKV → RowParallel output -self.to_qkv = QKVParallelLinear(dim, head_dim, num_heads, num_kv_heads) -self.to_out = RowParallelLinear(dim, dim, input_is_parallel=True) - -# FFN: ColumnParallel → RowParallel -self.w1 = ColumnParallelLinear(dim, ffn_dim) -self.w2 = RowParallelLinear(ffn_dim, dim, input_is_parallel=True) -``` - -**Constraints**: `num_heads % tp_size == 0` and `num_kv_heads % tp_size == 0`. - -**Test**: `--tensor-parallel-size 2` - -#### 10b. 
Sequence Parallelism (SP / USP) - -Splits sequence tokens across GPUs. Non-intrusive via `_sp_plan` on the transformer class — no changes to `forward()`. - -**What to change in the transformer**: - -Add `_sp_plan` class attribute: - -```python -from vllm_omni.diffusion.distributed.sp_plan import ( - SequenceParallelInput, SequenceParallelOutput, -) - -class YourTransformer(nn.Module): - _sp_plan = { - "blocks.0": { - "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), - }, - "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), - } -``` - -If inline tensor ops (e.g., `torch.cat`) exist between shard/gather points, extract them into `nn.Module` submodules so hooks can intercept them. - -For RoPE that needs splitting, add an entry for the RoPE module with `split_output=True`. - -**Test**: `--ulysses-degree 2` (offline) or `--usp 2` (online serving) - -#### 10c. CFG Parallel - -Distributes positive/negative CFG branches across 2 GPUs. Requires the pipeline to inherit `CFGParallelMixin`. - -**What to change in the pipeline**: - -```python -from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin - -class YourPipeline(nn.Module, CFGParallelMixin): - def diffuse(self, ...) -> torch.Tensor: - for i, t in enumerate(timesteps): - positive_kwargs = {...} - negative_kwargs = {...} if do_true_cfg else None - noise_pred = self.predict_noise_maybe_with_cfg( - do_true_cfg=do_true_cfg, true_cfg_scale=cfg_scale, - positive_kwargs=positive_kwargs, negative_kwargs=negative_kwargs, - ) - latents = self.scheduler_step_maybe_with_cfg( - noise_pred, t, latents, do_true_cfg - ) - return latents -``` - -Override `predict_noise()` if your transformer call is non-standard. Override `combine_cfg_noise()` for multi-output models (e.g., video + audio). - -**Constraint**: Exactly 2 GPUs. Only for models using classifier-free guidance. - -**Test**: `--cfg-parallel-size 2` - -#### 10d. 
HSDP (Hybrid Sharded Data Parallel) - -Shards transformer weights via PyTorch FSDP2 to reduce per-GPU VRAM. No code changes to the forward pass — just add a class attribute. - -**What to change in the transformer**: - -```python -class YourTransformer(nn.Module): - @staticmethod - def _is_transformer_block(name: str, module) -> bool: - return "blocks" in name and name.split(".")[-1].isdigit() - - _hsdp_shard_conditions = [_is_transformer_block] -``` - -**Constraint**: Cannot combine with TP. For standalone HSDP, set `hsdp_shard_size` explicitly. - -**Test**: `--use-hsdp` or `DiffusionParallelConfig(use_hsdp=True)` - -#### 10e. Update parallelism documentation - -After adding parallelism support, update: -1. `docs/user_guide/diffusion/parallelism_acceleration.md` — add your model to the support table -2. Record which parallelism methods are supported (USP, Ring, CFG, TP, HSDP, VAE-Patch) - -### Step 11: Add CPU Offload Support - -Implement `SupportsModuleOffload` on your pipeline class to enable -`--enable-cpu-offload` and `--enable-layerwise-offload`. The protocol -declares which submodules the offloader should manage: - -```python -from typing import ClassVar -from vllm_omni.diffusion.models.interface import SupportsModuleOffload - -class YourPipeline(nn.Module, SupportsModuleOffload): - _dit_modules: ClassVar[list[str]] = ["transformer"] - _encoder_modules: ClassVar[list[str]] = ["text_encoder"] - _vae_modules: ClassVar[list[str]] = ["vae"] - _resident_modules: ClassVar[list[str]] = [] # optional -``` - -- `_dit_modules`: denoising submodules (kept on GPU during diffusion loop) -- `_encoder_modules`: encoder/vision submodules (offloaded to CPU during diffusion loop) -- `_vae_modules`: VAE(s) (handled by both sequential and layerwise backends) -- `_resident_modules`: additional modules to pin on GPU during layerwise - offloading (e.g. embedders, connectors). Only used by the layerwise - backend. Optional — defaults to `[]`. 
- -All attribute names support dotted paths for nested submodules -(e.g. `"pipe.transformer"`, `"bagel.time_embedder"`). - -Pipelines without `SupportsModuleOffload` fall back to scanning -well-known attribute names (`transformer`, `text_encoder`, `vae`, -etc.), which fails for non-standard names. - ---- - -## Iterative Development Tips - -1. **Start minimal**: Basic generation first, no parallelism/caching -2. **Use `--enforce-eager`**: Disable torch.compile during debugging -3. **Use small models**: Test with smaller variants first -4. **Check tensor shapes**: Most errors are reshape mismatches in attention -5. **Add features incrementally**: Single GPU → TP → SP → CFG → HSDP → Cache-DiT -6. **For custom models**: Get the model running with the original code first, then progressively replace components with vllm-omni equivalents -7. **Cache-DiT before parallelism tuning**: Cache-DiT is lossy — verify quality at baseline before combining with parallelism -8. **Combine lossless + lossy**: e.g., TP + SP + Cache-DiT for maximum throughput - -## Reference Files - -- [Transformer Adaptation](references/transformer-adaptation.md) — porting transformers from diffusers -- [Custom Model Patterns](references/custom-model-patterns.md) — patterns for non-diffusers models -- [Parallelism Patterns](references/parallelism-patterns.md) — TP, SP/USP, CFG parallel, HSDP implementation details -- [Cache-DiT Patterns](references/cache-dit-patterns.md) — cache-dit acceleration for standard and custom architectures -- [Troubleshooting](references/troubleshooting.md) — common errors and fixes diff --git a/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md b/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md deleted file mode 100644 index d34ce0e0f43..00000000000 --- a/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md +++ /dev/null @@ -1,254 +0,0 @@ -# Cache-DiT Patterns Reference - -## Overview - -Cache-DiT accelerates Diffusion 
Transformers by caching intermediate computation results across denoising steps. Adjacent steps produce similar features, so redundant computations can be skipped. - -Three caching strategies: -- **DBCache**: Dynamic block-level caching — selectively computes or caches transformer blocks based on residual differences -- **TaylorSeer**: Calibration-based prediction using Taylor expansion to estimate block outputs -- **SCM** (Step Computation Masking): Dynamic step skipping based on configurable policies - -**Typical speedup**: 1.5-2.5x depending on model and configuration. - -**Official docs**: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/cache_dit - -## Architecture - -vLLM-Omni integrates cache-dit through `CacheDiTBackend`: - -| Component | Purpose | -|-----------|---------| -| `CacheDiTBackend` | Unified backend — auto-selects enabler (standard or custom) | -| `enable_cache_for_dit()` | Default enabler for standard single-transformer models | -| `CUSTOM_DIT_ENABLERS` dict | Registry of custom enablers keyed by pipeline class name | -| `BlockAdapter` | Wraps complex architectures (multi-block-list or multi-transformer) | -| `ForwardPattern` | Specifies block forward signature: `Pattern_0`, `Pattern_1`, `Pattern_2` | -| `ParamsModifier` | Per-transformer or per-block-list config customization | -| `DBCacheConfig` | Configuration for DBCache parameters | -| `cache_dit.refresh_context()` | Updates cache context when `num_inference_steps` changes | - -**Source files**: -- `vllm_omni/diffusion/cache/cache_dit_backend.py` — `CacheDiTBackend`, enablers, `CUSTOM_DIT_ENABLERS` -- `vllm_omni/diffusion/cache/` — cache backend implementations - -## Standard Models: Automatic Support - -Most DiT models follow this pattern: -- Single transformer with one `nn.ModuleList` of blocks -- Standard forward signature -- Compatible with cache-dit's automatic detection - -**Examples**: Qwen-Image, Z-Image, FLUX - -No code changes needed. 
`CacheDiTBackend` automatically uses `enable_cache_for_dit()`: - -```python -from vllm_omni import Omni - -omni = Omni( - model="Qwen/Qwen-Image", - cache_backend="cache_dit", - cache_config={ - "Fn_compute_blocks": 1, - "Bn_compute_blocks": 0, - "max_warmup_steps": 4, - } -) -``` - -What happens automatically: - -```python -def enable_cache_for_dit(pipeline, cache_config): - db_cache_config = DBCacheConfig( - num_inference_steps=None, - Fn_compute_blocks=cache_config.Fn_compute_blocks, - Bn_compute_blocks=cache_config.Bn_compute_blocks, - max_warmup_steps=cache_config.max_warmup_steps, - max_cached_steps=cache_config.max_cached_steps, - max_continuous_cached_steps=cache_config.max_continuous_cached_steps, - residual_diff_threshold=cache_config.residual_diff_threshold, - ) - - cache_dit.enable_cache(pipeline.transformer, cache_config=db_cache_config) - - def refresh_cache_context(pipeline, num_inference_steps, verbose=True): - cache_dit.refresh_context( - pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose - ) - return refresh_cache_context -``` - -## Custom Architectures: Writing Custom Enablers - -### When you need a custom enabler - -- Model has multiple block lists in one transformer (e.g., `transformer_blocks` + `single_transformer_blocks`) -- Model has two transformers (e.g., high-noise + low-noise like Wan2.2) -- Model uses non-standard block forward signature - -### Pattern 1: Multi-Block-List (LongCat-Image style) - -Single transformer with two block lists: - -```python -import cache_dit -from cache_dit import BlockAdapter, ForwardPattern, ParamsModifier, DBCacheConfig - -def enable_cache_for_your_model(pipeline, cache_config): - db_cache_config = DBCacheConfig( - num_inference_steps=None, - Fn_compute_blocks=cache_config.Fn_compute_blocks, - Bn_compute_blocks=cache_config.Bn_compute_blocks, - max_warmup_steps=cache_config.max_warmup_steps, - max_cached_steps=cache_config.max_cached_steps, - 
max_continuous_cached_steps=cache_config.max_continuous_cached_steps, - residual_diff_threshold=cache_config.residual_diff_threshold, - ) - - cache_dit.enable_cache( - BlockAdapter( - transformer=pipeline.transformer, - blocks=[ - pipeline.transformer.transformer_blocks, - pipeline.transformer.single_transformer_blocks, - ], - forward_pattern=[ForwardPattern.Pattern_1, ForwardPattern.Pattern_1], - params_modifiers=[ParamsModifier(...)], - ), - cache_config=db_cache_config, - ) - - def refresh_cache_context(pipeline, num_inference_steps, verbose=True): - cache_dit.refresh_context( - pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose - ) - return refresh_cache_context -``` - -For single transformer with multiple block lists, `refresh_context` works the same as standard models — call it once on the transformer. - -### Pattern 2: Dual-Transformer (Wan2.2 style) - -Two transformers with separate configs: - -```python -def enable_cache_for_dual_transformer(pipeline, cache_config): - db_cache_config = DBCacheConfig(...) - - cache_dit.enable_cache( - BlockAdapter( - transformer=[pipeline.transformer, pipeline.transformer_2], - blocks=[pipeline.transformer.blocks, pipeline.transformer_2.blocks], - forward_pattern=[ForwardPattern.Pattern_2, ForwardPattern.Pattern_2], - params_modifiers=[ - ParamsModifier(...), # Config for transformer 1 - ParamsModifier(...), # Config for transformer 2 - ], - ), - cache_config=db_cache_config, - ) - - def refresh_cache_context(pipeline, num_inference_steps, verbose=True): - high_steps, low_steps = _split_inference_steps(num_inference_steps) - cache_dit.refresh_context( - pipeline.transformer, num_inference_steps=high_steps, verbose=verbose - ) - cache_dit.refresh_context( - pipeline.transformer_2, num_inference_steps=low_steps, verbose=verbose - ) - return refresh_cache_context -``` - -Key difference: `refresh_context` must be called on **each transformer separately** with its own step count. 
- -### Choosing the ForwardPattern - -| Pattern | Block forward signature | Example models | -|---------|------------------------|----------------| -| `Pattern_0` | `block(hidden_states, **kwargs)` → residual added inside block | Default | -| `Pattern_1` | `block(hidden_states, **kwargs)` → returns `(hidden_states, ...)` tuple | FLUX-style single blocks | -| `Pattern_2` | `block(hidden_states, **kwargs)` → `(hidden_states, ...)` with different residual pattern | Wan2.2 blocks | - -Inspect your block's `forward()` return type and residual connection pattern to choose the right one. See [Cache-DiT API Reference](https://cache-dit.readthedocs.io/en/latest/user_guide/CACHE_API/) for details. - -## Registering Custom Enablers - -Add your enabler to `CUSTOM_DIT_ENABLERS` in `vllm_omni/diffusion/cache/cache_dit_backend.py`: - -```python -CUSTOM_DIT_ENABLERS = { - "Wan22Pipeline": enable_cache_for_wan22, - "LongCatImagePipeline": enable_cache_for_longcat_image, - "YourModelPipeline": enable_cache_for_your_model, -} -``` - -The key must match `pipeline.__class__.__name__`. 
- -## Configuration Parameters - -| Parameter | Default | Description | -|-----------|---------|-------------| -| `Fn_compute_blocks` | 1 | Number of blocks to always compute at the front | -| `Bn_compute_blocks` | 0 | Number of blocks to always compute at the back | -| `max_warmup_steps` | 4 | Steps to run without caching at the beginning | -| `max_cached_steps` | — | Max total cached steps | -| `max_continuous_cached_steps` | — | Max consecutive cached steps | -| `residual_diff_threshold` | 0.24 | Threshold for deciding whether to cache a block | - -### Tuning for quality vs speed - -| Goal | Adjustments | -|------|-------------| -| **More speed, acceptable quality loss** | Higher `residual_diff_threshold` (0.24-0.4), lower `max_warmup_steps` (2-4) | -| **Better quality, less speed** | Lower `residual_diff_threshold` (0.12-0.18), higher `max_warmup_steps` (6-8), lower `max_continuous_cached_steps` (2) | - -## Testing - -```python -from vllm_omni import Omni -from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -omni = Omni( - model="your-model-name", - cache_backend="cache_dit", - cache_config={ - "Fn_compute_blocks": 1, - "Bn_compute_blocks": 0, - "max_warmup_steps": 4, - "residual_diff_threshold": 0.24, - } -) -images = omni.generate( - "a beautiful landscape", - OmniDiffusionSamplingParams(num_inference_steps=50), -) -``` - -CLI (online serving): - -```bash -vllm serve your-model --omni --port 8098 \ - --cache-backend cache_dit \ - --cache-config '{"Fn_compute_blocks": 1, "Bn_compute_blocks": 0, "max_warmup_steps": 4}' -``` - -**Verification checklist**: -1. Logs show "Cache-dit enabled successfully on xxx" -2. Performance: 1.5-2x speedup vs no cache -3. Quality: compare output with `cache_backend=None` - -## Excluded Models - -Models listed in `_NO_CACHE_ACCELERATION` in `vllm_omni/diffusion/registry.py` do not support cache-dit (e.g., `NextStep11Pipeline`, `StableDiffusionPipeline`). Check this set before attempting to enable cache-dit. 
- -## Reference Implementations - -| Model | Path | Notes | -|-------|------|-------| -| Standard DiT | `cache_dit_backend.py::enable_cache_for_dit` | Default enabler, automatic | -| Wan2.2 | `cache_dit_backend.py::enable_cache_for_wan22` | Dual-transformer, auto-detects mode | -| LongCat | `cache_dit_backend.py::enable_cache_for_longcat_image` | Multi-block-list | -| BAGEL | `cache_dit_backend.py::enable_cache_for_bagel` | Complex omni model | diff --git a/.claude/skills/add-diffusion-model/references/custom-model-patterns.md b/.claude/skills/add-diffusion-model/references/custom-model-patterns.md deleted file mode 100644 index 2434e0b5da0..00000000000 --- a/.claude/skills/add-diffusion-model/references/custom-model-patterns.md +++ /dev/null @@ -1,273 +0,0 @@ -# Custom Model Patterns Reference - -Patterns for adding models that don't come from the standard diffusers pipeline format. - -## Directory Structure Comparison - -### Diffusers-based model (e.g., Wan2.2) - -``` -vllm_omni/diffusion/models/wan2_2/ -├── __init__.py # Exports pipeline + transformer + helpers -├── pipeline_wan2_2.py # Pipeline: loads components via from_pretrained() -├── pipeline_wan2_2_i2v.py # Variant pipeline for image-to-video -└── wan2_2_transformer.py # Transformer: ported from diffusers, uses Attention layer -``` - -The transformer is loaded separately via `weights_sources` + `load_weights()`. Non-transformer components (VAE, text encoder) are loaded in `__init__` via `from_pretrained()`. 
- -### Custom model with external deps (e.g., DreamID-Omni) - -``` -vllm_omni/diffusion/models/dreamid_omni/ -├── __init__.py # Exports pipeline only -├── pipeline_dreamid_omni.py # Pipeline: loads ALL weights in __init__ via custom helpers -├── fusion.py # Custom fusion architecture (video + audio cross-attention) -└── wan2_2.py # Re-implemented Wan backbone with split API - -examples/offline_inference/x_to_video_audio/ -└── download_dreamid_omni.py # Downloads weights from 3 HF repos + clones code repo -``` - -All weights loaded eagerly in `__init__`. `load_weights()` is a no-op. External dependency (`dreamid_omni` package) imported with try/except. - -### Custom model with ported code (e.g., BAGEL) - -``` -vllm_omni/diffusion/models/bagel/ -├── __init__.py -├── pipeline_bagel.py # Pipeline: instantiates models, uses weights_sources -├── bagel_transformer.py # Full LLM backbone (Qwen2-MoT) ported into vllm-omni -└── autoencoder.py # Custom VAE ported from original repo -``` - -Model code is fully ported (no external dependency). Uses `weights_sources` and `load_weights()` with custom name remapping to handle non-diffusers safetensors format. 
- -## Weight Loading Patterns - -### Pattern 1: Standard diffusers flow (Wan2.2, Z-Image, FLUX) - -``` -init → create transformer (empty) → set weights_sources → [loader calls load_weights()] -``` - -- `weights_sources` points to safetensors in HF subfolder (e.g., `transformer/`) -- `load_weights()` receives `(name, tensor)` pairs from the loader -- Name remapping handles diffusers→vllm-omni differences (QKV fusion, Sequential index removal) - -### Pattern 2: Custom safetensors at root (BAGEL) - -``` -init → create all models (empty) → set weights_sources(subfolder=None) → [loader calls load_weights()] -``` - -- `weights_sources` points to **root** of model directory, not a subfolder -- Weights have non-diffusers names (e.g., `bagel.language_model.model.layers.0.self_attn.q_proj.weight`) -- `load_weights()` does heavy name normalization - -```python -self.weights_sources = [ - DiffusersPipelineLoader.ComponentSource( - model_or_path=od_config.model, - subfolder=None, # root directory - prefix="", # no prefix stripping - fall_back_to_pt=False, - ) -] -``` - -### Pattern 3: Fully custom loading (DreamID-Omni) - -``` -init → load ALL weights eagerly via custom helpers → load_weights() = no-op -``` - -- No `weights_sources` attribute — standard loader finds nothing to iterate -- Custom init functions (e.g., `init_wan_vae_2_2()`, `load_fusion_checkpoint()`) handle downloading and loading -- `load_weights()` is `pass` -- Weights may come from multiple HF repos in different formats (`.pth`, `.safetensors`) - -Use this when: -- The original model has complex, well-tested loading code you don't want to rewrite -- Weights span multiple HF repos -- Weight format is non-standard (e.g., a single `.pth` file, not sharded safetensors) - -## model_index.json for Custom Models - -Standard diffusers `model_index.json`: -```json -{ - "_class_name": "WanPipeline", - "_diffusers_version": "0.35.0.dev0", - "scheduler": ["diffusers", "UniPCMultistepScheduler"], - "transformer": 
["diffusers", "WanTransformer3DModel"], - "vae": ["diffusers", "AutoencoderKLWan"] -} -``` - -Custom model `model_index.json` (minimal): -```json -{ - "_class_name": "DreamIDOmniPipeline", - "fusion": "DreamID-Omni/dreamid_omni.safetensors" -} -``` - -The only **required** field is `_class_name` — it must match a key in `_DIFFUSION_MODELS` in `registry.py`. Other fields are model-specific and accessible via `od_config.model_config` dict. - -## External Dependency Management - -### Git clone + .pth injection (DreamID-Omni pattern) - -```python -def download_dependency(): - CACHE_DIR.mkdir(parents=True, exist_ok=True) - with open(LOCK_FILE, "w") as f: - fcntl.flock(f, fcntl.LOCK_EX) - if not DEPENDENCY_DIR.exists(): - subprocess.run([ - "git", "clone", "--depth", "1", - REPO_URL, "--branch", BRANCH, - str(DEPENDENCY_DIR) - ], check=True) - fcntl.flock(f, fcntl.LOCK_UN) - - # Add to Python path via .pth file - site_packages = Path(site.getsitepackages()[0]) - pth_file = site_packages / "vllm_omni_dependency.pth" - pth_file.write_text(str(DEPENDENCY_DIR)) -``` - -### Direct port (BAGEL pattern) - -Copy essential files from the original repo into `vllm_omni/diffusion/models/<model_name>/`. Adapt imports to use vllm-omni utilities. Benefits: no external dependency, no git clone step. Drawback: must maintain the ported code. - -## Multi-Modal Input/Output Protocols - -Custom models that handle images, audio, or video I/O should implement protocol classes: - -```python -from vllm_omni.diffusion.models.interface import ( - SupportImageInput, # Model accepts image input - SupportAudioInput, # Model accepts audio input - SupportAudioOutput, # Model produces audio output -) - -class MyPipeline(nn.Module, SupportImageInput, SupportAudioInput, SupportAudioOutput): - pass # Protocol markers enable proper engine routing -``` - -The engine checks `isinstance(pipeline, SupportImageInput)` at startup to configure input validation and warmup behavior.
- -## Hardcoded Config vs Config Files - -Diffusers models use `config.json` in each subfolder. Custom models often use: - -**Module-level config dicts** (DreamID-Omni): -```python -VIDEO_CONFIG = { - "patch_size": [1, 2, 2], "model_type": "ti2v", - "dim": 3072, "ffn_dim": 14336, "num_heads": 24, "num_layers": 30, ... -} -``` - -**Loaded from custom JSON** (BAGEL): -```python -cfg_path = os.path.join(model_path, "config.json") -with open(cfg_path) as f: - bagel_cfg = json.load(f) -vae_cfg = bagel_cfg.get("vae_config", {}) -``` - -## Custom Architecture Patterns - -### Split forward API (DreamID-Omni) - -When a fusion model needs to interleave blocks from two backbones: - -```python -class WanModel(nn.Module): - def prepare_transformer_block_kwargs(self, x, t, context, ...): - # Patch embed, time embed, text embed, RoPE - return x, e, kwargs - - def post_transformer_block_out(self, x, grid_sizes, e): - # Output projection, unpatchify - return output - - def forward(self, *args, **kwargs): - raise NotImplementedError # Fusion model handles block iteration -``` - -The `FusionModel` then iterates blocks in lock-step: -```python -for video_block, audio_block in zip(self.video_model.blocks, self.audio_model.blocks): - video_out = video_block(video_hidden, ...) - audio_out = audio_block(audio_hidden, ...) - # Cross-attend between modalities - video_out = cross_attention(video_out, audio_out) - audio_out = cross_attention(audio_out, video_out) -``` - -### LLM-as-denoiser (BAGEL) - -When the backbone is a language model that also does diffusion: - -```python -class BagelModel(nn.Module): - def __init__(self): - self.language_model = Qwen2MoTForCausalLM(config) - self.vit_model = SiglipVisionModel(vit_config) -``` - -The LLM processes both text tokens and latent image tokens in a single forward pass, using KV caching for the text portion. 
- -## Pre/Post Processing for Custom Models - -Custom models typically handle pre/post processing **inside `forward()`** rather than via registered functions, because the logic is tightly coupled: - -```python -def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: - # Inline preprocessing - image = self._load_and_resize_image(req.prompts[0].get("multi_modal_data", {}).get("image")) - image_latent = self._vae_encode(image) - - # ... denoising loop ... - - # Inline postprocessing - pil_image = self._decode_to_pil(latents) - return DiffusionOutput(output=[pil_image]) -``` - -If pre/post functions are not registered in `_DIFFUSION_PRE_PROCESS_FUNCS` / `_DIFFUSION_POST_PROCESS_FUNCS`, the engine simply skips those steps. - -## Download Script Template - -```python -# examples/offline_inference/<task>/download_<model_name>.py -from huggingface_hub import snapshot_download -import json, os - -def main(output_dir): - # Download model weights from HF - snapshot_download(repo_id="org/model-weights", local_dir=os.path.join(output_dir, "weights")) - - # Download additional components if from separate repos - snapshot_download(repo_id="org/vae-weights", local_dir=os.path.join(output_dir, "vae"), - allow_patterns=["*.safetensors"]) - - # Generate model_index.json - config = {"_class_name": "YourPipeline", "custom_key": "weights/model.safetensors"} - with open(os.path.join(output_dir, "model_index.json"), "w") as f: - json.dump(config, f, indent=2) - - # Install external code dependency (if needed) - download_dependency() - -if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", default="./your_model") - args = parser.parse_args() - main(args.output_dir) -``` diff --git a/.claude/skills/add-diffusion-model/references/parallelism-patterns.md b/.claude/skills/add-diffusion-model/references/parallelism-patterns.md deleted file mode 100644 index 933e2d23204..00000000000 ---
a/.claude/skills/add-diffusion-model/references/parallelism-patterns.md +++ /dev/null @@ -1,571 +0,0 @@ -# Parallelism Patterns Reference - -## Overview - -vLLM-Omni supports multiple parallelism strategies for diffusion models. Each targets a different bottleneck: - -| Strategy | Splits | Best For | Constraint | -|----------|--------|----------|------------| -| Tensor Parallel (TP) | Model layers across GPUs | Latency reduction, large models | Requires fast GPU interconnect, `num_heads % tp == 0` | -| Sequence Parallel (SP/USP) | Sequence tokens across GPUs | Long sequences (video, high-res) | Near-linear scaling | -| CFG Parallel | Positive/negative CFG branches | Models using classifier-free guidance | Exactly 2 GPUs | -| HSDP | Weight shards via FSDP2 | VRAM reduction | Cannot combine with TP | -| VAE Patch Parallel | VAE decode spatial tiles | Large VAE outputs | Auto-enables tiling | - -**Recommended integration order**: TP → SP → CFG Parallel → HSDP - -**Official design docs**: -- TP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/tensor_parallel -- SP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/sequence_parallel -- CFG: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/cfg_parallel -- HSDP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/hsdp - ---- - -## Tensor Parallelism (TP) - -Replace standard `nn.Linear` with vLLM's parallel linear layers. This is the most invasive change but provides direct VRAM savings and compute speedup. 
- -### Layer replacement rules - -| Pattern | vLLM Layer | When to Use | -|---------|-----------|-------------| -| Fan-out (first in FFN) | `ColumnParallelLinear` | Projection that splits output across ranks | -| Fan-in (second in FFN) | `RowParallelLinear` | Projection that gathers across ranks | -| QKV projection | `QKVParallelLinear` | Fused Q/K/V for self-attention | -| Single Q or K or V | `ColumnParallelLinear` | Separate projections (cross-attention) | -| Attention output | `RowParallelLinear` | Output projection after attention | -| Must not shard | `ReplicatedLinear` | Layers that must stay replicated | - -### MLP Block (Up-Down Pattern) - -```python -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, RowParallelLinear, -) - -class TPFeedForward(nn.Module): - def __init__(self, dim, ffn_dim): - super().__init__() - self.fc1 = ColumnParallelLinear(dim, ffn_dim, bias=False, return_bias=False) - self.fc2 = RowParallelLinear( - ffn_dim, dim, bias=False, - input_is_parallel=True, # Input already sharded from fc1 - return_bias=False, - ) - - def forward(self, x): - x = self.fc1(x) - x = torch.nn.functional.gelu(x) - x = self.fc2(x) - return x -``` - -### Attention Block (QKV-Out Pattern) - -```python -from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear -from vllm_omni.diffusion.attention.layer import Attention - -class TPSelfAttention(nn.Module): - def __init__(self, dim, num_heads, num_kv_heads=None): - super().__init__() - num_kv_heads = num_kv_heads or num_heads - self.head_dim = dim // num_heads - - self.to_qkv = QKVParallelLinear( - hidden_size=dim, - head_size=self.head_dim, - total_num_heads=num_heads, - total_num_kv_heads=num_kv_heads, - bias=False, - return_bias=False, - ) - self.to_out = RowParallelLinear( - dim, dim, bias=False, - input_is_parallel=True, - return_bias=False, - ) - self.attn = Attention( - num_heads=self.to_qkv.num_heads, # Local heads per GPU - head_size=self.head_dim, -
softmax_scale=1.0 / (self.head_dim ** 0.5), - causal=False, - num_kv_heads=self.to_qkv.num_kv_heads, # Local KV heads per GPU - ) - - def forward(self, x): - qkv = self.to_qkv(x) - q, k, v = qkv.split( - [self.to_qkv.num_heads * self.head_dim, - self.to_qkv.num_kv_heads * self.head_dim, - self.to_qkv.num_kv_heads * self.head_dim], - dim=-1, - ) - B, S, _ = x.shape - q = q.view(B, S, self.to_qkv.num_heads, self.head_dim) - k = k.view(B, S, self.to_qkv.num_kv_heads, self.head_dim) - v = v.view(B, S, self.to_qkv.num_kv_heads, self.head_dim) - out = self.attn(q, k, v) - out = out.reshape(B, S, -1) - out = self.to_out(out) - return out -``` - -### QKV Fusion in load_weights - -When you fuse separate Q/K/V into `QKVParallelLinear`, map diffusers' separate weight names: - -```python -stacked_params_mapping = [ - ("to_qkv", "to_q", "q"), - ("to_qkv", "to_k", "k"), - ("to_qkv", "to_v", "v"), -] - -def load_weights(self, weights): - params = dict(self.named_parameters()) - loaded = set() - for name, tensor in weights: - for fused_name, orig_name, shard_id in stacked_params_mapping: - if orig_name in name: - name = name.replace(orig_name, fused_name) - param = params[name] - param.weight_loader(param, tensor, shard_id) - loaded.add(name) - break - else: - if name in params: - param = params[name] - if hasattr(param, "weight_loader"): - param.weight_loader(param, tensor) - else: - default_weight_loader(param, tensor) - loaded.add(name) - return loaded -``` - -### RMSNorm with TP - -When RMSNorm sits between TP-sharded dimensions, use `DistributedRMSNorm` — it computes global RMS via all-reduce across TP ranks. See the Wan2.2 implementation for the pattern.
- -### TP Constraints - -- `num_heads % tp_size == 0` -- `num_kv_heads % tp_size == 0` -- Use `self.to_qkv.num_heads` (local per-GPU count), not total heads, for split sizes - -### Testing TP - -```bash -python text_to_image.py --model Your-org/your-model \ - --tensor-parallel-size 2 --output "tp_test.png" -``` - -**Verify**: speedup, memory reduction proportional to TP size, quality matches single-GPU. - -### Reference implementations - -| Model | Path | -|-------|------| -| Z-Image | `vllm_omni/diffusion/models/z_image/z_image_transformer.py` | -| FLUX | `vllm_omni/diffusion/models/flux/flux_transformer.py` | -| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py` | - ---- - -## Sequence Parallelism (SP / USP) - -SP splits sequence tokens across GPUs using Ulysses (all-to-all) or Ring (P2P) communication. It is applied non-intrusively via the `_sp_plan` dict — no changes to `forward()` logic. - -### Approach 1: Non-Intrusive `_sp_plan` (Recommended) - -The framework automatically registers hooks to shard inputs and gather outputs at `nn.Module` boundaries. - -#### Step 1: Identify module boundaries - -Find where tensors need sharding/gathering: - -```python -class MyTransformer(nn.Module): - def __init__(self): - self.patch_embed = PatchEmbed() # Before blocks - self.pos_embed = RoPE() # RoPE may need splitting - self.blocks = nn.ModuleList([...]) # Blocks process sharded x - self.norm_out = LayerNorm() - self.proj_out = Linear() # Gather after this - - def forward(self, x): - x = self.patch_embed(x) - pos = self.pos_embed(x) - for block in self.blocks: - x = block(x, pos) - x = self.norm_out(x) - return self.proj_out(x) -``` - -#### Step 2: Handle inline operations - -`_sp_plan` hooks only work at `nn.Module` boundaries. 
Inline ops like `torch.cat()` must be extracted into submodules: - -```python -# BAD: Inline — hooks can't intercept -unified = torch.cat([x, cap_feats], dim=1) - -# GOOD: Extract into submodule -class UnifiedPrepare(nn.Module): - def forward(self, x, cap_feats): - return torch.cat([x, cap_feats], dim=1) - -self.unified_prepare = UnifiedPrepare() -unified = self.unified_prepare(x, cap_feats) -``` - -Common cases: `torch.cat()`, `pad_sequence()`, `tensor.reshape()`, complex preprocessing. - -#### Step 3: Write `_sp_plan` - -**Pattern 1: Shard at first block, gather at output** (most common) - -```python -from vllm_omni.diffusion.distributed.sp_plan import ( - SequenceParallelInput, SequenceParallelOutput, -) - -class StandardTransformer(nn.Module): - _sp_plan = { - "blocks.0": { - "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), - }, - "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), - } -``` - -**Pattern 2: Shard RoPE outputs separately** - -```python -class TransformerWithRoPE(nn.Module): - _sp_plan = { - "rope": { - 0: SequenceParallelInput(split_dim=1, expected_dims=4, split_output=True), - 1: SequenceParallelInput(split_dim=1, expected_dims=4, split_output=True), - }, - "blocks.0": { - "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), - }, - "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), - } -``` - -**Pattern 3: Dual-stream (shard image, replicate text)** - -```python -class DualStreamTransformer(nn.Module): - _sp_plan = { - "rope_preparer": { - 2: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True), - 3: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True), - }, - "transformer_blocks.0": { - "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), - }, - "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), - } -``` - -### API Reference - -**SequenceParallelInput**: - -| Parameter | Type | Description | 
-|-----------|------|-------------| -| `split_dim` | int | Dimension to split (usually 1 for sequence) | -| `expected_dims` | int/None | Expected tensor rank for validation | -| `split_output` | bool | `False`: shard input params; `True`: shard output tensors | -| `auto_pad` | bool | Auto-pad if sequence not divisible by world_size | - -**SequenceParallelOutput**: - -| Parameter | Type | Description | -|-----------|------|-------------| -| `gather_dim` | int | Dimension to gather (usually 1 for sequence) | -| `expected_dims` | int/None | Expected tensor rank for validation | - -**Module naming**: - -| Key | Meaning | -|-----|---------| -| `"blocks.0"` | First element of ModuleList | -| `"blocks.*"` | All elements of ModuleList | -| `"rope"` | Named submodule | - -**Dictionary value types**: - -| Key type | split_output | Description | -|----------|-------------|-------------| -| `"param_name"` (str) | False | Shard input parameter by name | -| `0, 1, ...` (int) | True | Shard output tuple by index | - -### Approach 2: Intrusive Modification (Complex Cases) - -For dynamic sharding logic that can't be expressed via `_sp_plan`: - -```python -from vllm_omni.diffusion.distributed.sp_sharding import sp_shard, sp_gather - -def forward(self, hidden_states, ...): - if self.parallel_config.sequence_parallel_size > 1: - hidden_states = sp_shard(hidden_states, dim=1) - for block in self.blocks: - hidden_states = block(hidden_states) - if self.parallel_config.sequence_parallel_size > 1: - hidden_states = sp_gather(hidden_states, dim=1) - return hidden_states -``` - -Use intrusive modification as a last resort — `_sp_plan` is preferred for maintainability. - -### UAA Mode (Experimental) - -`ulysses_mode="advanced_uaa"` handles arbitrary sequence lengths and head counts that aren't divisible by `ulysses_degree`. Uses variable all-to-all split sizes and temporary head padding. 
- -### Combining SP methods - -Ulysses and Ring can be combined: `ulysses_degree × ring_degree = total SP GPUs`. - -```python -DiffusionParallelConfig(ulysses_degree=2, ring_degree=2) # 4 GPUs total -``` - -### Testing SP - -```bash -# Offline -python text_to_image.py --model Your-model --ulysses-degree 2 - -# Online serving -vllm serve Your-model --omni --usp 2 -``` - -### Reference implementations - -| Model | Path | -|-------|------| -| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py` | -| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py` | -| Z-Image | `vllm_omni/diffusion/models/z_image/z_image_transformer.py` | - ---- - -## CFG Parallelism - -Distributes positive/negative Classifier-Free Guidance branches across 2 GPUs. - -### Implementation - -Inherit `CFGParallelMixin` and implement `diffuse()`: - -```python -from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin - -class YourPipeline(nn.Module, CFGParallelMixin): - def diffuse(self, latents, timesteps, prompt_embeds, negative_embeds, - do_true_cfg, true_cfg_scale, **kwargs): - for i, t in enumerate(timesteps): - positive_kwargs = { - "hidden_states": latents, - "encoder_hidden_states": prompt_embeds, - "timestep": t, - } - negative_kwargs = { - "hidden_states": latents, - "encoder_hidden_states": negative_embeds, - "timestep": t, - } if do_true_cfg else None - - noise_pred = self.predict_noise_maybe_with_cfg( - do_true_cfg=do_true_cfg, - true_cfg_scale=true_cfg_scale, - positive_kwargs=positive_kwargs, - negative_kwargs=negative_kwargs, - ) - latents = self.scheduler_step_maybe_with_cfg( - noise_pred, t, latents, do_true_cfg - ) - return latents -``` - -### Customization hooks - -| Method | Override when | -|--------|-------------| -| `predict_noise()` | Non-standard transformer call (e.g., dual-transformer like Wan2.2) | -| `cfg_normalize_function()` | Custom normalization (e.g., LongCat with clamping) | -| `combine_cfg_noise()` | 
Multi-output models (e.g., video + audio: CFG on video, positive-only on audio) | - -**Custom predict_noise** (Wan2.2 — selects active transformer): - -```python -def predict_noise(self, current_model=None, **kwargs): - if current_model is None: - current_model = self.transformer - return current_model(**kwargs)[0] -``` - -**Custom combine_cfg_noise** (multi-output): - -```python -def combine_cfg_noise(self, positive_pred, negative_pred, scale, normalize): - video_pos, audio_pos = positive_pred - video_neg, audio_neg = negative_pred - video_combined = super().combine_cfg_noise(video_pos, video_neg, scale, normalize) - return (video_combined, audio_pos) -``` - -### Composite scheduler for multi-output - -When each output has its own schedule: - -```python -class VideoAudioScheduler: - def __init__(self, video_scheduler, audio_scheduler): - self.video_scheduler = video_scheduler - self.audio_scheduler = audio_scheduler - - def step(self, noise_pred, t, latents, return_dict=False, generator=None): - video_out = self.video_scheduler.step( - noise_pred[0], t[0], latents[0], return_dict=False, generator=generator - )[0] - audio_out = self.audio_scheduler.step( - noise_pred[1], t[1], latents[1], return_dict=False, generator=generator - )[0] - return ((video_out, audio_out),) -``` - -### Testing CFG Parallel - -```bash -python text_to_image.py --model Your-model \ - --cfg-parallel-size 2 --cfg-scale 4.0 \ - --negative-prompt "ugly, unclear" -``` - -**Constraint**: `guidance_scale > 1.0` and negative prompt must be provided. - -### Reference implementations - -| Model | Path | -|-------|------| -| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/cfg_parallel.py` | -| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py` | -| Mixin base | `vllm_omni/diffusion/distributed/cfg_parallel.py` | - ---- - -## HSDP (Hybrid Sharded Data Parallel) - -Shards model weights across GPUs using PyTorch FSDP2. Reduces per-GPU VRAM without changing computation. 
- -### Implementation - -Add `_hsdp_shard_conditions` to the transformer class: - -```python -class YourTransformer(nn.Module): - @staticmethod - def _is_transformer_block(name: str, module) -> bool: - return "blocks" in name and name.split(".")[-1].isdigit() - - _hsdp_shard_conditions = [_is_transformer_block] -``` - -For MoE models, add additional conditions: - -```python -class MoETransformer(nn.Module): - @staticmethod - def _is_transformer_block(name, module): - return "blocks" in name and name.split(".")[-1].isdigit() - - @staticmethod - def _is_moe_expert(name, module): - return "experts" in name and name.split(".")[-1].isdigit() - - _hsdp_shard_conditions = [_is_transformer_block, _is_moe_expert] -``` - -A module is sharded if **any** condition returns `True`. - -### Constraints - -- Cannot combine with Tensor Parallelism -- For standalone HSDP (no other parallelism), `hsdp_shard_size` must be specified explicitly -- Can combine with SP: HSDP reduces memory while SP distributes sequence - -### Testing HSDP - -```python -from vllm_omni.diffusion.data import DiffusionParallelConfig - -parallel_config = DiffusionParallelConfig(use_hsdp=True, hsdp_shard_size=8) -omni = Omni(model="your-model", parallel_config=parallel_config) -``` - -Or CLI: - -```bash -vllm serve Your-model --omni --use-hsdp -``` - -**Verify**: logs show "HSDP Inference: replicate_size=..., shard_size=..." and "Sharded N modules + root". Check VRAM reduction. - -### Reference implementations - -| Model | Path | -|-------|------| -| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py` | -| HSDP Core | `vllm_omni/diffusion/distributed/hsdp.py` | - ---- - -## VAE Patch Parallelism - -Shards VAE decode spatially across ranks using tiling: - -```bash -python text_to_image.py --model Your-model --vae-patch-parallel-size 4 -``` - -Auto-enables `--vae-use-tiling`. Uses `DistributedAutoencoderKLWan` or similar distributed VAE. Set `vae_patch_parallel_size` in `DiffusionParallelConfig`. 
- ---- - -## Combining Parallelism Methods - -Common multi-GPU recipes: - -```bash -# 4 GPUs: CFG (2) × Ulysses (2) -python text_to_image.py --model Qwen/Qwen-Image \ - --cfg-parallel-size 2 --ulysses-degree 2 - -# 8 GPUs: Ulysses (4) × Ring (2) + VAE patch (8) -python text_to_video.py --model Wan-AI/Wan2.2-T2V-A14B-Diffusers \ - --ulysses-degree 4 --ring-degree 2 --vae-patch-parallel-size 8 - -# 2 GPUs: HSDP + Ulysses (cannot combine HSDP with TP) -vllm serve Your-model --omni --use-hsdp --usp 2 -``` - -## Discovering Parallelism Support - -Check which parallelism methods a model supports: - -| Check | How | -|-------|-----| -| **Ulysses / Ring SP** | Transformer defines `_sp_plan`. Search: `grep -r '_sp_plan' vllm_omni/diffusion/models/` | -| **CFG Parallel** | Pipeline inherits `CFGParallelMixin`. Search: `grep -r 'CFGParallelMixin' vllm_omni/diffusion/models/` | -| **TP** | Uses `ColumnParallelLinear` / `QKVParallelLinear`. Search: `grep -r 'ParallelLinear\|QKVParallel' vllm_omni/diffusion/models//` | -| **HSDP** | Transformer defines `_hsdp_shard_conditions`. Search: `grep -r '_hsdp_shard_conditions' vllm_omni/diffusion/models/` | - -The canonical per-model support table is in `docs/user_guide/diffusion/parallelism_acceleration.md`. diff --git a/.claude/skills/add-diffusion-model/references/transformer-adaptation.md b/.claude/skills/add-diffusion-model/references/transformer-adaptation.md deleted file mode 100644 index 6e344b6a66e..00000000000 --- a/.claude/skills/add-diffusion-model/references/transformer-adaptation.md +++ /dev/null @@ -1,218 +0,0 @@ -# Transformer Adaptation Reference - -## Adapting a Diffusers Transformer to vLLM-Omni - -### Step-by-step Checklist - -1. Copy the transformer class from diffusers source -2. Remove all mixin classes — inherit only from `nn.Module` -3. Replace attention dispatch with `vllm_omni.diffusion.attention.layer.Attention` -4. Replace logger with `vllm.logger.init_logger` -5. 
Add `od_config: OmniDiffusionConfig | None = None` to `__init__` -6. Remove training-only code (gradient checkpointing, dropout) -7. Add `load_weights()` method for weight loading from safetensors -8. Add class-level attributes for acceleration features - -### Mixin Removal - -Remove these diffusers mixins (and their imports): - -```python -# Remove all of these: -from diffusers.models.modeling_utils import ModelMixin -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.models.attention_processor import AttentionModuleMixin -from diffusers.loaders import PeftAdapterMixin, FromOriginalModelMixin - -# Replace: -class MyTransformer(ModelMixin, ConfigMixin, AttentionModuleMixin): -# With: -class MyTransformer(nn.Module): -``` - -Also remove `@register_to_config` decorators from `__init__`. - -### Attention Replacement - -The vLLM-Omni `Attention` layer wraps backend selection (FlashAttention, SDPA, SageAttn, etc.) and supports sequence parallelism hooks. 
- -**QKV tensor shape must be `[batch, seq_len, num_heads, head_dim]`.** - -#### Self-Attention Pattern - -```python -from vllm_omni.diffusion.attention.layer import Attention -from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata - -class SelfAttentionBlock(nn.Module): - def __init__(self, dim, num_heads): - super().__init__() - self.num_heads = num_heads - self.head_dim = dim // num_heads - - self.to_q = nn.Linear(dim, dim) - self.to_k = nn.Linear(dim, dim) - self.to_v = nn.Linear(dim, dim) - self.to_out = nn.Linear(dim, dim) - - self.attn = Attention( - num_heads=num_heads, - head_size=self.head_dim, - softmax_scale=1.0 / (self.head_dim ** 0.5), - causal=False, - num_kv_heads=num_heads, - ) - - def forward(self, x, attn_mask=None): - B, S, _ = x.shape - q = self.to_q(x).view(B, S, self.num_heads, self.head_dim) - k = self.to_k(x).view(B, S, self.num_heads, self.head_dim) - v = self.to_v(x).view(B, S, self.num_heads, self.head_dim) - - attn_metadata = AttentionMetadata(attn_mask=attn_mask) - out = self.attn(q, k, v, attn_metadata=attn_metadata) - out = out.reshape(B, S, -1) - return self.to_out(out) -``` - -#### Fused QKV with TP (Advanced) - -For tensor parallelism, use vLLM's parallel linear layers: - -```python -from vllm.model_executor.layers.linear import ( - QKVParallelLinear, RowParallelLinear -) - -class TPSelfAttention(nn.Module): - def __init__(self, dim, num_heads): - super().__init__() - self.num_heads = num_heads - self.head_dim = dim // num_heads - - self.to_qkv = QKVParallelLinear( - hidden_size=dim, - head_size=self.head_dim, - total_num_heads=num_heads, - total_num_kv_heads=num_heads, - ) - self.to_out = RowParallelLinear(dim, dim) - - self.attn = Attention( - num_heads=num_heads, - head_size=self.head_dim, - softmax_scale=1.0 / (self.head_dim ** 0.5), - causal=False, - num_kv_heads=num_heads, - ) -``` - -### Logger Replacement - -```python -# Replace: -from diffusers.utils import logging -logger = 
logging.get_logger(__name__) - -# With: -from vllm.logger import init_logger -logger = init_logger(__name__) -``` - -### Custom Layers from vLLM-Omni - -Available utility layers: - -```python -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm_omni.diffusion.layers.rope import RotaryEmbedding -from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNorm -``` - -### Config Support - -```python -from vllm_omni.diffusion.data import OmniDiffusionConfig - -class MyTransformer(nn.Module): - def __init__(self, *, od_config=None, num_layers=28, hidden_size=3072, **kwargs): - super().__init__() - self.od_config = od_config - self.parallel_config = od_config.parallel_config if od_config else None - # ... build layers -``` - -The transformer config values come from `model_index.json` → `config.json` in the transformer subfolder. The pipeline uses `get_transformer_config_kwargs(od_config.tf_model_config, TransformerClass)` to filter config keys to match the `__init__` signature. - -### Weight Loading - -The `load_weights` method receives an iterable of `(name, tensor)` from safetensors files, with the prefix (e.g., `"transformer."`) already stripped by the loader. 
- -```python -from vllm.model_executor.model_loader.weight_utils import default_weight_loader - -class MyTransformer(nn.Module): - def load_weights(self, weights): - params = dict(self.named_parameters()) - loaded = set() - for name, tensor in weights: - # Optional: remap names from diffusers to vllm-omni naming - # e.g., "ff.net.0.proj" -> "ff.net_0.proj" - - if name in params: - param = params[name] - if hasattr(param, "weight_loader"): - param.weight_loader(param, tensor) - else: - default_weight_loader(param, tensor) - loaded.add(name) - return loaded -``` - -#### QKV Fusion in load_weights - -If you fused separate Q/K/V into a `QKVParallelLinear`, you need to map diffusers' separate weight names: - -```python -stacked_params_mapping = [ - ("to_qkv", "to_q", "q"), - ("to_qkv", "to_k", "k"), - ("to_qkv", "to_v", "v"), -] - -def load_weights(self, weights): - params = dict(self.named_parameters()) - loaded = set() - for name, tensor in weights: - for fused_name, orig_name, shard_id in stacked_params_mapping: - if orig_name in name: - name = name.replace(orig_name, fused_name) - param = params[name] - param.weight_loader(param, tensor, shard_id) - loaded.add(name) - break - else: - # Normal loading - ... 
- return loaded -``` - -### Class-Level Attributes for Features - -```python -class MyTransformer(nn.Module): - # torch.compile: list block class names that repeat and can be compiled - _repeated_blocks = ["MyTransformerBlock"] - - # CPU offload: attribute name of the nn.ModuleList containing blocks - _layerwise_offload_blocks_attr = "blocks" - - # LoRA: mapping of fused param names to original param names - packed_modules_mapping = {"to_qkv": ["to_q", "to_k", "to_v"]} - - # Sequence parallelism plan (advanced — add after basic impl works) - _sp_plan = { - "blocks.0": SequenceParallelInput(split_dim=1), - "proj_out": SequenceParallelOutput(gather_dim=1), - } -``` diff --git a/.claude/skills/add-diffusion-model/references/troubleshooting.md b/.claude/skills/add-diffusion-model/references/troubleshooting.md deleted file mode 100644 index 27acdd8d154..00000000000 --- a/.claude/skills/add-diffusion-model/references/troubleshooting.md +++ /dev/null @@ -1,178 +0,0 @@ -# Troubleshooting Reference - -## Common Errors When Adding a Diffusion Model - -### ImportError / ModuleNotFoundError - -**Cause**: Missing or incorrect registration. - -**Fix checklist**: -1. Model registered in `vllm_omni/diffusion/registry.py` `_DIFFUSION_MODELS` dict -2. `__init__.py` exports the pipeline class -3. Pipeline file exists at the correct path: `vllm_omni/diffusion/models/{folder}/{file}.py` -4. Class name in registry matches the actual class name in the file - -### Shape Mismatch in Attention - -**Symptom**: `RuntimeError: shape mismatch` or `expected 4D tensor` - -**Cause**: QKV tensors not reshaped to `[batch, seq_len, num_heads, head_dim]`. 
- -**Fix**: Before calling `self.attn(q, k, v, ...)`, ensure: -```python -q = q.view(batch, seq_len, self.num_heads, self.head_dim) -k = k.view(batch, kv_seq_len, self.num_kv_heads, self.head_dim) -v = v.view(batch, kv_seq_len, self.num_kv_heads, self.head_dim) -``` - -After attention, reshape back: -```python -out = out.reshape(batch, seq_len, -1) -``` - -### Weight Loading Failures - -**Symptom**: `RuntimeError: size mismatch for parameter ...` or missing keys - -**Debugging**: -1. Print diffusers weight names: `safetensors.safe_open(path, "pt").keys()` -2. Print model parameter names: `dict(model.named_parameters()).keys()` -3. Compare and add name remappings in `load_weights()` - -**Common remappings needed**: -- `ff.net.0.proj` → `ff.net_0.proj` (PyTorch Sequential indexing) -- `.to_out.0.` → `.to_out.` (Sequential unwrapping) -- `scale_shift_table` → moved to a wrapper module - -### Black/Blank/Noisy Output - -**Possible causes**: -1. **Wrong latent normalization**: Check VAE expects latents scaled by `vae.config.scaling_factor` -2. **Wrong scheduler**: Using the wrong scheduler class or wrong `flow_shift` -3. **Missing CFG**: Some models require `guidance_scale > 1.0` with negative prompt -4. **Wrong timestep format**: Some schedulers expect float, others expect int/long -5. **Missing post-processing**: Raw VAE output may need denormalization - -**Quick test**: Run with diffusers directly using the same seed and compare latents at each step. - -### OOM (Out of Memory) - -**Solutions** (in order of preference): -1. `--enforce-eager` to disable torch.compile (saves compile memory) -2. `--enable-cpu-offload` for model-level offload -3. `--enable-layerwise-offload` for block-level offload (better for large models) -4. `--vae-use-slicing --vae-use-tiling` for VAE memory reduction -5. Reduce resolution: `--height 480 --width 832` -6. Use TP: `--tensor-parallel-size 2` - -### Different Output vs Diffusers Reference - -**Common causes**: -1. 
**Attention backend difference**: FlashAttention vs SDPA may produce slightly different results. Set `DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA` to match diffusers -2. **Float precision**: vLLM-Omni may use bfloat16 where diffusers uses float32 for some operations -3. **Missing normalization**: Check all LayerNorm/RMSNorm are preserved -4. **Scheduler rounding**: Some schedulers have numerical sensitivity - -### Tensor Parallel Errors - -**Symptom**: `AssertionError: not divisible` or incorrect output with TP>1 - -**Fix**: -1. Verify `num_heads % tp_size == 0` and `num_kv_heads % tp_size == 0` -2. Ensure `ColumnParallelLinear` / `RowParallelLinear` are used correctly -3. Check that norms between parallel layers use distributed norm if needed -4. Verify `load_weights` handles TP sharding for norm weights -5. Use `self.to_qkv.num_heads` (local heads per GPU) for QKV split sizes, not total heads - -**Missing `input_is_parallel=True`**: - -`RowParallelLinear` expects sharded input from `ColumnParallelLinear`: -```python -self.w1 = ColumnParallelLinear(dim, hidden_dim, return_bias=False) -self.w2 = RowParallelLinear(hidden_dim, dim, input_is_parallel=True, return_bias=False) -``` - -### Sequence Parallel Errors - -**Symptom**: Incorrect output or crashes with `--ulysses-degree N` or `--usp N` - -**Possible causes**: -1. **Inline operations between shard/gather points**: `torch.cat()`, `pad_sequence()` etc. not at `nn.Module` boundaries. Fix: extract into submodule. -2. **Wrong `split_dim`**: Check the tensor shape at the shard point. Sequence dimension is typically `dim=1` for `[B, S, D]` tensors. -3. **RoPE not sharded**: If RoPE is computed separately, add it to `_sp_plan` with `split_output=True`. -4. **Sequence not divisible by SP degree**: Use `auto_pad=True` in `SequenceParallelInput` or switch to `ulysses_mode="advanced_uaa"`. - -**Debugging**: Add `expected_dims=N` to `SequenceParallelInput`/`Output` for shape validation at runtime. 
- -### CFG Parallel Errors - -**Symptom**: CFG parallel not activating, no speedup - -**Fix checklist**: -1. Pipeline inherits `CFGParallelMixin` -2. `guidance_scale > 1.0` -3. Negative prompt provided (even if empty string) -4. `--cfg-parallel-size 2` specified -5. `diffuse()` method calls `predict_noise_maybe_with_cfg()` and `scheduler_step_maybe_with_cfg()` - -**Symptom**: Different output with CFG parallel vs sequential - -**Possible cause**: Non-deterministic scheduler. Fix: pass `generator=torch.Generator(device).manual_seed(seed)` to `scheduler_step_maybe_with_cfg()`. - -### HSDP Errors - -**Symptom**: HSDP not activating or errors during weight loading - -**Fix checklist**: -1. Transformer defines `_hsdp_shard_conditions` class attribute -2. Shard condition functions return `True` for correct modules (test with `model.named_modules()`) -3. Not combining with TP (HSDP and TP are incompatible) -4. For standalone HSDP, `hsdp_shard_size` is specified explicitly - -**Verify**: Check logs for "HSDP Inference: replicate_size=..., shard_size=..." and "Sharded N modules + root". - -### Cache-DiT Not Applied - -**Symptom**: No speedup, no cache-related log messages - -**Fix checklist**: -1. Model not in `_NO_CACHE_ACCELERATION` in `registry.py` -2. Pipeline class name matches `CUSTOM_DIT_ENABLERS` key (if using custom enabler) -3. `cache_backend="cache_dit"` specified -4. Check logs for "Cache-dit enabled successfully on xxx" - -**Verify pipeline name**: `print(pipeline.__class__.__name__)` — must match registry key. - -### Cache-DiT Quality Degradation - -**Symptom**: Artifacts or lower quality with cache-dit - -**Fix**: Reduce aggressiveness: -```python -cache_config={ - "residual_diff_threshold": 0.12, # Lower from 0.24 - "max_warmup_steps": 6, # Increase from 4 - "max_continuous_cached_steps": 2, # Reduce if higher -} -``` - -If quality is still poor, the model may need a custom enabler with per-block-list `ParamsModifier` tuning. 
- -### Model Not Detected / Wrong Pipeline Class - -**Symptom**: `ValueError: Model class ... not found in diffusion model registry` - -**Cause**: The model's `model_index.json` has a `_class_name` for the pipeline that doesn't match registry keys. - -**Fix**: The registry key must match the diffusers pipeline class name from `model_index.json`. If using a different name, map it in the registry: -```python -"DiffusersPipelineClassName": ("your_folder", "your_file", "YourVllmClassName"), -``` - -## Debugging Workflow - -1. **Add verbose logging**: Use `logger.info()` to print tensor shapes at each stage -2. **Compare step-by-step**: Run diffusers and vllm-omni side by side, comparing tensors after each major operation -3. **Use small configs**: Reduce `num_inference_steps=2`, small resolution for fast iteration -4. **Test transformer isolation**: Feed the same input to both diffusers and vllm-omni transformers, compare outputs -5. **Binary search for bugs**: Comment out blocks/layers to isolate where divergence starts diff --git a/.claude/skills/add-tts-model/SKILL.md b/.claude/skills/add-tts-model/SKILL.md deleted file mode 100644 index 963ffb4f64d..00000000000 --- a/.claude/skills/add-tts-model/SKILL.md +++ /dev/null @@ -1,504 +0,0 @@ ---- -name: add-tts-model -description: "Integrate a new text-to-speech model into vLLM-Omni from HuggingFace reference implementation through production-ready serving with streaming and CUDA graph acceleration. Use when adding a new TTS model, wiring stage separation for speech synthesis, enabling online voice generation serving, debugging TTS integration behavior, or building audio output pipelines." ---- - -# TTS Model Integration Workflow - -## Overview - -``` -HF Reference -> Stage Separation -> Online Serving -> Async Chunk -> CUDA Graph -> Pre-commit/DCO - (Phase 1) (Phase 2) (Phase 3) (Phase 4) (Phase 5) (Phase 6) -``` - -Three architecture patterns are supported: - -- **Two-stage pipeline** (e.g. 
Qwen3-TTS, Fish Speech, CosyVoice3): AR - code-predictor → audio decoder, connected via async_chunk for low-latency - streaming. Use this for maximum performance. -- **Single-stage AR via generator** (e.g. MOSS-TTS-Nano): entire model runs - inside one AR worker, streaming audio chunks through a per-request - `inference_stream()` generator. Use this when the upstream model bundles AR - + codec inseparably. See [references/single-stage-ar.md](references/single-stage-ar.md). -- **Single-stage, vLLM-native base LM + side computation** (e.g. VoxCPM2): - the base language model runs under vLLM's PagedAttention as a normal AR - model; diffusion / VAE / side computations run outside vLLM and are - attached via the runner post-processing hook. This is a distinct pattern - from the generator approach above — do not confuse the two. - -The single-stage variants skip Phase 4 (async_chunk) but Phase 5 (CUDA graph) -is still encouraged for the inner AR loop. - -## Cross-Cutting Invariants - -These rules apply to every TTS model regardless of architecture (AR vs AR+diffusion, single-stage vs two-stage, codec-based vs VAE-based). They surface repeatedly across PRs — check them at the end of every phase. - -### I1. Streaming output contract - -Pick exactly one per-step semantics for `forward()` and document it in the docstring: - -- **Delta**: yield only new audio samples produced this step. Preferred — linear cost, low memory. -- **Cumulative**: re-decode from step 0 every call. O(N²); only acceptable if the codec has no streaming decode path. - -If you choose **delta**, verify the full emit→consolidate→consume chain: - -1. `forward()` returns `{"model_outputs": , ...}` -2. `_consolidate_multimodal_tensors()` in `vllm_omni/engine/output_processor.py` concatenates the audio key into one tensor at finish. If it skips the key (`continue`), offline consumers receive only the final chunk. See `output_processor.py` for the concrete list of handled modality keys. -3. 
Streaming consumers (SSE, Gradio) receive per-step deltas; offline consumers (`engine.generate()`) receive a single concatenated tensor. - -Cumulative-vs-delta mismatch is the most common silent bug — offline RTF benchmarks pass, but users hear replays or truncation. - -### I2. Multimodal output consumer hygiene - -`outputs[0].outputs[0].multimodal_output[]` can be any of `Tensor`, `list[Tensor]` (pre-consolidation snapshot), `np.ndarray`, or scalar. When writing tests, examples, and benchmarks: - -- **Never** use `dict.get("a") or dict.get("b")` on tensor values — Python evaluates the tensor's boolean, raising `RuntimeError: Boolean value of Tensor with more than one value is ambiguous`. Use explicit `if x is None` chains. -- Always defensively handle the list form: `if isinstance(x, list): x = torch.cat([t.reshape(-1) for t in x], dim=0)`. -- Assert `shape` / `dtype` / `duration` explicitly; do not rely on truthiness for presence checks. - -### I3. Hot-loop GPU discipline - -Inside any per-step model loop (AR decode, diffusion solver, CFM Euler, vocoder block loop): - -- No `tensor.item()`, `.cpu()`, or `.tolist()` — each triggers a GPU→CPU sync; at 10 steps × 60 frames × 4 ops that is 2400 syncs per request. -- Prefer `dst.copy_(src)` over `dst.fill_(src.item())` when writing a scalar tensor into a buffer. -- Prefer `torch.compile(Model.forward, fullgraph=False)` on the whole forward over per-submodule compile — fewer dispatch boundaries, larger fusion regions. Measure before choosing granularity. -- No Python-side control flow that depends on tensor values; use `torch.where` / masking instead. - -Profile first, optimize second. See the profiling docs / project memory for the trace-analysis workflow. - -### I4. Validation pyramid - -Offline RTF alone is necessary but not sufficient. 
Every new TTS model must pass all three: - -| Layer | Catches | Tool | -|-------|---------|------| -| Offline RTF / duration check | Throughput regressions, missing audio, wrong sample rate | `end2end.py`, pytest e2e | -| Browser streaming playback | Delta/cumulative bugs, chunk boundary glitches, TTFP regressions | Gradio demo over `/v1/audio/speech?stream=true` | -| Concurrent requests | Per-request state leaks, codec window round-robin gaps | `max_num_seqs>1` smoke test with 4+ parallel prompts | - -Declaring a model "done" without all three has shipped regressions more than once. - -### I5. Per-request state is owned by the request, not the model - -If the model caches *anything* across `forward()` calls (streaming generators, codec buffers, sliding-window pads, CUDA graph state), key it by request ID: - -```python -self._state: dict[str, YourState] = {} # request_key → state -# fetch: request_key = str(info.get("_omni_req_id", "0")) -# free on finish: del self._state[request_key] -``` - -A shared buffer silently corrupts audio across concurrent requests — the symptom is crosstalk or truncation only under load. - -## Phase 1: HuggingFace Reference - -**Goal**: Understand the reference implementation and verify it produces correct audio. - -### Steps - -1. **Run the reference model** end-to-end using the official HuggingFace / GitHub code -2. **Document the architecture**: - - What are the sub-models? (AR decoder, codec decoder, vocoder, etc.) - - What is the token vocabulary? (semantic codes, RVQ codebooks, special tokens) - - What is the output format? (sample rate, channels, codec type) -3. **Capture reference outputs** for comparison during integration -4. **Identify the config structure**: `config.json` fields, `model_type`, sub-model configs - -### Key Questions - -- How many codebooks? What are the codebook sizes? -- What special tokens exist? (`<|voice|>`, `<|audio_start|>`, `<|im_end|>`, etc.) -- What is the token-to-ID mapping for codec codes? 
-- What is the hop length / frame rate of the codec? -- Does the model support voice cloning? How? (reference audio encoding, speaker embeddings, etc.) - -### Deliverables - -- Working reference script that produces audio -- Architecture diagram / notes -- Token vocabulary mapping -- Reference audio samples for regression testing - -## Phase 2: Stage Separation (Offline Inference) - -**Goal**: Split the model into vLLM-Omni stages and get offline inference working. - -### Steps - -1. **Register the model** in `vllm_omni/model_executor/models/registry.py` -2. **Create config classes** (`configuration_.py`) with `model_type` registration -3. **Implement Stage 0** (AR model): - - Subclass appropriate base (e.g., wrap Qwen3 decoder layers) - - Implement `forward()` for autoregressive token generation - - Handle special token logic (start/stop tokens, codec token mapping) - - If dual-AR (like Fish Speech), implement Fast AR as a nested module -4. **Implement Stage 1** (Decoder): - - Load codec weights (may need lazy loading from separate checkpoint) - - Implement `forward()`: codec codes -> audio waveform - - Return `OmniOutput` with `multimodal_outputs` -5. **Create stage config YAML** defining both stages, memory allocation, and model paths -6. **Create stage input processor** for prompt building -7. **Write end2end.py** test script - -### Critical Parameters to Get Right - -| Parameter | Impact if Wrong | -|-----------|----------------| -| Hop length | Audio duration wrong, streaming noise | -| Token ID mapping | Garbage codes -> noise output | -| Codebook count/size | Shape mismatch crashes | -| Stop token | Generation never stops or stops too early | -| dtype / autocast | Numerical issues, silent quality degradation | -| Repetition penalty | Must match reference (often 1.0 for TTS) | - -### Debugging Priority (from experience) - -When audio output is wrong, check in this order: - -1. **RoPE / attention**: Are position encodings correct? Is the attention mask right? 
-2. **Normalization**: RMSNorm epsilon, layer norm placement (pre vs post) -3. **Hop length**: Product of all upsample rates in the codec decoder -4. **Token mapping**: Are codec IDs correctly offset from the vocabulary base? -5. **Sampling parameters**: Temperature, top_k, top_p, repetition_penalty -6. **Tensor layout**: Codebook-major vs frame-major ordering -7. **dtype**: Float32 for codec decoders (autocast can corrupt audio) - -### Streaming Correctness Rules (single-stage and two-stage) - -These bugs appear in almost every new TTS PR. Check all before the first push. See also the cross-cutting invariants I1 (output contract) and I5 (per-request state) above — the rules below are the Phase 2-specific instances of those invariants: - -- **Accumulate codes across AR steps** — each `forward()` appends new codes; do not reset between steps or audio will be truncated (fish speech: `fix: accumulate audio_codes across steps`) -- **Emit delta audio, not full waveform** — in streaming mode yield only the new chunk per step, not the re-decoded full waveform from step 0 (fish speech: `fix: emit delta audio not full waveform`) -- **All return paths must emit `model_outputs`** — if any early-return branch skips setting `model_outputs`, the serving layer silently drops that step's audio (fish speech: `fix: ensure ALL return paths emit model_outputs`) -- **Per-request state isolation** — for batched concurrent requests, key all state by request ID; a shared buffer corrupts audio across requests (fish speech: `fix: per-request vocode + delta emission`) -- **Codec tensor device** — move codec codes to the codec decoder's device before calling decode; mismatches cause silent CPU fallback or crashes (fish speech: `fix: use model device for CUDA stream`) -- **AR stage `max_num_seqs`** — set to at least 4 in production configs; for single-stage models this is the only stage. 
For two-stage models, Stage 0 (AR) needs `max_num_seqs ≥ 4` to pipeline concurrent requests; Stage 1 (codec decoder) typically uses `max_num_seqs: 1` intentionally. Default of 1 everywhere causes audio gaps under concurrency because the codec window round-robins across requests (RFC #2568) - -### Optional Dependency Handling - -Patch optional dependencies (`torchaudio` / `torchcodec` / `soundfile`) at -the top of `load_weights()`, not at module import. Failures to do so cause -cryptic errors only on environments missing the optional package — after -the model is already deployed. See -[references/optional-deps.md](references/optional-deps.md) for the full -pattern, signature constraints, and MOSS-TTS-Nano reference. - -### Single-Stage AR Pattern (alternative to two-stage) - -When the upstream model cannot be cleanly split into an AR stage and a -separate decoder, run the full pipeline inside a single AR worker and -stream audio through a per-request `inference_stream()` generator keyed by -`_omni_req_id`. Stage config must set `worker_type: ar`, -`engine_output_type: audio`, `final_output: true`, `is_comprehension: true`, -and `async_chunk: false` at the top level. Only extract params from -`additional_information` that you actually forward, or pre-commit fails -`ruff F841`. - -Full walkthrough with the complete `forward()` / `_create_stream_gen()` -skeleton and stage-config fields: -[references/single-stage-ar.md](references/single-stage-ar.md). For an -in-tree reference, look for any single-stage AR model under -`vllm_omni/model_executor/models/` — e.g. the MOSS-TTS-Nano integration when -it lands. - -**VoxCPM2 is a different pattern** and should not reuse this skeleton — it -runs the base LM under vLLM PagedAttention with external side-computation. -See `plan/voxcpm2_native_ar_design.md`. 
- -### Deliverables - -- Model files in `vllm_omni/model_executor/models//` -- Stage config YAML -- Working `end2end.py` with correct audio output -- README.md in the example directory - -## Phase 3: Online Serving - -**Goal**: Expose the model via `/v1/audio/speech` API endpoint. - -### Steps - -1. **Register in `serving_speech.py`** — add all 5 points in a **single commit**; - partial integration causes hard-to-debug failures. This file is modified by every - model PR and is the most common source of rebase conflicts — see conflict note below. - - **Point 1** — stage constant (near the top, alongside the other `_*_TTS_MODEL_STAGES` sets): - ```python - _YOUR_MODEL_TTS_MODEL_STAGES = {"your_stage_key"} - ``` - - **Point 2** — union into `_TTS_MODEL_STAGES`: - ```python - _TTS_MODEL_STAGES: set[str] = ( - ... - | _YOUR_MODEL_TTS_MODEL_STAGES - ) - ``` - - **Point 3** — model type detection in `_detect_tts_model_type()`: - ```python - if model_stage in _YOUR_MODEL_TTS_MODEL_STAGES: - return "your_model" - ``` - - **Point 4** — validation dispatch in `_validate_tts_request()`: - ```python - if self._tts_model_type == "your_model": - return self._validate_your_model_request(request) - ``` - - **Point 5** — validation + parameter-builder methods: - ```python - def _validate_your_model_request(self, request) -> str | None: - if not request.input or not request.input.strip(): - return "Input text cannot be empty" - return None - - def _build_your_model_params(self, request) -> dict: - params = {"text": [request.input]} - if request.voice is not None: - params["voice"] = [request.voice] - return params - ``` - Wire `_build_your_model_params` into `_create_tts_request()` alongside the other - model-specific param builders. 
- - > **Two dispatch patterns coexist**: Fish Speech uses a `self._is_fish_speech` boolean - > instance attribute checked before `elif self._is_tts`, while all newer models - > (CosyVoice3, MOSS-TTS-Nano) use the `_tts_model_type` string returned by - > `_detect_tts_model_type()`. For new models, always use the `_tts_model_type` string - > pattern — do not add new `_is_*` flags. - - > **Unused variable rule**: only extract fields in `_build_your_model_params` that - > are actually forwarded to the model. Unused extractions fail `ruff F841`. - > For voice-cloning fields (`ref_audio` → `prompt_audio_path`, `ref_text` → - > `prompt_text`), add them to the param builder and verify they reach the model call. - - **Rebase conflict note**: when rebasing onto `main` after another model was merged, - `serving_speech.py` will conflict. Resolution: always keep *both* the upstream - model's additions and your own — never discard either side. - -2. **Handle model-specific parameters**: - - Voice cloning: `ref_audio` encoding and prompt injection - - `max_new_tokens` override in sampling params - - Model-specific default values -3. **Create client scripts**: `speech_client.py`, `run_server.sh` -4. **Test all response formats**: wav, mp3, flac, pcm -5. 
**Add Gradio demo**: Interactive web UI with streaming support - -### Voice Cloning Pattern - -```python -import base64 -from pathlib import Path - -def build_voice_clone_prompt(ref_audio_path: str, text: str, codec) -> list: - """Build prompt with reference audio for voice cloning in serving_speech.py.""" - audio_bytes = Path(ref_audio_path).read_bytes() - codes = codec.encode(audio_bytes) # Encode on CPU using model's codec (e.g., DAC) - token_ids = [code + codec.vocab_offset for code in codes.flatten().tolist()] - return [ - {"role": "system", "content": f"<|voice|>{''.join(chr(t) for t in token_ids)}"}, - {"role": "user", "content": text}, - ] -``` - -### Deliverables - -- Updated `serving_speech.py` with all 5 integration points (single commit) -- Client scripts and server launcher -- Gradio demo with streaming and voice cloning UI -- E2E online serving test (`tests/e2e/online_serving/test_.py`) -- Buildkite CI entry in `.buildkite/test-merge.yml` -- Documentation (offline + online serving docs) - -### E2E test pitfalls to avoid - -- **One `OmniServerParams` set per file.** `omni_server` is module-scoped; a second - id in the same file forces mid-module teardown/restart and exposes startup - races (`APIConnectionError` on the first request post-restart). Split variants - into separate files instead. -- **No external URL fetches from the server.** CI and some dev hosts can't - reach `raw.githubusercontent.com` over TLS. Inline ref audio as - `data:audio/wav;base64,...`; the serving layer accepts both URL and data URL. -- **Use the harness readiness gate.** The fixture waits for HTTP 200 on - `/health`; don't add `time.sleep` in tests. If warmup is incomplete, make - `/health` return non-200 until you're actually ready. -- **Mark with `@pytest.mark.core_model` + `hardware_test(res={"cuda": "H100"})`** - so the test lands in `test-ready.yml` (triggered by the `ready` label) rather - than only nightly. 
- -## Phase 4: Async Chunk (Streaming) - -**Goal**: Enable inter-stage streaming so audio chunks are produced while AR generation continues. - -### Steps - -1. **Update stage config YAML**: - ```yaml - async_chunk: true - codec_chunk_frames: 25 # frames per chunk - codec_left_context_frames: 25 # overlap for smooth boundaries - ``` -2. **Implement chunk handling in Stage 1**: - - Accept partial input (chunk of codec codes) - - Handle left context for smooth audio boundaries - - Return partial audio in `OmniOutput` -3. **Test streaming**: - - Verify audio quality matches non-streaming output - - Check for artifacts at chunk boundaries - - Measure TTFA (time to first audio) -4. **Update online serving** to support `stream=true` with PCM output - -### Streaming Architecture - -``` -Stage 0 (AR) Stage 1 (Decoder) - | | - |-- chunk 0 (25 frames) ------> decode -> audio chunk 0 -> client - |-- chunk 1 (25 frames) ------> decode -> audio chunk 1 -> client - |-- chunk 2 (25 frames) ------> decode -> audio chunk 2 -> client - ... -``` - -### Key Considerations - -- **Left context overlap**: Prevents audible artifacts at chunk boundaries -- **Hop length matters**: `context_audio_samples = context_frames * hop_length` -- **First chunk latency**: Can use larger initial chunk for better quality, then smaller chunks - -### Deliverables - -- Updated stage config with async_chunk enabled -- Smooth streaming audio without boundary artifacts -- TTFA metrics - -## Phase 5: CUDA Graph Acceleration - -**Goal**: Capture the AR loop as a CUDA graph for significant speedup. - -### Steps - -1. **Identify the hot loop**: The AR decoding loop that runs N steps per token -2. **Create static buffers**: - - KV caches with fixed max sequence length - - Pre-built causal masks and position tensors per step - - Static input/output tensors -3. **Implement graph capture**: - - Warm up with real data - - Capture the forward pass - - Replay with updated inputs -4. 
**Handle constraints**: - - Use `torch.argmax` instead of `torch.multinomial` (graph-safe) - - Fixed batch size (fall back to eager for other sizes) - - No dynamic control flow inside the graph - -See [references/cuda-graph-example.md](references/cuda-graph-example.md) for -a worked skeleton (Qwen3-TTS code predictor, 16-step AR loop), performance -expectations (3–5× on the graphed component for fixed batch_size=1), and the -graph-safety constraints you must honor inside the captured region. - -### Deliverables - -- CUDA graph implementation for the AR hot loop -- Benchmark script comparing eager vs graph performance -- Documentation of constraints and fallback behavior - -## Phase 6: Pre-commit and DCO - -**Goal**: Every commit passes `pre-commit` lint and carries a DCO -`Signed-off-by` line that matches the author email. - -- Install hooks once: `pre-commit install`. -- Run `pre-commit run --files ` before every push; accept any - auto-fixes, stage, re-commit. -- Sign every commit with `git commit -s`. DCO checks that author email and - `Signed-off-by` email match — `git config user.email` must match your - GitHub account email. - -Common pre-commit failures, recovery commands for missing sign-off, and the -full `pre-commit run` invocation for a TTS model: -[references/precommit-dco.md](references/precommit-dco.md). 
- -## Integration Checklist - -Use this checklist when integrating a new TTS model: - -### Cross-Cutting Invariants (verify at end of every phase) -- [ ] I1: `forward()` docstring states cumulative vs delta; consolidation path audited end-to-end -- [ ] I2: Tests / examples / benchmarks never use `dict.get(a) or dict.get(b)` on tensor values; list form handled -- [ ] I3: No `.item()` / `.cpu()` / Python branch on tensor values inside per-step loops -- [ ] I4: Offline RTF, browser streaming playback, and concurrent-request smoke test all pass -- [ ] I5: Any cross-step cache keyed by `_omni_req_id`; entries freed when the request finishes - -### Phase 1: HF Reference -- [ ] Reference model runs and produces correct audio -- [ ] Architecture documented (stages, codebooks, tokens, sample rate) -- [ ] Reference audio samples saved for comparison - -### Phase 2: Stage Separation -- [ ] Model registered in `registry.py` -- [ ] Config classes created with `model_type` registration -- [ ] Stage 0 (AR) implemented and generates correct tokens -- [ ] Stage 1 (Decoder) produces correct audio from tokens — dtype float32 for codec decoder -- [ ] Stage 0 (AR) `max_num_seqs` ≥ 4 in production config (default 1 causes gaps under concurrency); Stage 1 (codec decoder) typically stays at `max_num_seqs: 1` -- [ ] Optional dependency fallbacks handled at `load_weights()` time (torchaudio/soundfile/etc.) 
-- [ ] Streaming: codec codes accumulated across AR steps (not reset per step) -- [ ] Streaming: delta audio emitted per chunk, not full re-decoded waveform -- [ ] Streaming: all `forward()` return paths emit `model_outputs` -- [ ] Streaming: per-request state keyed by request ID (not shared across requests) -- [ ] Streaming: codec tensors moved to codec decoder device before decode -- [ ] Stage config YAML created -- [ ] `end2end.py` produces audio matching reference quality -- [ ] README.md written - -### Phase 3: Online Serving -- [ ] All 5 `serving_speech.py` integration points added in one commit -- [ ] Only extract params in `_build_*_params` that are forwarded to the model call (ruff F841) -- [ ] Prompt builder handles text input correctly -- [ ] Voice cloning works (if supported) -- [ ] All response formats work (wav, mp3, flac, pcm) -- [ ] Client scripts and server launcher created -- [ ] E2E online serving test written (`tests/e2e/online_serving/test_.py`) -- [ ] Buildkite CI entry added to `.buildkite/test-merge.yml` -- [ ] Gradio demo working -- [ ] Documentation added (offline + online docs, nav, supported models) - -### Phase 4: Async Chunk -- [ ] Stage config updated with `async_chunk: true` -- [ ] Stage 1 handles partial chunks correctly -- [ ] No audio artifacts at chunk boundaries -- [ ] Streaming via API (`stream=true`) works -- [ ] TTFA measured and acceptable - -### Phase 5: CUDA Graph -- [ ] Hot loop identified and profiled -- [ ] Static buffers allocated -- [ ] Graph captured and replays correctly -- [ ] Benchmark shows meaningful speedup -- [ ] Fallback to eager works for unsupported configs - -### Phase 6: Pre-commit and DCO -- [ ] `pre-commit run --files ` passes before every push -- [ ] Every commit has `Signed-off-by` matching the author email (`git commit -s`) -- [ ] `git config user.email` matches the email registered on your GitHub account -- [ ] Details and failure-recovery commands: 
[references/precommit-dco.md](references/precommit-dco.md) - -## References - -In-skill references (details split out of the main body): - -- [references/single-stage-ar.md](references/single-stage-ar.md) — full `forward()` / generator skeleton for the MOSS-TTS-Nano-style pattern -- [references/optional-deps.md](references/optional-deps.md) — torchaudio / torchcodec fallback pattern -- [references/cuda-graph-example.md](references/cuda-graph-example.md) — Qwen3-TTS code-predictor CUDA graph skeleton -- [references/precommit-dco.md](references/precommit-dco.md) — full pre-commit invocation, failure table, DCO recovery - -Project docs and adjacent skills: - -- [TTS audio skill](../vllm-omni-audio-tts/SKILL.md) — supported models and usage -- [Fish Speech integration](../vllm-omni-audio-tts/references/fish-speech.md) — complete example of Phases 1–3 -- [Qwen3-TTS reference](../vllm-omni-audio-tts/references/qwen-tts.md) — complete example of Phases 1–5 -- [Adding a TTS model (developer guide)](https://github.com/vllm-project/vllm-omni/blob/main/docs/contributing/model/adding_tts_model.md) -- `plan/voxcpm2_native_ar_design.md` — VoxCPM2's vLLM-native AR + side-computation pattern (distinct from the generator-based single-stage described above) diff --git a/.claude/skills/add-tts-model/references/cuda-graph-example.md b/.claude/skills/add-tts-model/references/cuda-graph-example.md deleted file mode 100644 index 6f4993b5c4c..00000000000 --- a/.claude/skills/add-tts-model/references/cuda-graph-example.md +++ /dev/null @@ -1,42 +0,0 @@ -# CUDA Graph Example: Qwen3-TTS Code Predictor - -Reference sketch for capturing the 16-step code-predictor AR loop as a single -CUDA graph. Adapt the shapes, number of steps, and KV-head layout to your -model. 
- -```python -import torch - -class CodePredictorGraph: - """Captures the 16-step code predictor AR loop as a single CUDA graph.""" - - def setup_graph(self, device: torch.device, kv_heads: int = 4, head_dim: int = 64): - self.num_steps = 16 - self.kv_cache = torch.zeros(1, kv_heads, self.num_steps, head_dim, device=device) - self.positions = torch.arange(self.num_steps, device=device) - self.causal_mask = torch.tril(torch.ones(self.num_steps, self.num_steps, device=device)) - self.input_buf = torch.zeros(1, 1, kv_heads * head_dim, device=device) - self.output_buf = torch.zeros(1, self.num_steps, device=device, dtype=torch.long) - # Warm up, then: self.graph = torch.cuda.CUDAGraph(); capture the loop inside `with torch.cuda.graph(self.graph): ...` - - def run_graph(self, initial_input: torch.Tensor) -> torch.Tensor: - self.input_buf.copy_(initial_input) - self.graph.replay() - return self.output_buf.clone() -``` - -## Performance expectations (Qwen3-TTS code predictor) - -- **3–5× speedup** on the graphed component. -- Effective only for fixed batch sizes (typically `batch_size=1`). -- Fall back to eager for any shape/config that wasn't captured — do not try - to recapture per request. - -## Graph-safety constraints - -- `torch.argmax` instead of `torch.multinomial`. -- Fixed batch size. -- No Python control flow that branches on tensor values inside the captured - region (use `torch.where` / masks). -- No `.item()`, `.cpu()`, `.tolist()` — each would break the capture or - cause a GPU→CPU sync during replay. diff --git a/.claude/skills/add-tts-model/references/optional-deps.md b/.claude/skills/add-tts-model/references/optional-deps.md deleted file mode 100644 index 0a55f30f05c..00000000000 --- a/.claude/skills/add-tts-model/references/optional-deps.md +++ /dev/null @@ -1,47 +0,0 @@ -# Optional Dependency Handling - -Models that rely on `torchaudio`, `torchcodec`, `soundfile`, or other optional -packages must handle the missing-package case up front, at the top of -`load_weights()`, rather than lazily at the first audio call (and not at -module import time). 
Failing to do this causes cryptic errors only on environments without -the optional package — after the model is already deployed. - -## Pattern (used in MOSS-TTS-Nano) - -```python -def _patch_torchaudio_load() -> None: - """Fallback torchaudio.load/save to soundfile if torchcodec is unavailable.""" - try: - import torchcodec # noqa: F401 - return # torchcodec present, torchaudio works as-is - except ImportError: - pass - - import soundfile as sf - import torchaudio - - def _sf_load(path, **kwargs): - data, sr = sf.read(str(path), dtype="float32", always_2d=True) - return torch.from_numpy(data).T, sr - - torchaudio.load = _sf_load - # patch .save similarly if needed -``` - -## Rules - -- Mirror the full signature of the replaced function. `torchaudio.load` - accepts `frame_offset`, `num_frames`, `normalize`, `channels_first`, - `format` — missing any of them causes `TypeError` from calling code. -- Catch `except Exception`, not just `ImportError`. `import torchaudio` - itself can fail with non-`ImportError` errors on broken installs. Log the - exception type and message (`logger.warning("torchaudio probe failed: %s: %s", - type(exc).__name__, exc)`) before falling back, so unrelated errors are not - silently swallowed. -- Call the patch function at the top of `load_weights()` before loading any - audio assets. Do not call it at module import time. - -## Reference implementation - -Any in-tree model that patches `torchaudio.load` in its `load_weights()` — -e.g. MOSS-TTS-Nano's `modeling_moss_tts_nano.py` once that integration -lands. diff --git a/.claude/skills/add-tts-model/references/precommit-dco.md b/.claude/skills/add-tts-model/references/precommit-dco.md deleted file mode 100644 index 86a1f42cefb..00000000000 --- a/.claude/skills/add-tts-model/references/precommit-dco.md +++ /dev/null @@ -1,54 +0,0 @@ -# Pre-commit and DCO - -Every commit must pass `pre-commit` lint and carry a `Signed-off-by` line -that matches the commit author email. 
- -## Pre-commit - -Install hooks once: - -```bash -pre-commit install -``` - -Run before every push on the files you changed: - -```bash -pre-commit run --files \ - vllm_omni/model_executor/models//*.py \ - vllm_omni/entrypoints/openai/serving_speech.py \ - vllm_omni/model_executor/models/registry.py \ - tests/e2e/offline_inference/test_.py \ - tests/e2e/online_serving/test_.py -``` - -When pre-commit **modifies files** (ruff format auto-fix), it exits non-zero -but the changes are correct — stage the modified files and re-commit. - -| Failure | Root cause | Fix | -|---------|-----------|-----| -| `ruff F841` | Variable extracted but never forwarded to model call | Remove the extraction or wire it through | -| `ruff E402` | Import added below function definitions | Move to top-level import block | -| `ruff format` | Line length, spacing, quote style | Accept auto-fix, stage, re-commit | - -## DCO sign-off - -Every commit must carry `Signed-off-by: Your Name `. Use -`-s`: - -```bash -git commit -s -m "feat(): add TTS support" -``` - -Git has no setting that makes `git commit` sign off by default -(`format.signOff` only affects `git format-patch`); if you want a shortcut, -define an alias such as `git config alias.cs 'commit -s'`. - -The DCO check verifies that the commit author email matches the -`Signed-off-by` line. Confirm `git config user.email` matches your GitHub -account email before committing. - -Fix a missing or mismatched sign-off on the latest commit: - -```bash -git commit --amend -s --no-edit -git push origin --force-with-lease -``` diff --git a/.claude/skills/add-tts-model/references/single-stage-ar.md b/.claude/skills/add-tts-model/references/single-stage-ar.md deleted file mode 100644 index ed53d30261c..00000000000 --- a/.claude/skills/add-tts-model/references/single-stage-ar.md +++ /dev/null @@ -1,108 +0,0 @@ -# Single-Stage AR Pattern - -When the upstream model cannot be cleanly split into an AR stage and a separate -decoder (e.g. 
MOSS-TTS-Nano, or any model that bundles AR + codec via an -`inference_stream()` generator), run the whole pipeline inside a single AR -worker that yields audio chunks per request. - -This is distinct from VoxCPM2's pattern, which also runs in a single stage but -uses vLLM's native PagedAttention on the base language model with diffusion / -VAE side-computation outside vLLM — see -`plan/voxcpm2_native_ar_design.md` for that variant. - -## Implementation - -1. **Single model file** — load both AR LM and codec inside - `modeling_.py`. -2. **Load weights in `load_weights()`**, not `__init__()` — vLLM initializes - distributed state before any CUDA allocations. -3. **Stream via a per-request generator** stored in `self._stream_gens`: - -```python -class YourModelForCausalLM(nn.Module): - def __init__(self, *, vllm_config, prefix=""): - super().__init__() - self._lm = None # populated in load_weights() - self._stream_gens: dict = {} # request_key → generator - - def load_weights(self, weights): - # Load self._lm here, after vLLM distributed init - ... - - def forward( - self, - input_ids, - positions, - intermediate_tensors=None, - inputs_embeds=None, - runtime_additional_information: list[dict] | None = None, # one dict per request - **kwargs, - ) -> OmniOutput: - infos = runtime_additional_information or [{}] - # Skip dummy/profiling calls - if not runtime_additional_information or all(i.get("_is_dummy") for i in infos): - self._ar_emit_stop_token = True - return OmniOutput(...) 
# return empty outputs - - outputs, last_flags = [], [] - for info in infos: - request_key = str(info.get("_omni_req_id", "0")) # per-request ID from vLLM - if request_key not in self._stream_gens: - self._stream_gens[request_key] = self._create_stream_gen(info) - try: - chunk, is_last = next(self._stream_gens[request_key]) - except StopIteration: - chunk, is_last = torch.zeros(0), True - if is_last: - del self._stream_gens[request_key] - outputs.append(chunk) - last_flags.append(is_last) - - self._ar_emit_stop_token = all(last_flags) - return OmniOutput(multimodal_outputs={"model_outputs": outputs, ...}) - - def _create_stream_gen(self, info: dict): - """Yield (waveform_tensor, is_last) tuples from inference_stream().""" - for event in self._lm.inference_stream(...): - if event["type"] == "audio": - yield event["waveform"], False - elif event["type"] == "result": - # Fallback: some models emit a single "result" event instead of - # incremental "audio" events — handle both paths - yield event.get("waveform", torch.zeros(0)), True - return - yield torch.zeros(0), True - - def compute_logits(self, hidden_states, sampling_metadata): - # Emit EOS only after the last chunk so the AR scheduler ends the request - ... -``` - -## Key points - -- `runtime_additional_information` is the correct parameter name (not - `**kwargs`) — it carries one dict per request in the batch. -- The request ID is `info.get("_omni_req_id")` — set by vLLM, not by user code. -- Handle both `"audio"` (incremental) and `"result"` (final combined) event - types from upstream models. - -## Stage config - -Single stage with `worker_type: ar`, `engine_output_type: audio`, -`final_output: true`, `is_comprehension: true`, and `async_chunk: false` at -the top level. Omitting any of these causes silent misclassification in the -serving layer. 
- -## Lint discipline - -Only extract variables from `additional_information` that you actually -forward to the model call — unused extractions trip `ruff F841` in -pre-commit. - -## Reference implementation - -Look for any single-stage AR model under -`vllm_omni/model_executor/models/` — e.g. `moss_tts_nano/` when its -integration lands. If none is in tree yet, follow the skeleton above and -cross-check against the `is_comprehension: true` / `async_chunk: false` -dispatch in `vllm_omni/entrypoints/openai/serving_speech.py`. diff --git a/.claude/skills/readme.md b/.claude/skills/readme.md deleted file mode 100644 index b66f2ecd131..00000000000 --- a/.claude/skills/readme.md +++ /dev/null @@ -1,34 +0,0 @@ -# Claude Skills for vLLM-Omni - -This directory contains Claude Code skills maintained for the `vllm-omni` -repository. These skills capture repeatable workflows for common contributor -tasks such as model integration, pull request review, and release note -generation. - -## Directory Structure - -Each skill lives in its own directory under `.claude/skills/`. A skill may -include: - -- `SKILL.md`: the main workflow and operating instructions -- `references/`: focused reference material used by the skill -- `scripts/`: small helper scripts used by the skill - -## Available Skills - -- `add-diffusion-model`: guides integration of a new diffusion model into - `vllm-omni` -- `add-omni-model`: covers addition of new omni-modality model support -- `add-tts-model`: covers integration of new TTS models and related serving - workflows -- `generate-release-note`: helps prepare release notes for repository changes -- `review-pr`: provides a structured workflow for reviewing pull requests - -## Maintenance Guidelines - -- Keep skill names short and task-oriented. -- Prefer repository-local paths, commands, and examples. -- Avoid hardcoding fast-changing support matrices unless the skill is actively - maintained alongside those changes. 
-- Treat skills as contributor tooling: optimize for clarity, actionability, and - low maintenance overhead. diff --git a/.claude/skills/vllm-omni-npu-upgrade/SKILL.md b/.claude/skills/vllm-omni-npu-upgrade/SKILL.md deleted file mode 100644 index 1ef7ab39301..00000000000 --- a/.claude/skills/vllm-omni-npu-upgrade/SKILL.md +++ /dev/null @@ -1,300 +0,0 @@ ---- -name: vllm-omni-npu-model-runner-upgrade -description: "Upgrade vllm-omni NPU model runners (OmniNPUModelRunner, NPUARModelRunner, NPUGenerationModelRunner) to align with the latest vllm-ascend NPUModelRunner while preserving omni-specific logic." ---- - -# vLLM-Omni NPU Model Runner Upgrade Skill - -## Overview - -This skill guides the process of upgrading vllm-omni's NPU model runners to align with the latest vllm-ascend codebase while preserving omni-specific enhancements. The NPU runners are designed to run omni multimodal models (like Qwen3-Omni, Bagel, MiMoAudio) on Ascend NPUs. - -## File Structure - -### NPU Model Runner Files -``` -vllm-omni/vllm_omni/platforms/npu/worker/ -├── __init__.py -├── npu_model_runner.py # OmniNPUModelRunner (base class) -├── npu_ar_model_runner.py # NPUARModelRunner (autoregressive) -├── npu_ar_worker.py # AR worker -├── npu_generation_model_runner.py # NPUGenerationModelRunner (diffusion/non-AR) -└── npu_generation_worker.py # Generation worker -``` - -### GPU Reference Files (for omni-specific logic sync) -``` -vllm-omni/vllm_omni/worker/ -├── __init__.py -├── gpu_model_runner.py # OmniGPUModelRunner -├── gpu_ar_model_runner.py # GPUARModelRunner -├── gpu_ar_worker.py -├── gpu_generation_model_runner.py -├── gpu_generation_worker.py -├── mixins.py -├── base.py -└── gpu_memory_utils.py -``` - -### vllm-ascend Reference Files -``` -vllm-ascend/vllm_ascend/worker/ -├── model_runner_v1.py # NPUModelRunner (base class to copy from) -├── npu_input_batch.py -├── block_table.py -├── pcp_utils.py -└── worker.py -``` - -## Inheritance Hierarchy - -``` - GPUModelRunner (vllm) - | - 
+----------------+----------------+ - | | - OmniGPUModelRunner NPUModelRunner (vllm-ascend) - (vllm_omni/worker) (vllm_ascend/worker) - | | - +----------- OmniNPUModelRunner --+ - (multiple inheritance) - | - +---------------+---------------+ - | | - NPUARModelRunner NPUGenerationModelRunner - (autoregressive) (non-autoregressive/diffusion) -``` - -## Omni-Specific Comment Markers - -Omni-specific logic is marked with comment blocks: -```python -# -------------------------------------- Omni-new ------------------------------------------------- -# ... omni-specific code ... -# -------------------------------------- Omni-new ------------------------------------------------- -``` - -Or simpler variations: -```python -# -------------------------------------- Omni-new ------------------------------------------------- -# ------------------------------------------------------------------------------------------------ -``` - -**Important**: -- Always preserve and add these markers when modifying code. -- **The reference documents (`references/omni-specific-blocks.md`) may not be up-to-date.** Always grep for `Omni-new` in the GPU implementations to find the authoritative list of omni-specific blocks. -- When you discover new omni-specific code that is not documented in the references, please update the reference files. 
- -## Key Methods Requiring Attention - -### OmniNPUModelRunner (npu_model_runner.py) - -| Method | Description | Omni-Specific Logic | -|--------|-------------|---------------------| -| `load_model` | Load model and initialize talker_mtp | Uses `ACLGraphWrapper` instead of `CUDAGraphWrapper`, initializes talker buffers | -| `_dummy_run` | Warmup/profiling run | talker_mtp dummy forward, `extract_multimodal_outputs` | -| `_model_forward` | Forward pass wrapper | Injects `model_kwargs_extra`, wraps with `OmniOutput`, NPU-specific graph updates | -| `_talker_mtp_forward` | Talker MTP forward for Qwen3-Omni | Uses `set_ascend_forward_context` | - -### NPUARModelRunner (npu_ar_model_runner.py) - -| Method | Description | Omni-Specific Logic | -|--------|-------------|---------------------| -| `__init__` | Initialize with KV transfer manager | `OmniKVTransferManager` setup | -| `execute_model` | Main inference entry | KV transfer handling, `_update_states` override, `extract_multimodal_outputs` | -| `sample_tokens` | Token sampling | Hidden states extraction, multimodal outputs processing, `OmniModelRunnerOutput` | -| `_resolve_global_request_id` | Request ID resolution | For disaggregated inference | - -### NPUGenerationModelRunner (npu_generation_model_runner.py) - -| Method | Description | Omni-Specific Logic | -|--------|-------------|---------------------| -| `_update_request_states` | Update request states for async chunk | async_chunk handling | -| `execute_model` | Generation forward | async_chunk, `seq_token_counts`, `_run_generation_model` | -| `sample_tokens` | Output processing | multimodal output packaging to `OmniModelRunnerOutput` | -| `_dummy_run` | Dummy run override | model_kwargs initialization, multimodal extraction | -| `_run_generation_model` | Run generation model | Calls `_model_forward` with sampler | - -## Upgrade Workflow - -### Step 1: Preparation - -1. 
**Identify target versions**(Use gh cli to check): - - We're using vllm-omni main branch - - Check the last release of vllm-omni - - Target vllm-ascend version(Just directly use the local latest vllm-ascend code) - -2. **Check GPU-side changes** (since last release): - ```bash - cd /root/vllm-workspace/vllm-omni - git log --oneline --since="" -- vllm_omni/worker/ - ``` - -3. **Read latest vllm-ascend code**: - - We don't track vllm-ascend changes - just directly use the latest code from `/root/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py` - - Copy the relevant methods and re-insert omni-specific blocks - -### Step 2: Analyze Omni-Specific Logic - -For each NPU model runner file: - -1. **Extract existing omni-specific blocks**: - ```bash - grep -n "Omni-new" vllm_omni/platforms/npu/worker/npu_model_runner.py - ``` - -2. **Document each omni block**: - - Which method it belongs to - - What functionality it provides - - Dependencies on other omni code - -### Step 3: Update Base Class (OmniNPUModelRunner) - -**Note**: Always check the GPU implementation `gpu_model_runner.py` for any new omni logic not yet documented in references. - -1. **Read the latest vllm-ascend `NPUModelRunner.load_model`** -2. **Copy the method, keeping the structure** -3. **Re-insert omni-specific logic** (check GPU `gpu_model_runner.py` for authoritative list): - - Replace `CUDAGraphWrapper` with `ACLGraphWrapper` - - Keep talker_mtp initialization - - Preserve buffer allocations for talker - - Check for any new omni blocks added since last sync - -4. **Update `_dummy_run`**: - - Copy from vllm-ascend - - Compare with GPU `_dummy_run` for omni-specific blocks - - Re-insert all `Omni-new` marked code from GPU version - -5. **Update `_model_forward`**: - - Keep the omni wrapper logic - - Update NPU-specific parts (graph params, SP all-gather) - - Check GPU version for any new omni logic - -### Step 4: Update AR Model Runner - -1. 
**Compare with GPU `gpu_ar_model_runner.py`** for any new omni features -2. **Copy `execute_model` from vllm-ascend** -3. **Re-insert omni blocks** (reference `references/omni-specific-blocks.md`, but note it may be incomplete): - - **IMPORTANT**: Always check the GPU implementation `gpu_ar_model_runner.py` for all `Omni-new` marked code blocks - - The reference doc may not include newly added omni logic - treat it as a starting point, not exhaustive - - When discovering new omni code blocks, please update `references/omni-specific-blocks.md` - - Common omni blocks include but are not limited to: KV transfer, multimodal outputs, sampling_metadata handling, etc. - -4. **Update `sample_tokens`** (also compare with GPU implementation): - - Compare with `gpu_ar_model_runner.py`'s `sample_tokens` method - - Identify all `Omni-new` marked code blocks - - Ensure NPU version includes all omni-specific logic - -### Step 5: Update Generation Model Runner - -**Note**: Generation model runner may have unique omni logic for diffusion/non-AR models. - -1. **Compare with GPU `gpu_generation_model_runner.py`** - grep for all `Omni-new` blocks -2. **Update `execute_model`**: - - Check GPU version for all omni-specific blocks - - Keep async_chunk handling - - Keep `seq_token_counts` injection - - Update forward/context setup from vllm-ascend - - Look for any new omni logic not documented in references - -3. 
**Update `_dummy_run`**: - - Copy from vllm-ascend base - - Compare with GPU `_dummy_run` if exists - - Re-insert all omni-specific logic - -### Step 6: Update Imports - -Check and update imports at the top of each file: - -```python -# Common vllm-ascend imports -from vllm_ascend.ascend_forward_context import get_forward_context, set_ascend_forward_context -from vllm_ascend.attention.attention_v1 import AscendAttentionState -from vllm_ascend.attention.utils import using_paged_attention -from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params -from vllm_ascend.ops.rotary_embedding import update_cos_sin -from vllm_ascend.utils import enable_sp, lmhead_tp_enable -from vllm_ascend.worker.model_runner_v1 import SEQ_LEN_WITH_MAX_PA_WORKSPACE, NPUModelRunner - -# Omni-specific imports -from vllm_omni.model_executor.models.output_templates import OmniOutput -from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner -from vllm_omni.outputs import OmniModelRunnerOutput -from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager -``` - -### Step 7: Sync GPU-Side Omni Changes - -1. **Check recent GPU worker changes**: - ```bash - git diff .. -- vllm_omni/worker/gpu_model_runner.py - git diff .. -- vllm_omni/worker/gpu_ar_model_runner.py - ``` - -2. **Identify new omni features** that need to be ported to NPU - -3. **Apply corresponding changes** to NPU runners - -### Step 8: Validation - -1. **Run type checking**: - ```bash - cd /root/vllm-workspace/vllm-omni - python -m py_compile vllm_omni/platforms/npu/worker/npu_model_runner.py - python -m py_compile vllm_omni/platforms/npu/worker/npu_ar_model_runner.py - python -m py_compile vllm_omni/platforms/npu/worker/npu_generation_model_runner.py - ``` - -2. **Run import test**: - ```bash - python -c "from vllm_omni.platforms.npu.worker import *" - ``` - -3. 
**Run model serving test** (if hardware available): - ```bash - vllm serve --trust-remote-code - ``` - -## Common Pitfalls - -### 1. Forward Context Differences -- GPU uses `set_forward_context` -- NPU uses `set_ascend_forward_context` -- Parameters may differ slightly - -### 2. Graph Wrapper Differences -- GPU: `CUDAGraphWrapper` -- NPU: `ACLGraphWrapper` -- Constructor parameters may differ - -### 3. Buffer Creation -- GPU: `_make_buffer` returns different structure -- NPU: May need numpy=True/False parameter - -### 4. Attention Metadata -- GPU: Uses vllm attention metadata builders -- NPU: Uses `AscendCommonAttentionMetadata` - -### 5. Sampling -- GPU: Uses vllm sampler -- NPU: Uses `AscendSampler` - -## Checklist Before Commit - -- [ ] All omni-specific comment markers preserved -- [ ] New omni logic from GPU side synced -- [ ] Imports updated to latest vllm-ascend -- [ ] No `CUDAGraphWrapper` references in NPU code -- [ ] `set_ascend_forward_context` used instead of `set_forward_context` -- [ ] `ACLGraphWrapper` used for talker_mtp wrapping -- [ ] Type hints match vllm-ascend signatures -- [ ] No duplicate code blocks -- [ ] Python syntax valid (py_compile passes) - -## Reference Files for Comparison - -When upgrading, keep these files open for reference: - -1. **vllm-ascend NPUModelRunner**: `/root/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py` -2. **vllm GPUModelRunner**: `/root/vllm-workspace/vllm/vllm/v1/worker/gpu_model_runner.py` -3. 
**vllm-omni OmniGPUModelRunner**: `/root/vllm-workspace/vllm-omni/vllm_omni/worker/gpu_model_runner.py` diff --git a/.claude/skills/vllm-omni-npu-upgrade/references/gpu-to-npu-translation.md b/.claude/skills/vllm-omni-npu-upgrade/references/gpu-to-npu-translation.md deleted file mode 100644 index 89067d37b2d..00000000000 --- a/.claude/skills/vllm-omni-npu-upgrade/references/gpu-to-npu-translation.md +++ /dev/null @@ -1,335 +0,0 @@ -# GPU to NPU Translation Patterns - -This document provides a quick reference for translating GPU code patterns to NPU equivalents when porting omni-specific logic. - -## Import Translations - -### Forward Context -```python -# GPU -from vllm.forward_context import set_forward_context - -# NPU -from vllm_ascend.ascend_forward_context import set_ascend_forward_context -``` - -### Graph Wrapper -```python -# GPU -from vllm.compilation.cuda_graph import CUDAGraphWrapper - -# NPU -from vllm_ascend.compilation.acl_graph import ACLGraphWrapper -``` - -### Attention State -```python -# GPU (no equivalent - uses FlashAttention states directly) - -# NPU -from vllm_ascend.attention.attention_v1 import AscendAttentionState -``` - -### Utilities -```python -# GPU -# (directly use torch.cuda functions) - -# NPU -from vllm_ascend.utils import enable_sp, lmhead_tp_enable -from vllm_ascend.ops.rotary_embedding import update_cos_sin -``` - -## Context Manager Translations - -### Forward Context Setup -```python -# GPU -with set_forward_context( - attn_metadata, - self.vllm_config, - num_tokens=num_tokens_padded, - num_tokens_across_dp=num_tokens_across_dp, - cudagraph_runtime_mode=cudagraph_mode, - batch_descriptor=batch_desc, -): - # forward pass - -# NPU -with set_ascend_forward_context( - attn_metadata, - self.vllm_config, - num_tokens=num_tokens_padded, - num_tokens_across_dp=num_tokens_across_dp, - aclgraph_runtime_mode=cudagraph_mode, # Note: 'aclgraph' not 'cudagraph' - batch_descriptor=batch_desc, - 
num_actual_tokens=scheduler_output.total_num_scheduled_tokens, - model_instance=self.model, -): - # forward pass -``` - -### Graph Capture Context -```python -# GPU -from vllm.compilation.cuda_graph import graph_capture as cuda_graph_capture -with cuda_graph_capture(self.device): - # capture - -# NPU -from vllm_ascend.worker.model_runner_v1 import graph_capture -with graph_capture(self.device): - # capture -``` - -## Graph Wrapper Usage - -### Creating Graph Wrapper -```python -# GPU -if cudagraph_mode.has_full_cudagraphs() and has_separate_talker: - self.talker_mtp = CUDAGraphWrapper( - talker_mtp, - self.vllm_config, - runtime_mode=CUDAGraphMode.FULL - ) - -# NPU -if cudagraph_mode.has_full_cudagraphs() and has_separate_talker: - self.talker_mtp = ACLGraphWrapper( - talker_mtp, - self.vllm_config, - runtime_mode=CUDAGraphMode.FULL - ) -``` - -### Checking Graph Wrapper Type -```python -# GPU -if not isinstance(self.talker_mtp, CUDAGraphWrapper): - _cudagraph_mode = CUDAGraphMode.NONE - -# NPU -if not isinstance(self.talker_mtp, ACLGraphWrapper): - _cudagraph_mode = CUDAGraphMode.NONE -``` - -## Device Operations - -### Synchronization -```python -# GPU -torch.cuda.synchronize() - -# NPU -torch.npu.synchronize() -``` - -### Stream Operations -```python -# GPU -stream = torch.cuda.Stream(device=device) -torch.cuda.current_stream() - -# NPU -stream = torch.npu.Stream(device=device) -torch.npu.current_stream() -``` - -## Attention Metadata - -### State Setting (NPU-specific) -```python -# GPU - handled internally by attention backends - -# NPU - explicit state setting required -self.attn_state = AscendAttentionState.DecodeOnly -if self.speculative_config and self.speculative_config.method == "mtp": - if self.vllm_config.model_config.use_mla: - self.attn_state = AscendAttentionState.SpecDecoding - else: - self.attn_state = AscendAttentionState.ChunkedPrefill -``` - -### Building Attention Metadata -```python -# GPU - uses vllm attention builders - -# NPU - may need 
additional parameters -(attn_metadata, spec_decode_common_attn_metadata) = self._build_attention_metadata( - num_tokens=num_tokens_unpadded, - num_tokens_padded=num_tokens_padded, - num_reqs=num_reqs, - num_reqs_padded=num_reqs_padded, - max_query_len=max_num_scheduled_tokens, - ubatch_slices=ubatch_slices_attn, - logits_indices=logits_indices, - use_spec_decode=use_spec_decode, - num_scheduled_tokens=scheduler_output.num_scheduled_tokens, - num_scheduled_tokens_np=num_scheduled_tokens_np, - cascade_attn_prefix_lens=cascade_attn_prefix_lens, -) -``` - -## Rotary Embedding - -### Update Cos/Sin Cache -```python -# GPU - typically handled inside attention - -# NPU - explicit update required before forward -from vllm_ascend.ops.rotary_embedding import update_cos_sin -update_cos_sin(positions) -``` - -## Sequence Parallelism - -### Enable SP Check -```python -# GPU - use vllm distributed utilities - -# NPU - use vllm-ascend wrapper -from vllm_ascend.utils import enable_sp - -if enable_sp(): - # sequence parallelism enabled -``` - -## Sampler - -### Sampler Type -```python -# GPU - uses vllm sampler -self.sampler = Sampler() - -# NPU - uses AscendSampler -from vllm_ascend.sample.sampler import AscendSampler -self.sampler = AscendSampler() -``` - -## Input Batch - -### Batch Class -```python -# GPU -from vllm.v1.worker.gpu_input_batch import InputBatch - -# NPU -from vllm_ascend.worker.npu_input_batch import NPUInputBatch -``` - -## Graph Parameter Updates - -### Full Graph Params Update (NPU-specific) -```python -# GPU - not needed - -# NPU - required for FULL graph mode -from vllm_ascend.compilation.acl_graph import update_full_graph_params - -forward_context = get_forward_context() -if ( - forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL - and not forward_context.capturing - and not self.use_sparse -): - update_full_graph_params( - self.attn_backend, - self.update_stream, - forward_context, - num_tokens_padded, - self.vllm_config, - 
self.speculative_config, - positions.shape[0], - ) -``` - -## Paged Attention Check - -```python -# GPU - not typically needed - -# NPU -from vllm_ascend.attention.utils import using_paged_attention - -if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config): - seq_lens = SEQ_LEN_WITH_MAX_PA_WORKSPACE -``` - -## Common Method Signature Differences - -### _dummy_run Parameters -```python -# GPU (v0.17.0) -def _dummy_run( - self, - num_tokens: int, - cudagraph_runtime_mode: CUDAGraphMode | None = None, - force_attention: bool = False, - uniform_decode: bool = False, - allow_microbatching: bool = True, - skip_eplb: bool = False, - is_profile: bool = False, - create_mixed_batch: bool = False, - remove_lora: bool = True, - is_graph_capturing: bool = False, - num_active_loras: int = 0, -) -> tuple[torch.Tensor, torch.Tensor]: - -# NPU (v0.17.0) - adds with_prefill, activate_lora -def _dummy_run( - self, - num_tokens: int, - with_prefill: bool = False, - cudagraph_runtime_mode: CUDAGraphMode | None = None, - force_attention: bool = False, - uniform_decode: bool = False, - is_profile: bool = False, - create_mixed_batch: bool = False, - allow_microbatching: bool = True, - skip_eplb: bool = False, - remove_lora: bool = True, - activate_lora: bool = False, - is_graph_capturing: bool = False, - num_active_loras: int = 0, -) -> tuple[torch.Tensor, torch.Tensor]: -``` - -### _model_forward Parameters -```python -# GPU - no num_tokens_padded -def _model_forward( - self, - input_ids: torch.Tensor | None = None, - positions: torch.Tensor | None = None, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **model_kwargs: dict[str, Any], -): - -# NPU - has num_tokens_padded as first parameter -def _model_forward( - self, - num_tokens_padded: int, - input_ids: torch.Tensor | None = None, - positions: torch.Tensor | None = None, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: 
torch.Tensor | None = None, - **model_kwargs: dict[str, Any], -): -``` - -## Quick Reference Table - -| Feature | GPU | NPU | -|---------|-----|-----| -| Graph wrapper | `CUDAGraphWrapper` | `ACLGraphWrapper` | -| Forward context | `set_forward_context` | `set_ascend_forward_context` | -| Runtime mode param | `cudagraph_runtime_mode` | `aclgraph_runtime_mode` | -| Device sync | `torch.cuda.synchronize()` | `torch.npu.synchronize()` | -| Stream | `torch.cuda.Stream` | `torch.npu.Stream` | -| Current stream | `torch.cuda.current_stream()` | `torch.npu.current_stream()` | -| Input batch | `InputBatch` | `NPUInputBatch` | -| Sampler | `Sampler` | `AscendSampler` | -| Attention state | N/A | `AscendAttentionState` | -| RoPE update | N/A | `update_cos_sin()` | diff --git a/.claude/skills/vllm-omni-npu-upgrade/references/omni-specific-blocks.md b/.claude/skills/vllm-omni-npu-upgrade/references/omni-specific-blocks.md deleted file mode 100644 index 8c5d32ab4c1..00000000000 --- a/.claude/skills/vllm-omni-npu-upgrade/references/omni-specific-blocks.md +++ /dev/null @@ -1,374 +0,0 @@ -# Omni-Specific Code Blocks Reference - -This document catalogs omni-specific code blocks in the NPU model runners, making it easier to identify what needs to be preserved during upgrades. - -> **IMPORTANT**: This document may not be complete or up-to-date! 
-> -> - Always grep for `Omni-new` in the GPU implementations (`vllm_omni/worker/`) to find the authoritative list -> - New omni features may be added that are not yet documented here -> - When you discover new omni-specific blocks during an upgrade, please update this document -> - Last verified: Check git history for this file - -## OmniNPUModelRunner (npu_model_runner.py) - -### load_model - Talker MTP Initialization - -```python -def load_model(self, *args, **kwargs) -> None: - NPUModelRunner.load_model(self, *args, **kwargs) - # Initialize enable_sp cache to avoid get_current_vllm_config() error - # in _pad_for_sequence_parallelism during execute_model. - # This is a workaround for vllm-ascend not passing vllm_config to enable_sp(). - enable_sp(self.vllm_config) - # TODO move this model specific logic to a separate class - # TTS model IS the talker (no .talker sub-attr); use getattr to support both Omni and TTS. - talker_mtp = getattr(self.model, "talker_mtp", None) - if talker_mtp is not None: - self.talker_mtp = talker_mtp # type: ignore[assignment] - cudagraph_mode = self.compilation_config.cudagraph_mode - assert cudagraph_mode is not None - # Only wrap talker_mtp in CUDAGraphWrapper for Omni models that - # have a separate .talker sub-module. TTS models' code predictor - # has internal AR loops / torch.multinomial — not graph-safe. - has_separate_talker = getattr(self.model, "talker", None) is not None - if cudagraph_mode.has_full_cudagraphs() and has_separate_talker: - # NOTE: Use ACLGraphWrapper on NPU, not CUDAGraphWrapper - self.talker_mtp = ACLGraphWrapper(talker_mtp, self.vllm_config, runtime_mode=CUDAGraphMode.FULL) - # TTS exposes mtp_hidden_size; Omni uses hf_text_config.hidden_size. 
- hidden_size = int( - getattr(self.model, "mtp_hidden_size", 0) or getattr(self.model_config.hf_text_config, "hidden_size") - ) - max_batch_size = max(self.max_num_reqs, self.compilation_config.max_cudagraph_capture_size) - self.talker_mtp_input_ids = self._make_buffer(max_batch_size, dtype=torch.int32) - self.talker_mtp_inputs_embeds = self._make_buffer( - max_batch_size, hidden_size, dtype=self.dtype, numpy=False - ) - self.last_talker_hidden = self._make_buffer(max_batch_size, hidden_size, dtype=self.dtype, numpy=False) - self.text_step = self._make_buffer(max_batch_size, hidden_size, dtype=self.dtype, numpy=False) -``` - -### _dummy_run - Talker MTP Dummy Forward - -Location: Inside `set_ascend_forward_context` block, before main model forward - -```python -# ---------------------------------------Omni-new---------------------------------------------- -if getattr(self.model, "talker", None) is not None and hasattr(self.model, "talker_mtp"): - num_tokens_padded_talker_mtp = num_tokens_padded - if num_tokens_padded_talker_mtp == self.max_num_tokens: - num_tokens_padded_talker_mtp = self.talker_mtp_input_ids.gpu.shape[0] - outputs = self.talker_mtp( - self.talker_mtp_input_ids.gpu[:num_tokens_padded_talker_mtp], - self.talker_mtp_inputs_embeds.gpu[:num_tokens_padded_talker_mtp], - self.last_talker_hidden.gpu[:num_tokens_padded_talker_mtp], - self.text_step.gpu[:num_tokens_padded_talker_mtp], - ) - self.compilation_config.cache_dir = None -# ---------------------------------------Omni-new---------------------------------------------- -``` - -### _dummy_run - Extract Multimodal Outputs - -Location: After model forward, before dummy_compute_logits - -```python -# ---------------------------------------Omni-new---------------------------------------------- -hidden_states, multimodal_outputs = self.extract_multimodal_outputs(hidden_states) -# ---------------------------------------Omni-new---------------------------------------------- -``` - -### _model_forward - Omni 
Output Wrapping - -```python -def _model_forward( - self, - num_tokens_padded: int, - input_ids: torch.Tensor | None = None, - positions: torch.Tensor | None = None, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **model_kwargs: dict[str, Any], -): - """Override to combine NPUModelRunner's signature with OmniGPUModelRunner's logic.""" - # Omni-specific: build and inject extra model kwargs - model_kwargs_extra = self._build_model_kwargs_extra() - - # Call the model forward (same as NPUModelRunner) - assert self.model is not None - model_output = self.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - **model_kwargs, - **model_kwargs_extra, - ) - - # Omni-specific: wrap output if needed - if not isinstance(model_output, OmniOutput) and hasattr(self.model, "make_omni_output"): - model_output = self.model.make_omni_output(model_output, **model_kwargs_extra) - - # Omni-specific: cache model output for later sample_tokens - self._omni_last_model_output = model_output - - # NPU-specific: update full graph params (keep from vllm-ascend) - forward_context = get_forward_context() - # ... NPU graph update logic ... 
- - # NPU-specific: all-gather for sequence parallelism (keep from vllm-ascend) - if get_forward_context().sp_enabled and not isinstance(model_output, IntermediateTensors): - model_output = self._all_gather_hidden_states_and_aux(model_output) - - return model_output -``` - ---- - -## NPUARModelRunner (npu_ar_model_runner.py) - -### __init__ - KV Transfer Manager - -```python -def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.input_ids = self._make_buffer(self.max_num_tokens, dtype=torch.int32) - # each model stage has their own hidden size - self.hidden_size = self.model_config.hf_text_config.hidden_size - self.inputs_embeds = self._make_buffer(self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False) - # Initialize KV cache manager (preserve vllm_config fallback behavior) - self.kv_transfer_manager = OmniKVTransferManager.from_vllm_config(self.vllm_config, self.model_config) -``` - -### execute_model - KV Transfer Before Update States - -Location: At the very beginning of execute_model - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -# [Omni] Handle KV transfer BEFORE updating states (which removes finished requests) -self.kv_extracted_req_ids = self.kv_transfer_manager.handle_finished_requests_kv_transfer( - finished_reqs=getattr(scheduler_output, "finished_requests_needing_kv_transfer", {}), - kv_caches=self.kv_caches, - block_size=self.cache_config.block_size, - cache_dtype=str(self.cache_config.cache_dtype), - request_id_resolver=self._resolve_global_request_id, -) -# -------------------------------------- Omni-new ------------------------------------------------- -``` - -### execute_model - Custom _update_states Call - -Location: Inside synchronize_input_prep context - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -self._update_states(scheduler_output) -# 
------------------------------------------------------------------------------------------------ -``` - -### execute_model - Extract Multimodal Outputs - -Location: In post process section, after hidden_states assignment - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -hidden_states, multimodal_outputs = self.extract_multimodal_outputs(hidden_states) - -if multimodal_outputs is not None: - keys_or_type = ( - list(multimodal_outputs.keys()) - if isinstance(multimodal_outputs, dict) - else type(multimodal_outputs) - ) - logger.debug(f"[AR] execute_model: multimodal_outputs keys = {keys_or_type}") -else: - logger.debug("[AR] execute_model: multimodal_outputs is None") -# -------------------------------------- Omni-new ------------------------------------------------- -``` - -### execute_model - Compute Logits with sampling_metadata - -Location: In both broadcast_pp_output True and False branches - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -# Try with sampling_metadata first; fall back to without for models that don't support it -try: - logits = self.model.compute_logits( - sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata - ) -except TypeError: - logits = self.model.compute_logits(sample_hidden_states) -# -------------------------------------- Omni-new ------------------------------------------------- -``` - -### sample_tokens - KV Extracted Req IDs - -Location: At the beginning of sample_tokens - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -kv_extracted_req_ids = getattr(self, "kv_extracted_req_ids", None) -self.kv_extracted_req_ids = None -# -------------------------------------- Omni-new ------------------------------------------------- -``` - -### sample_tokens - Process Additional Information and Build Output - -Location: After 
bookkeeping sync, replacing the original output construction - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -hidden_states_cpu = hidden_states.detach().to("cpu").contiguous() -num_scheduled_tokens_np = getattr(self, "_omni_num_scheduled_tokens_np", None) -if num_scheduled_tokens_np is None: - req_ids = self.input_batch.req_ids - num_scheduled_tokens_np = np.array( - [scheduler_output.num_scheduled_tokens[rid] for rid in req_ids], - dtype=np.int32, - ) - -self._process_additional_information_updates( - hidden_states, multimodal_outputs, num_scheduled_tokens_np, scheduler_output -) - -pooler_output: list[dict[str, object]] = [] -for rid in req_ids_output_copy: - idx = req_id_to_index_output_copy[rid] - start = int(self.query_start_loc.cpu[idx]) - sched = int(num_scheduled_tokens_np[idx]) - end = start + sched - hidden_slice = hidden_states_cpu[start:end] - payload: dict[str, object] = {"hidden": hidden_slice} - if isinstance(multimodal_outputs, dict) and multimodal_outputs: - # ... multimodal output slicing logic ... 
- pooler_output.append(payload) - -model_runner_output = OmniModelRunnerOutput( - req_ids=req_ids_output_copy, - req_id_to_index=req_id_to_index_output_copy, - sampled_token_ids=valid_sampled_token_ids, - logprobs=logprobs_lists, - prompt_logprobs_dict=prompt_logprobs_dict, - pooler_output=(pooler_output if self.vllm_config.model_config.engine_output_type != "text" else None), - kv_connector_output=kv_connector_output, -) -model_runner_output.kv_extracted_req_ids = kv_extracted_req_ids -# -------------------------------------- Omni-new ------------------------------------------------- -``` - ---- - -## NPUGenerationModelRunner (npu_generation_model_runner.py) - -### execute_model - Async Chunk Update - -Location: Inside prepare input section, before synchronize_input_prep - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -if self.model_config.async_chunk and num_scheduled_tokens: - self._update_request_states(scheduler_output) -# -------------------------------------- Omni-new ------------------------------------------------- -``` - -### execute_model - Seq Token Counts - -Location: After _preprocess call - -```python -# [Omni] Pass token counts per request for code2wav output slicing -model_kwargs["seq_token_counts"] = tokens -``` - -### execute_model - Run Generation Model - -Location: Inside forward context - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -outputs = self._run_generation_model( - num_tokens_padded=num_tokens_padded, - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - model_kwargs=model_kwargs, - logits_indices=logits_indices, -) -_, multimodal_outputs = self.extract_multimodal_outputs(outputs) -# -------------------------------------- Omni-new ------------------------------------------------- -``` - -### sample_tokens - Multimodal Output Processing 
- -The entire sample_tokens method body is omni-specific for generation models: - -```python -# -------------------------------------- Omni-new ------------------------------------------------- -pooler_output: list[object] = [] -if isinstance(multimodal_outputs, torch.Tensor): - # ... tensor handling ... -elif isinstance(multimodal_outputs, list): - # ... list handling ... -elif isinstance(multimodal_outputs, dict): - # ... dict handling per request ... -else: - raise RuntimeError("Unsupported diffusion output type") -# [Omni] Copy req_id mappings to avoid async scheduling mutation. -req_ids_output_copy = self.input_batch.req_ids.copy() -req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy() -output = OmniModelRunnerOutput( - req_ids=req_ids_output_copy, - req_id_to_index=req_id_to_index_output_copy, - sampled_token_ids=[], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=pooler_output, - kv_connector_output=kv_connector_output, - num_nans_in_logits={}, - ec_connector_output=ec_connector_output if self.supports_mm_inputs else None, -) -# -------------------------------------- Omni-new ------------------------------------------------- -``` - -### _dummy_run - Model Kwargs Init and Multimodal Extract - -Location: Before model forward and after - -```python -model_kwargs = self._init_model_kwargs() # Before forward - -# ... forward ... 
- -# -------------------------------------- Omni-new ------------------------------------------------- -hidden_states, _ = self.extract_multimodal_outputs(hidden_states) -# ------------------------------------------------------------------------------------------------- -``` - ---- - -## ExecuteModelState Extension - -The `ExecuteModelState` NamedTuple is extended for omni: - -```python -class ExecuteModelState(NamedTuple): - """Ephemeral cached state transferred between execute_model() and - sample_tokens(), after execute_model() returns None.""" - - scheduler_output: SchedulerOutput - logits: torch.Tensor - spec_decode_metadata: SpecDecodeMetadata | None - spec_decode_common_attn_metadata: AscendCommonAttentionMetadata | None - hidden_states: torch.Tensor - sample_hidden_states: torch.Tensor - aux_hidden_states: list[torch.Tensor] | None - attn_metadata: PerLayerAttnMetadata - positions: torch.Tensor - ec_connector_output: ECConnectorOutput | None - cudagraph_stats: CUDAGraphStat | None - multimodal_outputs: Any # <-- Omni extension -``` - -This extended state must be imported from `npu_ar_model_runner` in `npu_generation_model_runner`. diff --git a/.claude/skills/vllm-omni-npu-upgrade/references/workflow-checklist.md b/.claude/skills/vllm-omni-npu-upgrade/references/workflow-checklist.md deleted file mode 100644 index 4f184df0ecb..00000000000 --- a/.claude/skills/vllm-omni-npu-upgrade/references/workflow-checklist.md +++ /dev/null @@ -1,222 +0,0 @@ -# NPU Model Runner Upgrade Workflow Checklist - -> **Note**: Reference documents (`omni-specific-blocks.md`) may not be complete. Always grep for `Omni-new` in GPU implementations to find all omni-specific code blocks. Update the reference docs when discovering new blocks. - -## Pre-Upgrade Preparation - -### 1. 
Version Information -- [ ] Identify current vllm-omni version: `_________` -- [ ] Identify target vllm-ascend version: `_________` -- [ ] Identify target vllm version: `_________` -- [ ] Last release date for GPU worker changes: `_________` - -### 2. Gather Git History -```bash -# GPU-side omni changes since last release -cd /root/vllm-workspace/vllm-omni -git log --oneline --since="YYYY-MM-DD" -- vllm_omni/worker/ - -# vllm-ascend NPUModelRunner changes -cd /root/vllm-workspace/vllm-ascend -git log --oneline .. -- vllm_ascend/worker/model_runner_v1.py -``` - -### 3. Backup Current Files -- [ ] Create backup of current NPU runners: - ```bash - cp -r vllm_omni/platforms/npu/worker vllm_omni/platforms/npu/worker.backup - ``` - ---- - -## OmniNPUModelRunner (npu_model_runner.py) - -### Read and Understand -- [ ] Read current `npu_model_runner.py` -- [ ] Read latest `vllm_ascend/worker/model_runner_v1.py` -- [ ] Read latest `vllm_omni/worker/gpu_model_runner.py` - -### Method: load_model -- [ ] Document existing omni-specific logic -- [ ] Copy latest NPUModelRunner.load_model structure -- [ ] Re-insert: `enable_sp(self.vllm_config)` call -- [ ] Re-insert: talker_mtp detection and setup -- [ ] Replace: `CUDAGraphWrapper` → `ACLGraphWrapper` -- [ ] Re-insert: Buffer allocations (talker_mtp_input_ids, etc.) 
- -### Method: _dummy_run -- [ ] Document existing omni-specific logic locations -- [ ] Copy latest NPUModelRunner._dummy_run -- [ ] Re-insert: talker_mtp dummy forward block (inside context) -- [ ] Re-insert: `extract_multimodal_outputs` call -- [ ] Verify: Comment markers are present - -### Method: _model_forward -- [ ] Copy latest NPUModelRunner._model_forward structure -- [ ] Re-insert: `_build_model_kwargs_extra()` call -- [ ] Re-insert: OmniOutput wrapping logic -- [ ] Re-insert: `_omni_last_model_output` caching -- [ ] Keep: NPU graph params update -- [ ] Keep: SP all-gather logic - -### Method: _talker_mtp_forward -- [ ] Verify: Uses `set_ascend_forward_context` -- [ ] Verify: Uses `ACLGraphWrapper` check -- [ ] Sync any changes from GPU `_talker_mtp_forward` - -### Imports -- [ ] Update vllm-ascend imports to latest paths -- [ ] Verify all omni imports are present -- [ ] Remove any deprecated imports - ---- - -## NPUARModelRunner (npu_ar_model_runner.py) - -### Read and Understand -- [ ] Read current `npu_ar_model_runner.py` -- [ ] Read latest `vllm_ascend/worker/model_runner_v1.py` execute_model -- [ ] Read latest `vllm_omni/worker/gpu_ar_model_runner.py` - -### Method: __init__ -- [ ] Sync any new initialization from GPU side -- [ ] Keep: `OmniKVTransferManager` setup -- [ ] Keep: Custom buffer allocations - -### Method: execute_model -- [ ] Document all omni blocks with line numbers -- [ ] Copy latest NPUModelRunner.execute_model structure -- [ ] Re-insert: KV transfer handling (beginning) -- [ ] Re-insert: Custom `_update_states` call -- [ ] Re-insert: `extract_multimodal_outputs` -- [ ] Re-insert: `compute_logits` with sampling_metadata try/except -- [ ] Update: ExecuteModelState to include multimodal_outputs - -### Method: sample_tokens -- [ ] Document all omni blocks -- [ ] Copy latest NPUModelRunner.sample_tokens structure -- [ ] Re-insert: `kv_extracted_req_ids` handling -- [ ] Re-insert: Hidden states CPU copy -- [ ] Re-insert: 
`_process_additional_information_updates` -- [ ] Re-insert: `OmniModelRunnerOutput` construction - -### ExecuteModelState -- [ ] Verify: `multimodal_outputs` field is present -- [ ] Verify: Imported/used correctly in execute_model - -### Imports -- [ ] Update all vllm-ascend imports -- [ ] Keep omni-specific imports - ---- - -## NPUGenerationModelRunner (npu_generation_model_runner.py) - -### Read and Understand -- [ ] Read current `npu_generation_model_runner.py` -- [ ] Read latest GPU `gpu_generation_model_runner.py` - -### Method: _update_request_states -- [ ] Verify: async_chunk handling is correct -- [ ] Sync any changes from GPU side - -### Method: execute_model -- [ ] Document all omni blocks -- [ ] Copy latest NPUModelRunner.execute_model base structure -- [ ] Re-insert: async_chunk update logic -- [ ] Re-insert: `seq_token_counts` injection -- [ ] Re-insert: `_run_generation_model` call -- [ ] Re-insert: `extract_multimodal_outputs` -- [ ] Use: ExecuteModelState from npu_ar_model_runner - -### Method: sample_tokens -- [ ] Keep: Entire omni multimodal output processing -- [ ] Update: Any new output fields needed -- [ ] Keep: `OmniModelRunnerOutput` construction - -### Method: _run_generation_model -- [ ] Sync any changes from GPU side -- [ ] Keep: `_model_forward` call with sampler - -### Method: _dummy_run -- [ ] Copy latest NPUModelRunner._dummy_run -- [ ] Re-insert: `model_kwargs = self._init_model_kwargs()` -- [ ] Re-insert: `extract_multimodal_outputs` at end - -### Imports -- [ ] Import ExecuteModelState from npu_ar_model_runner -- [ ] Update vllm-ascend imports - ---- - -## Post-Upgrade Validation - -### Syntax Validation -- [ ] `python -m py_compile vllm_omni/platforms/npu/worker/npu_model_runner.py` -- [ ] `python -m py_compile vllm_omni/platforms/npu/worker/npu_ar_model_runner.py` -- [ ] `python -m py_compile vllm_omni/platforms/npu/worker/npu_generation_model_runner.py` - -### Import Validation -- [ ] `python -c "from 
vllm_omni.platforms.npu.worker.npu_model_runner import OmniNPUModelRunner"` -- [ ] `python -c "from vllm_omni.platforms.npu.worker.npu_ar_model_runner import NPUARModelRunner"` -- [ ] `python -c "from vllm_omni.platforms.npu.worker.npu_generation_model_runner import NPUGenerationModelRunner"` - -### Comment Markers -- [ ] Grep for "Omni-new" in all three files -- [ ] Verify all omni blocks have closing markers - -### Code Review -- [ ] No `CUDAGraphWrapper` references -- [ ] All `set_forward_context` replaced with `set_ascend_forward_context` -- [ ] Parameter names correct (`aclgraph_runtime_mode` not `cudagraph_runtime_mode`) -- [ ] No duplicate code blocks -- [ ] No missing imports - ---- - -## Git Commit - -### Commit Message Template -``` -[NPU] Upgrade model runners to align with vllm-ascend vX.Y.Z - -- Update OmniNPUModelRunner with latest NPUModelRunner base -- Update NPUARModelRunner execute_model and sample_tokens -- Update NPUGenerationModelRunner for async_chunk changes -- Sync GPU-side omni changes from vX.Y.Z release -- Preserve all omni-specific logic (marked with Omni-new comments) - -Changes from vllm-ascend: -- - -Changes synced from GPU: -- -``` - -### Files to Stage -- [ ] `vllm_omni/platforms/npu/worker/npu_model_runner.py` -- [ ] `vllm_omni/platforms/npu/worker/npu_ar_model_runner.py` -- [ ] `vllm_omni/platforms/npu/worker/npu_generation_model_runner.py` -- [ ] Any other modified files - ---- - -## Troubleshooting - -### Import Errors -- Check if vllm-ascend module paths have changed -- Verify PYTHONPATH includes both vllm-ascend and vllm-omni - -### Type Errors -- Check method signatures match between GPU and NPU -- Verify NamedTuple fields match expected structure - -### Runtime Errors -- Enable debug logging: `export VLLM_LOGGING_LEVEL=DEBUG` -- Check graph capture issues: try `--enforce-eager` -- Check attention issues: verify AscendAttentionState usage - -### Performance Regression -- Compare with previous version on same model -- Check if 
graph capture is working: look for ACLGraph logs -- Verify SP/EP configurations are correct diff --git a/.github/scripts/pr_reviewer.py b/.github/scripts/pr_reviewer.py new file mode 100755 index 00000000000..da629a64587 --- /dev/null +++ b/.github/scripts/pr_reviewer.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python3 +""" +PR Reviewer using GLM API for vllm-omni project. +""" + +import json +import logging +import os +import sys +import time +from dataclasses import dataclass +from typing import Any, TypedDict + +import requests + + +# Type definitions for API responses +class PRDetails(TypedDict): + """Type definition for GitHub PR details response.""" + + title: str + body: str + number: int + state: str + user: dict[str, Any] + + +class GLMMessage(TypedDict): + """Type definition for GLM API message.""" + + role: str + content: str + + +class GLMChoice(TypedDict): + """Type definition for GLM API choice.""" + + message: GLMMessage + finish_reason: str + + +class GLMResponse(TypedDict): + """Type definition for GLM API response.""" + + choices: list[GLMChoice] + usage: dict[str, int] | None + + +class GitHubComment(TypedDict): + """Type definition for GitHub comment.""" + + id: int + body: str + created_at: str + user: dict[str, Any] + + +# Configuration +TRIGGER_PHRASE: str = "@vllm-omni-reviewer" +DEFAULT_GLM_API_URL: str = "https://open.bigmodel.cn/api/paas/v4/chat/completions" # noqa: E501 +DEFAULT_GLM_MODEL: str = "glm-5" +DEFAULT_COOLDOWN_MINUTES: int = 5 +DEFAULT_MAX_RETRIES: int = 3 +DEFAULT_RETRY_DELAY: float = 1.0 +MAX_DIFF_SIZE: int = 100_000 # Maximum diff size in characters + + +@dataclass +class Config: + """Configuration for the PR reviewer.""" + + glm_api_url: str + glm_model: str + cooldown_minutes: int + max_retries: int + retry_delay: float + max_diff_size: int + + +# Setup logging +logging.basicConfig( + level=logging.INFO, + format="[PR Reviewer] %(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) +logger: logging.Logger 
= logging.getLogger(__name__) + + +def get_config() -> Config: + """Load configuration from environment variables with defaults.""" + return Config( + glm_api_url=os.getenv("GLM_API_URL", DEFAULT_GLM_API_URL), + glm_model=os.getenv("GLM_MODEL", DEFAULT_GLM_MODEL), + cooldown_minutes=int( + os.getenv( + "PR_REVIEWER_COOLDOWN_MINUTES", + str(DEFAULT_COOLDOWN_MINUTES), + ) + ), + max_retries=int( + os.getenv( + "PR_REVIEWER_MAX_RETRIES", + str(DEFAULT_MAX_RETRIES), + ) + ), + retry_delay=float(os.getenv("PR_REVIEWER_RETRY_DELAY", str(DEFAULT_RETRY_DELAY))), + max_diff_size=int(os.getenv("PR_REVIEWER_MAX_DIFF_SIZE", str(MAX_DIFF_SIZE))), # noqa: E501 + ) + + +def get_env_var(name: str) -> str: + """ + Get an environment variable or raise an error. + + Args: + name: Name of the environment variable. + + Returns: + The value of the environment variable. + + Raises: + SystemExit: If the environment variable is not set. + """ + value = os.environ.get(name) + if not value: + logger.error(f"Environment variable {name} is not set") + sys.exit(1) + return value + + +def check_trigger(comment_body: str) -> bool: + """ + Check if the comment contains the trigger phrase. + + Args: + comment_body: The body of the comment to check. + + Returns: + True if the trigger phrase is found, False otherwise. + """ + return TRIGGER_PHRASE in comment_body + + +def fetch_pr_diff( + repo_name: str, + pr_number: int, + token: str, + max_size: int = MAX_DIFF_SIZE, +) -> str | None: + """ + Fetch the diff for a pull request. + + Args: + repo_name: The repository name in format "owner/repo". + pr_number: The pull request number. + token: GitHub authentication token. + max_size: Maximum diff size in characters. + + Returns: + The diff content as a string, or None if fetching failed. + Returns empty string if diff is larger than max_size. 
+ """ + url: str = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}" + headers: dict[str, str] = { + "Authorization": f"Bearer {token}", + "Accept": "application/vnd.github.v3.diff", + } + + logger.info(f"Fetching PR diff from {url}") + response = requests.get(url, headers=headers, timeout=30) + + if response.status_code == 200: + diff: str = response.text + if len(diff) > max_size: + logger.warning( + f"Diff size ({len(diff)} bytes) exceeds maximum " + f"({max_size} bytes), truncating to first " + f"{max_size} bytes" + ) + return diff[:max_size] + "\n\n... [Diff truncated due to size] ..." + logger.info(f"Successfully fetched diff ({len(diff)} bytes)") + return diff + else: + logger.error(f"Failed to fetch PR diff: {response.status_code}") + logger.error(f"Response: {response.text}") + return None + + +def fetch_pr_details( + repo_name: str, + pr_number: int, + token: str, +) -> PRDetails | None: + """ + Fetch PR details including title and description. + + Args: + repo_name: The repository name in format "owner/repo". + pr_number: The pull request number. + token: GitHub authentication token. + + Returns: + A dictionary containing PR details, or None if fetching failed. + """ + url: str = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}" + headers: dict[str, str] = { + "Authorization": f"Bearer {token}", + "Accept": "application/vnd.github.v3+json", + } + + logger.info(f"Fetching PR details from {url}") + response = requests.get(url, headers=headers, timeout=30) + + if response.status_code == 200: + return response.json() + else: + logger.error(f"Failed to fetch PR details: {response.status_code}") + return None + + +def build_review_prompt(pr_title: str, pr_description: str, diff: str) -> str: + """ + Build the prompt for the GLM-4.7 API. + + Args: + pr_title: The title of the pull request. + pr_description: The description/body of the pull request. + diff: The diff content of the pull request. 
+ + Returns: + The formatted prompt string for the API. + """ + return f"""You are an expert code reviewer for the VLLM-Omni project. \ +Please review the following pull request: + +## Pull Request Details +**Title:** {pr_title} + +**Description:** +{pr_description if pr_description else "No description provided."} + +## Code Changes (Diff) +{diff} + +## Review Guidelines + +Please provide a comprehensive code review with the following sections: + +### 1. Overview +- Brief summary of the changes +- Overall assessment (positive, neutral, or concerns) + +### 2. Code Quality +- Code style and consistency +- Potential bugs or edge cases +- Performance considerations +- Error handling + +### 3. Architecture & Design +- Integration with existing codebase +- Design patterns and best practices +- Potential improvements + +### 4. Security & Safety +- Security concerns (if any) +- Resource management +- Input validation + +### 5. Testing & Documentation +- Test coverage considerations +- Documentation completeness +- Examples and usage clarity + +### 6. Specific Suggestions +- Line-by-line specific feedback (use `file:line` format) +- Concrete actionable suggestions +- Code examples for improvements (if applicable) + +### 7. Approval Status +- **LGTM** (Looks Good To Me) if the PR is ready to merge +- **LGTM with suggestions** if the PR is good but has minor suggestions +- **Changes requested** if significant changes are needed + +## Important Notes +- Be constructive and helpful +- Focus on objective technical feedback +- Acknowledge good practices when you see them +- Prioritize critical issues over nitpicks +- If the diff is empty or minimal, acknowledge this and provide + any relevant context-specific guidance + +Please format your response in Markdown with clear section headers. +""" + + +def validate_glm_response(data: dict[str, Any]) -> str | None: + """ + Validate and extract content from GLM API response. + + Args: + data: The response data from GLM API. 
+ + Returns: + The review content string if valid, None otherwise. + """ + # Check if choices exists and is a non-empty list + if "choices" not in data: + logger.error("GLM API response missing 'choices' field") + logger.error(f"Response structure: {json.dumps(data, indent=2)}") + return None + + choices = data["choices"] + if not isinstance(choices, list): + logger.error(f"GLM API 'choices' is not a list: {type(choices)}") + return None + + if len(choices) == 0: + logger.error("GLM API 'choices' is an empty list") + return None + + # Check if first choice has message + try: + first_choice = choices[0] + if not isinstance(first_choice, dict): + logger.error(f"GLM API choice is not a dict: {type(first_choice)}") + return None + + if "message" not in first_choice: + logger.error("GLM API choice missing 'message' field") + logger.error(f"Choice structure: {json.dumps(first_choice, indent=2)}") # noqa: E501 + return None + + message = first_choice["message"] + if not isinstance(message, dict): + logger.error(f"GLM API message is not a dict: {type(message)}") + return None + + if "content" not in message: + logger.error("GLM API message missing 'content' field") + logger.error(f"Message structure: {json.dumps(message, indent=2)}") + return None + + content = message["content"] + if not isinstance(content, str): + logger.error(f"GLM API content is not a string: {type(content)}") + return None + + return content + + except (KeyError, IndexError, TypeError) as e: + logger.error(f"Failed to parse GLM API response: {e}") + logger.error(f"Response: {json.dumps(data, indent=2)}") + return None + + +def call_glm_api(prompt: str, api_key: str, config: Config) -> str | None: + """ + Call the GLM-4.7 API to get code review with retry logic. + + Args: + prompt: The prompt to send to the API. + api_key: The GLM API key. + config: Configuration object. + + Returns: + The review content as a string, or None if all retries failed. 
+ """ + headers: dict[str, str] = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + payload: dict[str, Any] = { + "model": config.glm_model, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.3, + "max_tokens": 32000, + "top_p": 0.9, + } + + last_error: str | None = None + + for attempt in range(config.max_retries): + try: + logger.info(f"Calling GLM API ({config.glm_model}) - Attempt {attempt + 1}/{config.max_retries}") + response = requests.post( + config.glm_api_url, + headers=headers, + json=payload, + timeout=120, + ) + + if response.status_code == 200: + data = response.json() + review = validate_glm_response(data) + if review: + logger.info(f"Successfully received review ({len(review)} chars)") # noqa: E501 + return review + else: + last_error = "Failed to validate API response structure" + logger.error(last_error) + else: + last_error = f"GLM API request failed: {response.status_code} - {response.text}" + logger.error(last_error) + + except requests.exceptions.Timeout: + last_error = f"GLM API request timed out (attempt {attempt + 1})" + logger.error(last_error) + except requests.exceptions.RequestException as e: + last_error = f"GLM API request exception: {e}" + logger.error(last_error) + except json.JSONDecodeError as e: + last_error = f"Failed to decode GLM API response as JSON: {e}" + logger.error(last_error) + + # Exponential backoff before retry + if attempt < config.max_retries - 1: + wait_time: float = config.retry_delay * (2**attempt) + logger.info(f"Waiting {wait_time}s before retry...") # noqa: E501 + time.sleep(wait_time) + + logger.error( + f"All {config.max_retries} attempts failed. Last error: {last_error}" # noqa: E501 + ) + return None + + +def check_cooldown( # noqa: E501 + repo_name: str, + pr_number: int, + token: str, + cooldown_minutes: int, +) -> bool: + """ + Check if the PR is within the cooldown period. + + Args: + repo_name: The repository name in format "owner/repo". 
+ pr_number: The pull request number. + token: GitHub authentication token. + cooldown_minutes: Cooldown period in minutes. + + Returns: + True if within cooldown period (should skip), False otherwise. + """ + from datetime import datetime, timedelta + + url: str = ( + f"https://api.github.com/repos/{repo_name}/issues/" + f"{pr_number}/comments" # noqa: E501 + ) + headers: dict[str, str] = { + "Authorization": f"Bearer {token}", + "Accept": "application/vnd.github.v3+json", + } + + logger.info(f"Checking cooldown period ({cooldown_minutes} minutes)") + response = requests.get(url, headers=headers, timeout=30) + + if response.status_code != 200: + logger.warning(f"Failed to check cooldown: {response.status_code}, proceeding with review") + return False + + comments: list[dict[str, Any]] = response.json() + cutoff_time: datetime = datetime.utcnow() - timedelta(minutes=cooldown_minutes) # noqa: E501 + + for comment in reversed(comments): + # Check if this is a bot comment + body: str = comment.get("body", "") + if "VLLM-Omni PR Review" in body or "PR Reviewer Bot" in body: + created_at_str: str = comment.get("created_at", "") + try: + # Parse GitHub timestamp format + created_at = datetime.fromisoformat(created_at_str.replace("Z", "+00:00")) + created_at = created_at.replace(tzinfo=None) + if created_at > cutoff_time: + logger.info(f"PR is within cooldown period (last review: {created_at_str})") + return True + except ValueError: + logger.warning(f"Failed to parse comment timestamp: {created_at_str}") # noqa: E501 + continue + + logger.info("PR is outside cooldown period, proceeding with review") + return False + + +def post_review_comment( # noqa: E501 + repo_name: str, + pr_number: int, + token: str, + review: str, +) -> bool: + """ + Post the review as a comment on the PR. + + Args: + repo_name: The repository name in format "owner/repo". + pr_number: The pull request number. + token: GitHub authentication token. + review: The review content to post. 
+ + Returns: + True if posting succeeded, False otherwise. + """ + url: str = ( + f"https://api.github.com/repos/{repo_name}/issues/" + f"{pr_number}/comments" # noqa: E501 + ) + headers: dict[str, str] = { + "Authorization": f"Bearer {token}", + "Accept": "application/vnd.github.v3+json", + } + + # Format the review comment + comment_body: str = f"""## 🤖 VLLM-Omni PR Review + +{review} + +--- +*This review was generated automatically by the VLLM-Omni PR Reviewer Bot +using {os.getenv("GLM_MODEL", DEFAULT_GLM_MODEL)}.* +""" + + payload: dict[str, str] = {"body": comment_body} + + logger.info(f"Posting review comment to PR #{pr_number}") + response = requests.post(url, headers=headers, json=payload, timeout=30) + + if response.status_code == 201: + logger.info("Successfully posted review comment") + return True + else: + logger.error(f"Failed to post comment: {response.status_code}") + logger.error(f"Response: {response.text}") + return False + + +def main() -> int: + """ + Main entry point for the PR reviewer bot. + + Returns: + 0 on success, 1 on error. 
+ """ + logger.info("VLLM-Omni PR Reviewer Bot starting...") + + # Load configuration + config: Config = get_config() + logger.info( + f"Configuration: model={config.glm_model}, " + f"cooldown={config.cooldown_minutes}min, " + f"max_retries={config.max_retries}" + ) + + # Get environment variables + token: str = get_env_var("GITHUB_TOKEN") + api_key: str = get_env_var("GLM_API_KEY") + repo_name: str = get_env_var("REPO_NAME") + pr_number_str: str = get_env_var("PR_NUMBER") + comment_body: str = get_env_var("COMMENT_BODY") + + try: + pr_number: int = int(pr_number_str) + except ValueError: + logger.error(f"Invalid PR number: {pr_number_str}") + return 1 + + logger.info(f"Repository: {repo_name}") + logger.info(f"PR Number: {pr_number}") + + # Check if the comment contains the trigger phrase + if not check_trigger(comment_body): + logger.info( + f"Comment does not contain trigger phrase '{TRIGGER_PHRASE}', exiting" # noqa: E501 + ) + return 0 + + logger.info("Trigger phrase detected! Starting review process...") + + # Check cooldown period + if check_cooldown(repo_name, pr_number, token, config.cooldown_minutes): + logger.info("Skipping review due to cooldown period") + return 0 + + # Fetch PR details + logger.info("Step 1/4: Fetching PR details...") + pr_details: PRDetails | None = fetch_pr_details(repo_name, pr_number, token) # noqa: E501 + if not pr_details: + logger.error("Failed to fetch PR details") + return 1 + + pr_title: str = pr_details.get("title", "Unknown") + pr_description: str = pr_details.get("body", "") + + logger.info(f"PR Title: {pr_title}") + + # Fetch PR diff + logger.info("Step 2/4: Fetching PR diff...") + diff: str | None = fetch_pr_diff(repo_name, pr_number, token, config.max_diff_size) + if diff is None: + logger.error("Failed to fetch PR diff") + return 1 + + if not diff: + logger.warning("Warning: Empty diff - this might be a draft PR or no code changes") + + # Build prompt + logger.info("Step 3/4: Building review prompt...") + prompt: str 
= build_review_prompt(pr_title, pr_description, diff) + + # Call GLM API + logger.info("Step 4/4: Calling GLM API...") + review: str | None = call_glm_api(prompt, api_key, config) + if not review: + logger.error("Failed to get review from GLM API") + return 1 + + # Post review comment + logger.info("Posting review comment...") + if not post_review_comment(repo_name, pr_number, token, review): + logger.error("Failed to post review comment") + return 1 + + logger.info("PR review completed successfully!") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/workflows/pr-reviewer.yml b/.github/workflows/pr-reviewer.yml new file mode 100644 index 00000000000..8a41c69375e --- /dev/null +++ b/.github/workflows/pr-reviewer.yml @@ -0,0 +1,62 @@ +name: VLLM-Omni PR Reviewer + +on: + issue_comment: + types: [created] + +permissions: + contents: read + pull-requests: write + issues: write + +jobs: + pr-reviewer: + name: Review Pull Request + runs-on: ubuntu-latest + timeout-minutes: 10 + # Only run when the comment is from a collaborator/owner/member + if: | + github.event_name == 'issue_comment' && + github.event.issue.pull_request != null && + contains(github.event.comment.body, '@vllm-omni-reviewer') && + (github.event.comment.author_association == 'MEMBER' || + github.event.comment.author_association == 'COLLABORATOR' || + github.event.comment.author_association == 'OWNER') + steps: + - name: Checkout repository + uses: actions/checkout@v6.0.2 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v6.2.0 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip==24.0 + pip install requests==2.31.0 pyyaml==6.0.1 + + - name: Run PR Reviewer + id: reviewer + env: + GLM_API_KEY: ${{ secrets.GLM_API_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number }} + COMMENT_BODY: ${{ 
github.event.comment.body }} + REPO_NAME: ${{ github.repository }} + PR_REVIEWER_COOLDOWN_MINUTES: 5 + PR_REVIEWER_MAX_RETRIES: 3 + run: | + python .github/scripts/pr_reviewer.py 2>&1 | tee "/tmp/pr_review_${PR_NUMBER}.log" + + - name: Upload review logs + if: always() + uses: actions/upload-artifact@v7.0.0 + with: + name: pr-review-logs-${{ github.event.issue.number || github.event.pull_request.number }} + path: /tmp/pr_review_*.log + retention-days: 7 + if-no-files-found: ignore diff --git a/.gitignore b/.gitignore index a7cd8f74eb4..28d56e0f6f0 100644 --- a/.gitignore +++ b/.gitignore @@ -83,9 +83,6 @@ target/ profile_default/ ipython_config.py -# uv -uv.lock - # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: @@ -158,23 +155,10 @@ cython_debug/ # Claude CLAUDE.md -/.claude/* -!.claude/skills/ -!.claude/skills/readme.md -!.claude/skills/add-diffusion-model/ -!.claude/skills/add-diffusion-model/SKILL.md -!.claude/skills/add-diffusion-model/references/ -!.claude/skills/add-diffusion-model/references/*.md -!.claude/skills/add-tts-model/ -!.claude/skills/add-tts-model/SKILL.md -!.claude/skills/review-pr/ -!.claude/skills/review-pr/SKILL.md -!.claude/skills/review-pr/references/ -!.claude/skills/review-pr/references/*.md +.claude/ # Codex AGENTS.md -.codex .codex/ # cursor @@ -204,7 +188,6 @@ checkpoints/ # Cache directories cache/ !vllm_omni/diffusion/cache/ -!tests/diffusion/cache/ .cache/ diffusion_cache/ kv_cache/ @@ -264,6 +247,3 @@ tmp_test vllm_omni/_version.py # output files *.wav -.worktrees/ -# CI overlay yamls materialized from tests/utils.py:_CI_OVERLAYS at test time -tests/.ci_generated/ diff --git a/benchmarks/accuracy/README.md b/benchmarks/accuracy/README.md index dbe20916a77..0d73215b692 100644 --- a/benchmarks/accuracy/README.md +++ b/benchmarks/accuracy/README.md @@ -23,5 +23,5 @@ Test guidance: - Local static/self-checks live in 
`tests/benchmarks/test_accuracy_bench_utils.py`. - End-to-end generation/evaluation should be validated in a remote GPU environment. In the current repo marker system there is `L4` but no `L5` - marker, so benchmark smoke tests should be wired as `full_model + - benchmark + L4` for nightly when GPU capacity is available. + marker, so benchmark smoke tests should be wired as `advanced_model + + benchmark + L4` when GPU capacity is available. diff --git a/benchmarks/accuracy/image_to_image/README.md b/benchmarks/accuracy/image_to_image/README.md index 86e7b0cf328..ee1d58f108b 100644 --- a/benchmarks/accuracy/image_to_image/README.md +++ b/benchmarks/accuracy/image_to_image/README.md @@ -99,5 +99,5 @@ Notes: - This flow requires the optional Hugging Face `datasets` package. - `generate` writes `generation_manifest.json` with local output coverage. - The current repo marker set exposes `L4` but not `L5`, so if you promote an - end-to-end smoke test into CI, use the `full_model`, `benchmark`, - and `L4` markers for nightly (or `advanced_model` for merge) or introduce a new repo-wide marker explicitly first. + end-to-end smoke test into CI, use the existing `advanced_model`, `benchmark`, + and `L4` markers or introduce a new repo-wide marker explicitly first. diff --git a/benchmarks/build_dataset/download_process_data_seedtts.md b/benchmarks/build_dataset/download_process_data_seedtts.md index faf072303b8..ec16f64424a 100644 --- a/benchmarks/build_dataset/download_process_data_seedtts.md +++ b/benchmarks/build_dataset/download_process_data_seedtts.md @@ -27,7 +27,7 @@ pip install gdown Download the dataset from Google Drive: ```bash -gdown 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP +gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP ``` ### 4. 
Extract the Dataset @@ -74,7 +74,7 @@ rm meta.lst # Full setup and benchmark cd benchmarks/build_dataset pip install gdown -gdown 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP +gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP tar -xf seedtts_testset.tar cp seedtts_testset/en/meta.lst meta.lst python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100 diff --git a/benchmarks/build_dataset/seed_tts_design/en/meta.lst b/benchmarks/build_dataset/seed_tts_design/en/meta.lst deleted file mode 100644 index 7e364c2e517..00000000000 --- a/benchmarks/build_dataset/seed_tts_design/en/meta.lst +++ /dev/null @@ -1,20 +0,0 @@ -vd001|||The quick brown fox jumps over the lazy dog.|A warm, friendly female voice with a slight American Midwest accent, speaking at a moderate pace with natural inflection. -vd002|||Welcome to the future of text-to-speech synthesis.|A deep, authoritative male news anchor voice, clear and professional with a measured cadence. -vd003|||The sunset painted the sky in brilliant shades of orange and pink.|A gentle elderly female voice, soft and wise, with a slight Southern American accent. -vd004|||Scientists have discovered a new species of deep-sea creature.|A young male voice with an Australian accent, curious and enthusiastic. -vd005|||Breaking news: a major climate summit opens today in Geneva.|A crisp female newsreader voice, neutral accent, confident and precise. -vd006|||In the beginning, there was darkness and silence across the universe.|A rich, dramatic bass male narrator voice, slow and deeply resonant. -vd007|||Come closer, I have something important to tell you.|A soft, intimate female voice, slightly whispery, warm and gentle. -vd008|||And they're off! The horses race toward the first turn at incredible speed.|An energetic male sports commentator, fast-paced and excited. -vd009|||Once upon a time, in a land far away, lived a very clever fox.|A light, playful voice with childlike enthusiasm, bright and clear. 
-vd010|||The ancient manuscript reveals secrets hidden for a thousand years.|A wise, measured elderly male voice, slow and deliberate, British English accent. -vd011|||Good evening, ladies and gentlemen, and welcome to our show.|A sophisticated female voice with a slight French accent speaking English, elegant and refined. -vd012|||System initialized. Running diagnostics. All systems nominal.|A clear, precise robotic-sounding voice, neutral and monotone with slight synthetic quality. -vd013|||I hear what you are saying, and it is completely understandable to feel that way.|A warm, empathetic female therapist voice, calm and reassuring, unhurried pace. -vd014|||Attention all units: proceed to grid reference seven-seven-alpha.|A firm, authoritative military male voice, clipped and commanding. -vd015|||Oh my goodness, you have to try this amazing new recipe I just found!|An enthusiastic, bubbly female voice, high energy and friendly. -vd016|||Dude, the waves were totally amazing out there today. Super happy about it!|A relaxed male voice with a California accent, casual and laid-back. -vd017|||The quarterly results exceed expectations across all major metrics.|A sharp, businesslike female voice, confident and efficient, fast-paced delivery. -vd018|||Chapter one. The morning sun filtered gently through the forest canopy.|A smooth, rich male audiobook narrator voice, expressive and engaging. -vd019|||To be or not to be, that is the question.|A theatrical female voice, dramatic and expressive, stage projection quality. -vd020|||And that is all for tonight. Stay well out there, everyone.|A warm, velvety male late-night radio DJ voice, smooth and intimate. 
diff --git a/benchmarks/build_dataset/seed_tts_smoke/en/meta.lst b/benchmarks/build_dataset/seed_tts_smoke/en/meta.lst deleted file mode 100644 index afe4bc8abcd..00000000000 --- a/benchmarks/build_dataset/seed_tts_smoke/en/meta.lst +++ /dev/null @@ -1,20 +0,0 @@ -smoke001|||The quick brown fox jumps over the lazy dog near the riverbank at sunset. -smoke002|||Welcome to the future of text-to-speech synthesis in production systems. -smoke003|||Yesterday the team finished rolling out the new authentication flow. -smoke004|||She walked carefully across the wet cobblestones, careful not to slip. -smoke005|||The conference call is scheduled for nine in the morning, Pacific time. -smoke006|||Please remember to save your work before closing the editor. -smoke007|||Two plus two equals four, but five hundred and forty three digits is long. -smoke008|||I would like a coffee with oat milk and a chocolate croissant please. -smoke009|||The library closes at eight on weekdays and six on Saturdays. -smoke010|||During the Renaissance, art and science flourished in European cities. -smoke011|||He whispered the secret word so quietly that no one else could hear. -smoke012|||Our flight departs from gate twenty three at eleven fifteen. -smoke013|||The storm knocked out power for six hours, but the backup generator kicked in. -smoke014|||Reading a good book on a rainy afternoon is one of life's great pleasures. -smoke015|||When the kettle whistled, she poured the hot water over the fresh tea leaves. -smoke016|||The algorithm runs in linear time, which is a big improvement over the previous approach. -smoke017|||In the distance, the mountains were shrouded in thick morning fog. -smoke018|||Our company reported record revenue for the fourth quarter of the fiscal year. -smoke019|||She explained the new policy in detail during the staff meeting this morning. -smoke020|||The children laughed and played in the garden until the sun began to set. 
diff --git a/benchmarks/diffusion/backends.py b/benchmarks/diffusion/backends.py index d33160f1377..fa53f87aed7 100644 --- a/benchmarks/diffusion/backends.py +++ b/benchmarks/diffusion/backends.py @@ -122,18 +122,6 @@ async def async_request_chat_completions( output.peak_memory_mb = first_item.get("peak_memory_mb", 0.0) except (IndexError, TypeError, AttributeError): pass - - if (not output.stage_durations or output.peak_memory_mb == 0.0) and isinstance( - resp_json.get("metrics"), dict - ): - m = resp_json["metrics"] - if not output.stage_durations and isinstance(m.get("stage_durations"), dict): - output.stage_durations = m.get("stage_durations") or {} - if output.peak_memory_mb == 0.0 and m.get("peak_memory_mb") is not None: - try: - output.peak_memory_mb = float(m.get("peak_memory_mb") or 0.0) - except (TypeError, ValueError): - pass else: output.error = f"HTTP {response.status}: {await response.text()}" output.success = False @@ -318,8 +306,6 @@ async def async_request_v1_videos( video_bytes = await content_response.read() output.response_body = video_bytes output.success = True - if "stage_durations" in poll_json: - output.stage_durations = poll_json["stage_durations"] or {} if "peak_memory_mb" in poll_json: output.peak_memory_mb = poll_json["peak_memory_mb"] elif "peak_memory_mb" in resp_json: diff --git a/benchmarks/diffusion/diffusion_benchmark_serving.py b/benchmarks/diffusion/diffusion_benchmark_serving.py index 77b36b3d9c0..aad955b0d1d 100644 --- a/benchmarks/diffusion/diffusion_benchmark_serving.py +++ b/benchmarks/diffusion/diffusion_benchmark_serving.py @@ -12,15 +12,15 @@ - v1/videos: Use /v1/videos endpoint Usage: - # Video (v1/videos backend) + # Video (vllm-omni backend) t2v: python3 benchmarks/diffusion/diffusion_benchmark_serving.py \ - --backend v1/videos --dataset vbench --task t2v --num-prompts 10 \ + --backend vllm-omni --dataset vbench --task t2v --num-prompts 10 \ --height 480 --width 640 --fps 16 --num-frames 80 i2v: python3 
benchmarks/diffusion/diffusion_benchmark_serving.py \ - --backend v1/videos --dataset vbench --task i2v --num-prompts 10 + --backend vllm-omni --dataset vbench --task i2v --num-prompts 10 # Image (vllm-omni backend) @@ -49,7 +49,7 @@ --backend openai --dataset vbench --task t2i --num-prompts 10 \ --height 1024 --width 1024 --port 3000 - # Video (v1/videos) + # Video (v1/videos) t2v: python3 benchmarks/diffusion/diffusion_benchmark_serving.py \ --backend v1/videos --dataset random --task t2v --num-prompts 1 \ @@ -558,7 +558,6 @@ def __init__(self, args, api_url: str, model: str, enable_negative_prompt: bool super().__init__(args, api_url, model) self.num_prompts = args.num_prompts self.enable_negative_prompt = enable_negative_prompt - self.num_input_images = max(1, args.num_input_images) self.random_request_config = getattr(args, "random_request_config", None) if self.random_request_config: self.random_request_config = json.loads(self.random_request_config) @@ -581,7 +580,11 @@ def __init__(self, args, api_url: str, model: str, enable_negative_prompt: bool # Random image generate if self.args.task in ["i2v", "ti2v", "ti2i", "i2i"]: - self._random_image_path = self._generate_random_image_paths() + img = Image.new("RGB", (512, 512), (255, 255, 255)) + + image_path = os.path.join(tempfile.gettempdir(), "diffusion_benchmark_random_image.png") + self._random_image_path = [image_path] + img.save(image_path) else: self._random_image_path = None @@ -616,18 +619,6 @@ def __getitem__(self, idx: int) -> RequestFuncInput: def get_requests(self) -> list[RequestFuncInput]: return [self[i] for i in range(len(self))] - def _generate_random_image_paths(self) -> list[str]: - image_paths: list[str] = [] - for image_idx in range(self.num_input_images): - img = Image.new("RGB", (512, 512), (255, 255, 255)) - image_path = os.path.join( - tempfile.gettempdir(), - f"diffusion_benchmark_random_image_{image_idx}.png", - ) - img.save(image_path) - image_paths.append(image_path) - return
image_paths - def _compute_expected_latency_ms_from_base(req: RequestFuncInput, args, base_time_ms: float | None) -> float | None: """Compute expected execution time (ms) based on a base per-step-per-frame unit time. @@ -1124,15 +1115,6 @@ async def limited_request_func(req, session, pbar): '{"width":768,"height":768,"num_inference_steps":20,"weight":0.85}]' ), ) - parser.add_argument( - "--num-input-images", - type=int, - default=1, - help=( - "Number of synthetic input images to attach for image-conditioned tasks " - "(i2v, ti2v, ti2i, i2i) when using random dataset." - ), - ) args = parser.parse_args() diff --git a/benchmarks/fish-speech/bench_voice_cache.py b/benchmarks/fish-speech/bench_voice_cache.py deleted file mode 100644 index 8d465d6489f..00000000000 --- a/benchmarks/fish-speech/bench_voice_cache.py +++ /dev/null @@ -1,290 +0,0 @@ -"""Benchmark Fish Speech voice cache: inline ref_audio vs uploaded voice. - -Measures TTFP improvement from DAC-code caching when using uploaded voices. - -Setup: - 1. Start vllm-omni with Fish Speech S2 Pro (use our feat branch) - 2. Provide a reference audio file for voice cloning - -Usage: - python bench_voice_cache.py \ - --ref-audio /path/to/reference.wav \ - --ref-text "Transcript of the reference audio." 
\ - --num-prompts 20 \ - --port 8091 - -The script runs two rounds: - A) Inline ref_audio: every request sends base64 audio (no cache) - B) Uploaded voice: upload once, then use voice name (cache hits after 1st) -""" - -import argparse -import asyncio -import base64 -import json -import os -import sys -import time -from pathlib import Path - -import aiohttp - -# Allow imports from benchmarks/fish-speech/ -sys.path.insert(0, str(Path(__file__).resolve().parent)) - -from fish_bench_utils import ( # noqa: E402 - BenchmarkResult, - RequestResult, - compute_stats, - print_benchmark_results, - send_streaming_request, -) - -SAMPLE_RATE = 44100 -SAMPLE_WIDTH = 2 - -PROMPTS = [ - "Hello, welcome to the voice synthesis benchmark test.", - "She said she would be here by noon, but nobody showed up.", - "The quick brown fox jumps over the lazy dog near the riverbank.", - "I can't believe how beautiful the sunset looks from up here.", - "Please remember to bring your identification documents tomorrow morning.", - "Have you ever wondered what it would be like to travel through time?", - "The restaurant on the corner serves the best pasta I have ever tasted.", - "After the meeting, we should discuss the quarterly results.", - "Learning a new language takes patience and genuine curiosity.", - "The train leaves at half past seven, so we need to arrive early.", - "Could you please turn down the music, I'm trying to concentrate.", - "It was a dark and stormy night when the keeper heard a knock.", -] - - -def encode_audio_to_base64(audio_path: str) -> str: - """Encode a local audio file to base64 data URL.""" - ext = audio_path.lower().rsplit(".", 1)[-1] - mime_map = {"wav": "audio/wav", "mp3": "audio/mpeg", "flac": "audio/flac"} - mime_type = mime_map.get(ext, "audio/wav") - with open(audio_path, "rb") as f: - audio_b64 = base64.b64encode(f.read()).decode("utf-8") - return f"data:{mime_type};base64,{audio_b64}" - - -async def upload_voice( - host: str, - port: int, - audio_path: str, 
- ref_text: str, - voice_name: str = "bench_voice", -) -> dict: - """Upload a voice via POST /v1/audio/voices.""" - url = f"http://{host}:{port}/v1/audio/voices" - data = aiohttp.FormData() - data.add_field("name", voice_name) - data.add_field("consent", "true") - if ref_text: - data.add_field("ref_text", ref_text) - data.add_field( - "audio_sample", - open(audio_path, "rb"), - filename=os.path.basename(audio_path), - content_type="audio/wav", - ) - - async with aiohttp.ClientSession() as session: - async with session.post(url, data=data) as resp: - result = await resp.json() - print(f" Upload response ({resp.status}): {json.dumps(result, indent=2)}") - return result - - -async def delete_voice(host: str, port: int, voice_name: str) -> None: - """Delete an uploaded voice.""" - url = f"http://{host}:{port}/v1/audio/voices/{voice_name}" - async with aiohttp.ClientSession() as session: - async with session.delete(url) as resp: - if resp.status == 200: - print(f" Deleted voice '{voice_name}'") - - -async def run_round( - host: str, - port: int, - num_prompts: int, - create_payload_fn, - label: str, - num_warmups: int = 2, - timeout_s: float = 120.0, -) -> BenchmarkResult: - """Run one benchmark round and return results.""" - api_url = f"http://{host}:{port}/v1/audio/speech" - connector = aiohttp.TCPConnector(limit=1, limit_per_host=1) - session = aiohttp.ClientSession( - connector=connector, - timeout=aiohttp.ClientTimeout(total=timeout_s), - ) - - try: - # Warmup. - if num_warmups > 0: - print(f" [{label}] Warming up ({num_warmups} requests)...") - for i in range(num_warmups): - payload = create_payload_fn(PROMPTS[i % len(PROMPTS)]) - r = await send_streaming_request( - session, - api_url, - payload, - SAMPLE_RATE, - SAMPLE_WIDTH, - ) - status = "OK" if r.success else f"FAIL: {r.error[:80]}" - print(f" warmup {i + 1}: ttfp={r.ttfp * 1000:.0f}ms {status}") - - # Benchmark. 
- print(f" [{label}] Running {num_prompts} requests (concurrency=1)...") - results: list[RequestResult] = [] - start = time.perf_counter() - for i in range(num_prompts): - prompt = PROMPTS[i % len(PROMPTS)] - payload = create_payload_fn(prompt) - r = await send_streaming_request( - session, - api_url, - payload, - SAMPLE_RATE, - SAMPLE_WIDTH, - ) - results.append(r) - tag = "HIT" if i > 0 and label == "uploaded_voice" else "" - print( - f" req {i + 1:3d}: ttfp={r.ttfp * 1000:7.1f}ms " - f"e2e={r.e2e * 1000:7.1f}ms " - f"{'OK' if r.success else 'FAIL'} {tag}" - ) - wall_time = time.perf_counter() - start - finally: - await session.close() - - bench = compute_stats(results, wall_time) - bench.concurrency = 1 - bench.num_prompts = num_prompts - bench.config_name = label - return bench - - -async def main(): - parser = argparse.ArgumentParser( - description="Benchmark Fish Speech voice cache (inline vs uploaded)", - ) - parser.add_argument("--host", default="127.0.0.1") - parser.add_argument("--port", type=int, default=8091) - parser.add_argument("--ref-audio", required=True, help="Path to reference audio file") - parser.add_argument("--ref-text", required=True, help="Transcript of reference audio") - parser.add_argument("--num-prompts", type=int, default=20) - parser.add_argument("--num-warmups", type=int, default=2) - parser.add_argument("--voice-name", default="bench_voice") - args = parser.parse_args() - - if not os.path.exists(args.ref_audio): - print(f"Error: ref_audio not found: {args.ref_audio}") - sys.exit(1) - - ref_audio_b64 = encode_audio_to_base64(args.ref_audio) - print(f"Reference audio: {args.ref_audio} ({len(ref_audio_b64) // 1024}KB base64)") - - # ---- Round A: Inline ref_audio (no cache) ---- - print(f"\n{'=' * 60}") - print("Round A: INLINE ref_audio (every request sends full audio)") - print(f"{'=' * 60}") - - def make_inline_payload(prompt: str) -> dict: - return { - "input": prompt, - "voice": "default", - "stream": True, - "response_format": 
"pcm", - "ref_audio": ref_audio_b64, - "ref_text": args.ref_text, - "max_new_tokens": 2048, - } - - bench_inline = await run_round( - args.host, - args.port, - args.num_prompts, - make_inline_payload, - "inline_ref_audio", - num_warmups=args.num_warmups, - ) - print_benchmark_results(bench_inline) - - # ---- Upload voice ---- - print(f"\n{'=' * 60}") - print("Uploading voice for cache test...") - print(f"{'=' * 60}") - await delete_voice(args.host, args.port, args.voice_name) - await upload_voice( - args.host, - args.port, - args.ref_audio, - args.ref_text, - args.voice_name, - ) - - # ---- Round B: Uploaded voice (cache hits after 1st request) ---- - print(f"\n{'=' * 60}") - print("Round B: UPLOADED VOICE (cache hits after 1st request)") - print(f"{'=' * 60}") - - def make_uploaded_payload(prompt: str) -> dict: - return { - "input": prompt, - "voice": args.voice_name, - "stream": True, - "response_format": "pcm", - "ref_text": args.ref_text, - "max_new_tokens": 2048, - } - - bench_cached = await run_round( - args.host, - args.port, - args.num_prompts, - make_uploaded_payload, - "uploaded_voice", - num_warmups=args.num_warmups, - ) - print_benchmark_results(bench_cached) - - # ---- Comparison ---- - print(f"\n{'=' * 60}") - print("COMPARISON: Inline ref_audio vs Uploaded voice (cached)") - print(f"{'=' * 60}") - print(f"{'Metric':<30} {'Inline':>12} {'Cached':>12} {'Speedup':>10}") - print(f"{'-' * 64}") - - def fmt_speedup(inline_val: float, cached_val: float) -> str: - if cached_val > 0 and inline_val > 0: - ratio = inline_val / cached_val - return f"{ratio:.2f}x" - return "N/A" - - rows = [ - ("Mean TTFP (ms)", bench_inline.mean_ttfp_ms, bench_cached.mean_ttfp_ms), - ("Median TTFP (ms)", bench_inline.median_ttfp_ms, bench_cached.median_ttfp_ms), - ("P99 TTFP (ms)", bench_inline.p99_ttfp_ms, bench_cached.p99_ttfp_ms), - ("Mean E2E (ms)", bench_inline.mean_e2e_ms, bench_cached.mean_e2e_ms), - ("Median E2E (ms)", bench_inline.median_e2e_ms, 
bench_cached.median_e2e_ms), - ("Mean RTF", bench_inline.mean_rtf, bench_cached.mean_rtf), - ] - for label, a, b in rows: - print(f"{label:<30} {a:>12.1f} {b:>12.1f} {fmt_speedup(a, b):>10}") - - print("\nNote: Round B request #1 is a cache MISS (cold start).") - print(" Requests #2+ are cache HITs (skip DAC encoding).") - - # Cleanup. - await delete_voice(args.host, args.port, args.voice_name) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/benchmarks/fish-speech/fish_bench_utils.py b/benchmarks/fish-speech/fish_bench_utils.py deleted file mode 100644 index cc84c4037fe..00000000000 --- a/benchmarks/fish-speech/fish_bench_utils.py +++ /dev/null @@ -1,501 +0,0 @@ -"""Shared benchmark infrastructure for Fish Speech serving benchmarks. - -Provides common dataclasses, metrics computation, streaming HTTP client, -and result formatting used by model-specific benchmark scripts. - -Model-specific scripts supply a ``create_payload_fn(prompt) -> dict`` -callback and audio parameters; everything else is handled here. 
-""" - -import asyncio -import base64 -import json -import time -from collections.abc import Callable -from dataclasses import asdict, dataclass, field -from datetime import datetime -from pathlib import Path - -import aiohttp -import numpy as np -from tqdm.asyncio import tqdm - -# --------------------------------------------------------------------------- -# Shared test prompts (varying length for realistic workload) -# --------------------------------------------------------------------------- -PROMPTS = [ - "Hello, welcome to the voice synthesis benchmark test.", - "She said she would be here by noon, but nobody showed up.", - "The quick brown fox jumps over the lazy dog near the riverbank.", - "I can't believe how beautiful the sunset looks from up here on the mountain.", - "Please remember to bring your identification documents to the appointment tomorrow morning.", - "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", - "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", - "After the meeting, we should discuss the quarterly results and plan for the next phase.", - "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", - "The train leaves at half past seven, so we need to arrive at the station before then.", - "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", - "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", -] - - -# --------------------------------------------------------------------------- -# Dataclasses -# --------------------------------------------------------------------------- -@dataclass -class RequestResult: - success: bool = False - ttfp: float = 0.0 # Time to first audio packet (seconds) - e2e: float = 0.0 # End-to-end latency (seconds) - audio_bytes: int = 0 # Total audio bytes received - audio_duration: float = 0.0 # Audio 
duration in seconds - rtf: float = 0.0 # Real-time factor = e2e / audio_duration - prompt: str = "" - error: str = "" - - -@dataclass -class BenchmarkResult: - config_name: str = "" - concurrency: int = 0 - num_prompts: int = 0 - completed: int = 0 - failed: int = 0 - duration_s: float = 0.0 - # TTFP stats (ms) - mean_ttfp_ms: float = 0.0 - median_ttfp_ms: float = 0.0 - std_ttfp_ms: float = 0.0 - p90_ttfp_ms: float = 0.0 - p95_ttfp_ms: float = 0.0 - p99_ttfp_ms: float = 0.0 - # E2E stats (ms) - mean_e2e_ms: float = 0.0 - median_e2e_ms: float = 0.0 - std_e2e_ms: float = 0.0 - p90_e2e_ms: float = 0.0 - p95_e2e_ms: float = 0.0 - p99_e2e_ms: float = 0.0 - # RTF stats - mean_rtf: float = 0.0 - median_rtf: float = 0.0 - std_rtf: float = 0.0 - p99_rtf: float = 0.0 - # Audio stats - mean_audio_duration_s: float = 0.0 - total_audio_duration_s: float = 0.0 - audio_throughput: float = 0.0 # audio_duration / wall_time - request_throughput: float = 0.0 # requests / second - # Per-request details - per_request: list = field(default_factory=list) - - -# --------------------------------------------------------------------------- -# Audio helpers -# --------------------------------------------------------------------------- -def pcm_bytes_to_duration( - num_bytes: int, - sample_rate: int = 24000, - sample_width: int = 2, -) -> float: - """Convert raw PCM byte count to duration in seconds.""" - return num_bytes / sample_width / sample_rate - - -def _is_sse_response(response: aiohttp.ClientResponse) -> bool: - content_type = (response.headers.get("Content-Type") or "").lower() - return "text/event-stream" in content_type - - -async def _read_raw_audio_stream( - response: aiohttp.ClientResponse, - *, - start_time: float, -) -> tuple[int, float]: - first_audio_at = 0.0 - total_bytes = 0 - - async for chunk in response.content.iter_any(): - if chunk and first_audio_at <= 0: - first_audio_at = time.perf_counter() - start_time - total_bytes += len(chunk) - - return total_bytes, 
first_audio_at - - -def _extract_sse_payload(raw_event: bytes) -> bytes | None: - data_lines: list[bytes] = [] - for raw_line in raw_event.splitlines(): - line = raw_line.rstrip(b"\r") - if line.startswith(b"data: "): - data_lines.append(line[6:]) - elif line.startswith(b"data:"): - data_lines.append(line[5:].lstrip()) - - if not data_lines: - return None - return b"\n".join(data_lines).strip() - - -async def _read_sse_audio_stream( - response: aiohttp.ClientResponse, - *, - start_time: float, -) -> tuple[int, float]: - """Decode SSE events and count raw audio bytes from base64 payloads.""" - first_audio_at = 0.0 - total_bytes = 0 - pending = b"" - - async for chunk in response.content.iter_any(): - if not chunk: - continue - pending += chunk - pending = pending.replace(b"\r\n", b"\n") - - while b"\n\n" in pending: - raw_event, pending = pending.split(b"\n\n", 1) - payload_bytes = _extract_sse_payload(raw_event) - if payload_bytes is None: - continue - if payload_bytes == b"[DONE]": - return total_bytes, first_audio_at - - try: - payload = json.loads(payload_bytes) - except json.JSONDecodeError as exc: - raise ValueError(f"Invalid SSE JSON payload: {exc}") from exc - - audio = payload.get("audio") - if not isinstance(audio, dict): - continue - - audio_b64 = audio.get("data") - if not audio_b64: - continue - - try: - audio_bytes = base64.b64decode(audio_b64) - except Exception as exc: - raise ValueError(f"Invalid base64 audio chunk: {exc}") from exc - - if audio_bytes and first_audio_at <= 0: - first_audio_at = time.perf_counter() - start_time - total_bytes += len(audio_bytes) - - return total_bytes, first_audio_at - - -# --------------------------------------------------------------------------- -# Metrics -# --------------------------------------------------------------------------- -def compute_stats( - results: list[RequestResult], - wall_time: float, -) -> BenchmarkResult: - """Compute aggregate statistics from per-request results.""" - successful = [r for r in 
results if r.success] - failed = [r for r in results if not r.success] - - bench = BenchmarkResult( - completed=len(successful), - failed=len(failed), - duration_s=wall_time, - ) - - if not successful: - return bench - - ttfps = [r.ttfp * 1000 for r in successful] - e2es = [r.e2e * 1000 for r in successful] - rtfs = [r.rtf for r in successful] - audio_durs = [r.audio_duration for r in successful] - - bench.mean_ttfp_ms = float(np.mean(ttfps)) - bench.median_ttfp_ms = float(np.median(ttfps)) - bench.std_ttfp_ms = float(np.std(ttfps)) - bench.p90_ttfp_ms = float(np.percentile(ttfps, 90)) - bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) - bench.p99_ttfp_ms = float(np.percentile(ttfps, 99)) - - bench.mean_e2e_ms = float(np.mean(e2es)) - bench.median_e2e_ms = float(np.median(e2es)) - bench.std_e2e_ms = float(np.std(e2es)) - bench.p90_e2e_ms = float(np.percentile(e2es, 90)) - bench.p95_e2e_ms = float(np.percentile(e2es, 95)) - bench.p99_e2e_ms = float(np.percentile(e2es, 99)) - - bench.mean_rtf = float(np.mean(rtfs)) - bench.median_rtf = float(np.median(rtfs)) - bench.std_rtf = float(np.std(rtfs)) - bench.p99_rtf = float(np.percentile(rtfs, 99)) - - bench.mean_audio_duration_s = float(np.mean(audio_durs)) - bench.total_audio_duration_s = float(np.sum(audio_durs)) - bench.audio_throughput = bench.total_audio_duration_s / wall_time - bench.request_throughput = len(successful) / wall_time - - bench.per_request = [ - { - "ttfp_ms": r.ttfp * 1000, - "e2e_ms": r.e2e * 1000, - "rtf": r.rtf, - "audio_duration_s": r.audio_duration, - "prompt": r.prompt, - } - for r in successful - ] - - return bench - - -# --------------------------------------------------------------------------- -# Output formatting -# --------------------------------------------------------------------------- -def print_benchmark_results(bench: BenchmarkResult) -> None: - """Print benchmark results in standardized format.""" - W = 50 - print("") - print(f"{'=' * W}") - print(f"{'Serving Benchmark 
Result':^{W}}") - print(f"{'=' * W}") - print(f"{'Successful requests:':<40}{bench.completed:<10}") - print(f"{'Failed requests:':<40}{bench.failed:<10}") - print(f"{'Maximum request concurrency:':<40}{bench.concurrency:<10}") - print(f"{'Benchmark duration (s):':<40}{bench.duration_s:<10.2f}") - print(f"{'Request throughput (req/s):':<40}{bench.request_throughput:<10.2f}") - print(f"{'-' * W}") - print(f"{'End-to-end Latency':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean E2EL (ms):':<40}{bench.mean_e2e_ms:<10.2f}") - print(f"{'Median E2EL (ms):':<40}{bench.median_e2e_ms:<10.2f}") - print(f"{'P99 E2EL (ms):':<40}{bench.p99_e2e_ms:<10.2f}") - print(f"{'=' * W}") - print(f"{'Audio Result':^{W}}") - print(f"{'=' * W}") - print(f"{'Total audio duration generated (s):':<40}{bench.total_audio_duration_s:<10.2f}") - print(f"{'Audio throughput (audio duration/s):':<40}{bench.audio_throughput:<10.2f}") - print(f"{'-' * W}") - print(f"{'Time to First Packet':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean AUDIO_TTFP (ms):':<40}{bench.mean_ttfp_ms:<10.2f}") - print(f"{'Median AUDIO_TTFP (ms):':<40}{bench.median_ttfp_ms:<10.2f}") - print(f"{'P99 AUDIO_TTFP (ms):':<40}{bench.p99_ttfp_ms:<10.2f}") - print(f"{'-' * W}") - print(f"{'Real Time Factor':^{W}}") - print(f"{'-' * W}") - print(f"{'Mean AUDIO_RTF:':<40}{bench.mean_rtf:<10.3f}") - print(f"{'Median AUDIO_RTF:':<40}{bench.median_rtf:<10.3f}") - print(f"{'P99 AUDIO_RTF:':<40}{bench.p99_rtf:<10.3f}") - print(f"{'=' * W}") - print("") - - -def save_results( - all_results: list[dict], - result_dir: str, - config_name: str, -) -> Path: - """Save benchmark results as JSON and return the file path.""" - out = Path(result_dir) - out.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - result_file = out / f"bench_{config_name}_{timestamp}.json" - - with open(result_file, "w") as f: - json.dump(all_results, f, indent=2) - print(f"Results saved to {result_file}") - return result_file - - -# 
--------------------------------------------------------------------------- -# Streaming HTTP client -# --------------------------------------------------------------------------- -async def send_streaming_request( - session: aiohttp.ClientSession, - api_url: str, - payload: dict, - sample_rate: int, - sample_width: int, - pbar: tqdm | None = None, -) -> RequestResult: - """Send a streaming TTS request and measure latency metrics.""" - result = RequestResult(prompt=payload.get("input", "")) - st = time.perf_counter() - - try: - async with session.post(api_url, json=payload) as response: - if response.status != 200: - result.error = f"HTTP {response.status}: {await response.text()}" - else: - if _is_sse_response(response): - total_bytes, result.ttfp = await _read_sse_audio_stream( - response, - start_time=st, - ) - else: - total_bytes, result.ttfp = await _read_raw_audio_stream( - response, - start_time=st, - ) - - result.e2e = time.perf_counter() - st - result.audio_bytes = total_bytes - result.audio_duration = pcm_bytes_to_duration(total_bytes, sample_rate, sample_width) - - if total_bytes <= 0 or result.ttfp <= 0: - result.error = "HTTP 200 but no audio bytes were received" - else: - if result.audio_duration > 0: - result.rtf = result.e2e / result.audio_duration - result.success = True - - except Exception as e: - result.error = str(e) - result.e2e = time.perf_counter() - st - - finally: - if pbar: - pbar.update(1) - return result - - -# --------------------------------------------------------------------------- -# Benchmark runner -# --------------------------------------------------------------------------- -async def run_benchmark( - host: str, - port: int, - num_prompts: int, - max_concurrency: int, - create_payload_fn: Callable[[str], dict], - sample_rate: int, - sample_width: int = 2, - num_warmups: int = 3, - request_timeout_s: float = 120.0, -) -> BenchmarkResult: - """Run a TTS streaming benchmark at a given concurrency level. 
- - Args: - create_payload_fn: Model-specific function that takes a prompt string - and returns the request JSON payload dict. - sample_rate: PCM sample rate for audio duration calculation. - sample_width: PCM sample width in bytes (default 2 for 16-bit). - """ - api_url = f"http://{host}:{port}/v1/audio/speech" - - connector = aiohttp.TCPConnector( - limit=max_concurrency, - limit_per_host=max_concurrency, - keepalive_timeout=60, - ) - session = aiohttp.ClientSession( - connector=connector, - timeout=aiohttp.ClientTimeout( - total=request_timeout_s, - connect=min(10.0, request_timeout_s), - sock_connect=min(10.0, request_timeout_s), - sock_read=request_timeout_s, - ), - ) - - try: - # Warmup - if num_warmups > 0: - print(f" Warming up with {num_warmups} requests...") - warmup_tasks = [ - send_streaming_request( - session, - api_url, - create_payload_fn(PROMPTS[i % len(PROMPTS)]), - sample_rate, - sample_width, - ) - for i in range(num_warmups) - ] - warmup_results = await asyncio.gather(*warmup_tasks) - warmup_ok = sum(1 for r in warmup_results if r.success) - if warmup_ok == 0: - print(" WARNING: All warmup requests failed!") - for r in warmup_results: - if r.error: - print(f" {r.error[:200]}") - print(f" Warmup done ({warmup_ok}/{num_warmups} succeeded).") - - # Build request list - request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] - - # Run - print(f" Running {num_prompts} requests with concurrency={max_concurrency}...") - semaphore = asyncio.Semaphore(max_concurrency) - pbar = tqdm(total=num_prompts, desc=f" concurrency={max_concurrency}") - - async def limited_request(prompt: str) -> RequestResult: - async with semaphore: - return await send_streaming_request( - session, - api_url, - create_payload_fn(prompt), - sample_rate, - sample_width, - pbar, - ) - - start_time = time.perf_counter() - tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] - results: list[RequestResult] = await asyncio.gather(*tasks) - 
wall_time = time.perf_counter() - start_time - pbar.close() - - finally: - await session.close() - - # Compute stats - bench = compute_stats(results, wall_time) - bench.concurrency = max_concurrency - bench.num_prompts = num_prompts - - print_benchmark_results(bench) - - # Print sample errors - failed = [r for r in results if not r.success] - if failed: - for r in failed[:3]: - print(f" [ERROR] {r.error[:200]}") - - return bench - - -async def run_benchmark_sweep( - host: str, - port: int, - num_prompts: int, - concurrency_levels: list[int], - create_payload_fn: Callable[[str], dict], - sample_rate: int, - sample_width: int = 2, - num_warmups: int = 3, - request_timeout_s: float = 120.0, - config_name: str = "benchmark", - result_dir: str = "results", -) -> list[dict]: - """Run benchmarks across multiple concurrency levels and save results.""" - all_results = [] - - for concurrency in concurrency_levels: - result = await run_benchmark( - host=host, - port=port, - num_prompts=num_prompts, - max_concurrency=concurrency, - create_payload_fn=create_payload_fn, - sample_rate=sample_rate, - sample_width=sample_width, - num_warmups=num_warmups, - request_timeout_s=request_timeout_s, - ) - result.config_name = config_name - all_results.append(asdict(result)) - - save_results(all_results, result_dir, config_name) - return all_results diff --git a/benchmarks/glm_image/README.md b/benchmarks/glm_image/README.md deleted file mode 100644 index 485e081426f..00000000000 --- a/benchmarks/glm_image/README.md +++ /dev/null @@ -1,157 +0,0 @@ -# GLM-Image Benchmarks - -Benchmark GLM-Image T2I (text-to-image) and I2I (image-to-image) performance across three backends: HuggingFace baseline, vLLM-Omni offline, and vLLM-Omni online serving. 
- -## Benchmarks - -| Benchmark | Script | Description | -|-----------|--------|-------------| -| HuggingFace Baseline | `huggingface/inference.py` | Single-GPU transformers + diffusers pipeline | -| vLLM-Omni Offline | `vllm-omni/inference.py` | Offline inference with continuous batching | -| vLLM-Omni Online | `benchmark_glm_image.py` | Online serving via `/v1/chat/completions` | - -## HuggingFace Baseline - -Single-request sequential inference using the reference HuggingFace pipeline. - -```bash -# T2I -CUDA_VISIBLE_DEVICES=0 python benchmarks/glm_image/huggingface/inference.py \ - --model-path /path/to/GLM-Image --mode t2i --num-prompts 10 - -# I2I -CUDA_VISIBLE_DEVICES=0 python benchmarks/glm_image/huggingface/inference.py \ - --model-path /path/to/GLM-Image --mode i2i --num-prompts 10 -``` - -### Options - -| Flag | Default | Description | -|------|---------|-------------| -| `--model-path` | `zai-org/GLM-Image` | Model path | -| `--mode` | `t2i` | `t2i` or `i2i` | -| `--dataset-path` | `prompt/prompt.json` | Path to prompt.json | -| `--num-prompts` | `10` | Number of images to generate | -| `--width` / `--height` | `1024` | Output image size | -| `--num-inference-steps` | `50` | Diffusion denoising steps | -| `--output-dir` | `benchmarks/glm_image/huggingface/outputs` | Output directory | -| `--output-file` | - | JSON file for metrics | - -## vLLM-Omni Offline - -Multi-GPU offline inference with pipeline parallelism and continuous batching. 
- -```bash -# T2I -CUDA_VISIBLE_DEVICES=0,1 python benchmarks/glm_image/vllm-omni/inference.py \ - --model-path /path/to/GLM-Image --mode t2i --num-prompts 10 - -# I2I -CUDA_VISIBLE_DEVICES=0,1 python benchmarks/glm_image/vllm-omni/inference.py \ - --model-path /path/to/GLM-Image --mode i2i --num-prompts 10 -``` - -### Options - -| Flag | Default | Description | -|------|---------|-------------| -| `--model-path` | `zai-org/GLM-Image` | Model path | -| `--deploy-config` | - | Deploy config YAML | -| `--mode` | `t2i` | `t2i` or `i2i` | -| `--dataset-path` | `prompt/prompt.json` | Path to prompt.json | -| `--num-prompts` | `10` | Number of images to generate | -| `--width` / `--height` | `1024` | Output image size | -| `--num-inference-steps` | `50` | Diffusion denoising steps | -| `--output-dir` | `benchmarks/glm_image/vllm-omni/outputs` | Output directory | -| `--output-file` | - | JSON file for metrics | -| `--stage-init-timeout` | `600` | Stage initialization timeout (s) | - -### Latency Computation - -In offline mode all requests are submitted simultaneously and processed with continuous batching. The per-request latency is computed by summing the actual per-stage times (with `stage_0_gen_ms` diffed against the previous request to remove accumulated queue/scheduling wait). 
- -## vLLM-Omni Online Serving - -### Start the server - -```bash -CUDA_VISIBLE_DEVICES=0,1 vllm serve /path/to/GLM-Image \ - --omni --port 8091 --host 0.0.0.0 \ - --served-model-name glm-image -``` - -### Run the benchmark - -```bash -# T2I -python benchmarks/glm_image/benchmark_glm_image.py \ - --mode t2i --num-prompts 10 --model glm-image - -# I2I -python benchmarks/glm_image/benchmark_glm_image.py \ - --mode i2i --num-prompts 10 --model glm-image - -# Custom dataset -python benchmarks/glm_image/benchmark_glm_image.py \ - --mode i2i --dataset custom \ - --dataset-path prompts.json --num-prompts 5 -``` - -### Options - -| Flag | Default | Description | -|------|---------|-------------| -| `--mode` | `t2i` | `t2i` or `i2i` | -| `--dataset` | `prompt` | `prompt`, `random`, or `custom` | -| `--dataset-path` | - | JSON file path (required for `custom`) | -| `--num-prompts` | `10` | Number of benchmark requests | -| `--max-concurrency` | `1` | Max concurrent requests | -| `--request-rate` | `inf` | Requests per second (Poisson arrival) | -| `--warmup-requests` | `1` | Warmup requests before measurement | -| `--width` / `--height` | `1024` | Output image size | -| `--num-inference-steps` | `50` | Diffusion denoising steps | -| `--seed` | - | Random seed | -| `--model` | `default` | Model name (must match `--served-model-name`) | -| `--host` | `localhost` | Server host | -| `--port` | `8091` | Server port | -| `--output-file` | - | JSON output file for metrics | -| `--num-input-images` | `1` | Number of input images for random I2I | - -## Dataset - -The default dataset is hosted on [HuggingFace](https://huggingface.co/datasets/JaredforReal/glm-image-bench) (`prompt.json`). It is automatically downloaded and cached to `prompt/prompt.json` on first run. No manual setup needed. 
- -Each entry contains: - -- `t2i_prompt`: Text prompt for text-to-image generation -- `i2i_prompt`: Text prompt for image-to-image editing -- `image_url`: Source image URL for I2I (downloaded and cached on first use) - -Custom datasets use the same JSON format and can be provided via `--dataset-path`. - -## Pipeline Timings - -All three benchmarks report per-stage pipeline timings (in milliseconds): - -| Key | Description | -|-----|-------------| -| `preprocess_ms` | Input preprocessing (tokenization, multimodal encoding) | -| `stage_0_gen_ms` | AR (autoregressive) model generation time | -| `ar2diffusion_ms` | AR output to diffusion input conversion | -| `stage_1_gen_ms` | Diffusion model denoising time | -| `queue_wait_ms` | Queue wait time before processing | - -The stages are ordered by execution: `preprocess → stage_0 (AR) → ar2diffusion → stage_1 (Diffusion)`. - -## Sample Results - -Tested on 2x GPU with 10 prompts, 1024x1024, 50 denoising steps: - -| Backend | Mode | Latency Mean (s) | Throughput (img/s) | -|---------|------|-------------------|--------------------| -| HuggingFace | T2I | 72.6 | 0.014 | -| HuggingFace | I2I | 70.9 | 0.014 | -| vLLM-Omni Offline | T2I | 35.0 | 0.044 | -| vLLM-Omni Offline | I2I | 31.0 | 0.053 | -| vLLM-Omni Online | T2I | 38.8 | 0.026 | -| vLLM-Omni Online | I2I | 34.7 | 0.029 | diff --git a/benchmarks/glm_image/__init__.py b/benchmarks/glm_image/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/benchmarks/glm_image/benchmark_glm_image.py b/benchmarks/glm_image/benchmark_glm_image.py deleted file mode 100644 index 9f8df3f1986..00000000000 --- a/benchmarks/glm_image/benchmark_glm_image.py +++ /dev/null @@ -1,464 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Online serving benchmark for GLM-Image (T2I and I2I modes). 
- -Sends requests to the /v1/chat/completions endpoint and reports end-to-end -latency, throughput, and per-stage durations (when the server is started with ---enable-diffusion-pipeline-profiler and/or --enable-ar-profiler). - -Supports three dataset types: - - prompt: Use prompt.json (default). T2I uses t2i_prompt, I2I uses i2i_prompt - and sends source images from image_url. - - random: Generate synthetic prompts (and random images for I2I). - - custom: Load from a user-specified JSON file. - -Usage: - # T2I with prompt.json (default) - python benchmarks/glm_image/benchmark_glm_image.py \ - --mode t2i --num-prompts 10 - - # I2I with prompt.json (downloads source images automatically) - python benchmarks/glm_image/benchmark_glm_image.py \ - --mode i2i --num-prompts 10 - - # Random dataset - python benchmarks/glm_image/benchmark_glm_image.py \ - --mode t2i --dataset random --num-prompts 20 - - # Custom dataset - python benchmarks/glm_image/benchmark_glm_image.py \ - --mode i2i --dataset custom \ - --dataset-path my_prompts.json --num-prompts 5 -""" - -import argparse -import asyncio -import base64 -import json -import os -import sys -import tempfile -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import aiohttp -import numpy as np -import requests as sync_requests -from PIL import Image -from tqdm.asyncio import tqdm - -# Import backends from the diffusion benchmark (add parent dirs to path) -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "diffusion")) -from backends import RequestFuncOutput - -BENCHMARK_DIR = Path(__file__).resolve().parent -DEFAULT_PROMPT_JSON = BENCHMARK_DIR / "prompt" / "prompt.json" -IMAGE_CACHE_DIR = BENCHMARK_DIR / "prompt" / "images" - -DATASET_REPO = "JaredforReal/glm-image-bench" -DATASET_FILE = "prompt.json" - - -def _ensure_prompt_json(dataset_path: str | None) -> str: - """Return path to prompt.json, downloading from HuggingFace if needed.""" - if dataset_path: - 
return dataset_path - local = DEFAULT_PROMPT_JSON - if local.exists(): - return str(local) - print(f"Downloading {DATASET_FILE} from {DATASET_REPO} ...") - try: - from huggingface_hub import hf_hub_download - - downloaded = hf_hub_download( - repo_id=DATASET_REPO, - filename=DATASET_FILE, - repo_type="dataset", - ) - local.parent.mkdir(parents=True, exist_ok=True) - import shutil - - shutil.copy2(downloaded, local) - print(f"Saved to {local}") - except ImportError: - url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{DATASET_FILE}" - import urllib.request - - local.parent.mkdir(parents=True, exist_ok=True) - urllib.request.urlretrieve(url, local) - print(f"Saved to {local}") - return str(local) - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -@dataclass -class GLMImageRequest: - prompt: str - image_path: str | None = None # Only for I2I mode - - -def download_image(url: str) -> str: - """Download an image to cache and return the local path.""" - IMAGE_CACHE_DIR.mkdir(parents=True, exist_ok=True) - fname = url.rsplit("/", 1)[-1] - local_path = IMAGE_CACHE_DIR / fname - if local_path.exists(): - return str(local_path) - resp = sync_requests.get(url, timeout=30) - resp.raise_for_status() - local_path.write_bytes(resp.content) - return str(local_path) - - -def encode_image_as_data_url(path: str) -> str: - """Encode a local image file as a base64 data URL.""" - with open(path, "rb") as f: - encoded = base64.b64encode(f.read()).decode("utf-8") - ext = Path(path).suffix.lower() - mime = {"png": "image/png", ".jpg": "image/jpeg", ".jpeg": "image/jpeg"}.get(ext, "image/png") - return f"data:{mime};base64,{encoded}" - - -# --------------------------------------------------------------------------- -# Datasets -# --------------------------------------------------------------------------- - - -class PromptDataset: - """Load from 
prompt.json. T2I uses t2i_prompt, I2I uses i2i_prompt + image_url.""" - - def __init__(self, args: argparse.Namespace): - path = _ensure_prompt_json(args.dataset_path) - with open(path, encoding="utf-8") as f: - raw = json.load(f) - - prompt_key = "t2i_prompt" if args.mode == "t2i" else "i2i_prompt" - self.items: list[GLMImageRequest] = [] - - for entry in raw: - prompt = entry.get(prompt_key, "").strip() - if not prompt: - continue - image_path = None - if args.mode == "i2i": - url = entry.get("image_url", "") - if url: - image_path = download_image(url) - self.items.append(GLMImageRequest(prompt=prompt, image_path=image_path)) - - if args.num_prompts and len(self.items) > args.num_prompts: - self.items = self.items[: args.num_prompts] - - def __len__(self) -> int: - return len(self.items) - - def __getitem__(self, idx: int) -> GLMImageRequest: - return self.items[idx] - - def get_requests(self) -> list[GLMImageRequest]: - return list(self.items) - - -class RandomDataset: - """Generate synthetic prompts (and optional random images for I2I).""" - - def __init__(self, args: argparse.Namespace): - self.args = args - self.num_prompts = args.num_prompts - self._random_image_paths: list[str] | None = None - if args.mode == "i2i": - self._random_image_paths = self._generate_random_images() - - def _generate_random_images(self) -> list[str]: - paths: list[str] = [] - for i in range(self.args.num_input_images): - img = Image.new("RGB", (512, 512), (128 + i * 30 % 128, 64, 192)) - path = os.path.join(tempfile.gettempdir(), f"glm_image_bench_input_{i}.png") - img.save(path) - paths.append(path) - return paths - - def __len__(self) -> int: - return self.num_prompts - - def __getitem__(self, idx: int) -> GLMImageRequest: - image_path = None - if self._random_image_paths is not None: - image_path = self._random_image_paths[idx % len(self._random_image_paths)] - return GLMImageRequest( - prompt=f"A beautiful scene with vivid colors and intricate details, prompt {idx}", - 
image_path=image_path, - ) - - def get_requests(self) -> list[GLMImageRequest]: - return [self[i] for i in range(len(self))] - - -class CustomDataset: - """Load from a user-specified JSON file. - - Expected format: - [ - {"prompt": "A cat sitting on a windowsill"}, - {"prompt": "Make it look like winter", "image_path": "/path/to/img.png"} - ] - """ - - def __init__(self, args: argparse.Namespace): - if not args.dataset_path: - raise ValueError("--dataset-path is required for custom dataset") - with open(args.dataset_path, encoding="utf-8") as f: - raw = json.load(f) - self.items: list[GLMImageRequest] = [] - for item in raw: - self.items.append( - GLMImageRequest( - prompt=item.get("prompt", ""), - image_path=item.get("image_path"), - ) - ) - if args.num_prompts and len(self.items) > args.num_prompts: - self.items = self.items[: args.num_prompts] - - def __len__(self) -> int: - return len(self.items) - - def __getitem__(self, idx: int) -> GLMImageRequest: - return self.items[idx] - - def get_requests(self) -> list[GLMImageRequest]: - return list(self.items) - - -# --------------------------------------------------------------------------- -# Async request for GLM-Image (chat completions with image support) -# --------------------------------------------------------------------------- - - -async def async_glm_image_request( - req: GLMImageRequest, - api_url: str, - model: str, - session: aiohttp.ClientSession, - pbar: Any, - args: argparse.Namespace, -) -> RequestFuncOutput: - """Send a single T2I or I2I request via chat completions endpoint.""" - output = RequestFuncOutput() - output.start_time = time.perf_counter() - - # Build messages - if req.image_path and args.mode == "i2i": - data_url = encode_image_as_data_url(req.image_path) - content = [ - {"type": "text", "text": req.prompt}, - {"type": "image_url", "image_url": {"url": data_url}}, - ] - else: - content = req.prompt - - messages = [{"role": "user", "content": content}] - - extra_body: dict[str, Any] = {} 
- if args.height: - extra_body["height"] = args.height - if args.width: - extra_body["width"] = args.width - if args.num_inference_steps: - extra_body["num_inference_steps"] = args.num_inference_steps - if args.seed is not None: - extra_body["seed"] = args.seed - - payload: dict[str, Any] = { - "model": model, - "messages": messages, - } - if extra_body: - payload["extra_body"] = extra_body - - try: - async with session.post(api_url, json=payload) as response: - if response.status == 200: - resp_json = await response.json() - output.response_body = resp_json - output.success = True - try: - choices = resp_json.get("choices", []) - if choices and isinstance(choices, list): - msg = choices[0].get("message", {}) - if isinstance(msg, dict): - resp_content = msg.get("content", []) - if resp_content and isinstance(resp_content, list) and len(resp_content) > 0: - first_item = resp_content[0] - if isinstance(first_item, dict): - output.stage_durations = first_item.get("stage_durations") or {} - output.peak_memory_mb = first_item.get("peak_memory_mb", 0.0) - except (IndexError, TypeError, AttributeError): - pass - else: - output.error = f"HTTP {response.status}: {await response.text()}" - output.success = False - except Exception as e: - output.error = str(e) - output.success = False - - output.latency = time.perf_counter() - output.start_time - if pbar: - pbar.update(1) - return output - - -# --------------------------------------------------------------------------- -# Benchmark -# --------------------------------------------------------------------------- - - -async def iter_requests(n: int, request_rate: float) -> Any: - import random as _random - - for i in range(n): - if request_rate != float("inf") and i > 0: - await asyncio.sleep(_random.expovariate(request_rate)) - yield i - - -def calculate_metrics(outputs: list[RequestFuncOutput], total_duration: float) -> dict[str, Any]: - success = [o for o in outputs if o.success] - errors = [o for o in outputs if not 
o.success] - latencies = [o.latency for o in success] - peak_mems = [o.peak_memory_mb for o in success if o.peak_memory_mb > 0] - - stage_duration_lists: dict[str, list[float]] = {} - for o in success: - for stage, dur in (o.stage_durations or {}).items(): - stage_duration_lists.setdefault(stage, []).append(dur) - - return { - "duration": total_duration, - "completed_requests": len(success), - "failed_requests": len(errors), - "throughput_qps": len(success) / total_duration if total_duration > 0 else 0, - "latency_mean": float(np.mean(latencies)) if latencies else 0, - "latency_median": float(np.median(latencies)) if latencies else 0, - "latency_p99": float(np.percentile(latencies, 99)) if latencies else 0, - "latency_p95": float(np.percentile(latencies, 95)) if latencies else 0, - "peak_memory_mb_max": max(peak_mems) if peak_mems else 0, - "stage_durations_mean": {s: float(np.mean(v)) for s, v in stage_duration_lists.items()}, - "stage_durations_p50": {s: float(np.percentile(v, 50)) for s, v in stage_duration_lists.items()}, - } - - -async def benchmark(args: argparse.Namespace) -> None: - api_url = f"http://{args.host}:{args.port}/v1/chat/completions" - - # Load dataset - if args.dataset == "prompt": - dataset = PromptDataset(args) - elif args.dataset == "random": - dataset = RandomDataset(args) - elif args.dataset == "custom": - dataset = CustomDataset(args) - else: - raise ValueError(f"Unknown dataset: {args.dataset}") - - glm_requests = dataset.get_requests() - print(f"Prepared {len(glm_requests)} requests (mode={args.mode}, dataset={args.dataset})") - - semaphore = asyncio.Semaphore(args.max_concurrency) if args.max_concurrency else None - - async def limited_request(idx: int, req: GLMImageRequest, session: aiohttp.ClientSession, pbar: Any): - if semaphore: - async with semaphore: - return await async_glm_image_request(req, api_url, args.model, session, pbar, args) - return await async_glm_image_request(req, api_url, args.model, session, pbar, args) - - async 
with aiohttp.ClientSession() as session: - # Warmup - if args.warmup_requests and glm_requests: - print(f"Running {args.warmup_requests} warmup request(s)...") - for i in range(args.warmup_requests): - await limited_request(i, glm_requests[i % len(glm_requests)], session, None) - - # Main benchmark - pbar = tqdm(total=len(glm_requests), disable=args.disable_tqdm) - start_time = time.perf_counter() - tasks = [] - async for idx in iter_requests(len(glm_requests), args.request_rate): - tasks.append(asyncio.create_task(limited_request(idx, glm_requests[idx], session, pbar))) - outputs = await asyncio.gather(*tasks) - total_duration = time.perf_counter() - start_time - pbar.close() - - # Metrics - metrics = calculate_metrics(outputs, total_duration) - metrics["mode"] = args.mode - metrics["model"] = args.model - metrics["dataset"] = args.dataset - - print(f"\n{' GLM-Image Online Benchmark Result ':=^60}") - print(f"{'Mode:':<40} {args.mode}") - print(f"{'Model:':<40} {args.model}") - print(f"{'Dataset:':<40} {args.dataset}") - print("-" * 50) - print(f"{'Benchmark duration (s):':<40} {metrics['duration']:.2f}") - print(f"{'Request rate:':<40} {args.request_rate}") - print(f"{'Max concurrency:':<40} {args.max_concurrency}") - print(f"{'Successful requests:':<40} {metrics['completed_requests']}/{len(glm_requests)}") - print("-" * 50) - print(f"{'Throughput (req/s):':<40} {metrics['throughput_qps']:.2f}") - print(f"{'Latency Mean (s):':<40} {metrics['latency_mean']:.4f}") - print(f"{'Latency Median (s):':<40} {metrics['latency_median']:.4f}") - print(f"{'Latency P95 (s):':<40} {metrics['latency_p95']:.4f}") - print(f"{'Latency P99 (s):':<40} {metrics['latency_p99']:.4f}") - - if metrics["peak_memory_mb_max"] > 0: - print("-" * 50) - print(f"{'Peak Memory Max (MB):':<40} {metrics['peak_memory_mb_max']:.2f}") - - if metrics["stage_durations_mean"]: - print("-" * 50) - print("Stage Durations Mean:") - for stage, val in sorted(metrics["stage_durations_mean"].items()): - unit = 
"ms" if stage.endswith("_ms") else "s" - print(f" {stage + ':':<38} {val:.4f} ({unit})") - - print("=" * 60) - - if args.output_file: - with open(args.output_file, "w") as f: - json.dump(metrics, f, indent=2) - print(f"Metrics saved to {args.output_file}") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Benchmark GLM-Image T2I/I2I online serving.") - parser.add_argument("--mode", type=str, default="t2i", choices=["t2i", "i2i"]) - parser.add_argument("--dataset", type=str, default="prompt", choices=["prompt", "random", "custom"]) - parser.add_argument("--dataset-path", type=str, default=None) - parser.add_argument("--num-prompts", type=int, default=10) - parser.add_argument("--max-concurrency", type=int, default=1) - parser.add_argument("--request-rate", type=float, default=float("inf")) - parser.add_argument("--warmup-requests", type=int, default=1) - parser.add_argument("--width", type=int, default=1024) - parser.add_argument("--height", type=int, default=1024) - parser.add_argument("--num-inference-steps", type=int, default=50) - parser.add_argument("--seed", type=int, default=None) - parser.add_argument("--model", type=str, default="default") - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8091) - parser.add_argument("--output-file", type=str, default=None) - parser.add_argument("--disable-tqdm", action="store_true") - parser.add_argument("--num-input-images", type=int, default=1, help="For random I2I dataset.") - args = parser.parse_args() - asyncio.run(benchmark(args)) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/glm_image/huggingface/inference.py b/benchmarks/glm_image/huggingface/inference.py deleted file mode 100644 index ff826080e8c..00000000000 --- a/benchmarks/glm_image/huggingface/inference.py +++ /dev/null @@ -1,291 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" 
-HuggingFace (transformers + diffusers) baseline benchmark for GLM-Image. - -Supports T2I and I2I modes with the prompt.json dataset. -Downloads source images for I2I from image_url on first run and caches locally. - -Usage: - # T2I mode (text-to-image, no source images needed) - python benchmarks/glm_image/huggingface/inference.py \ - --model-path zai-org/GLM-Image \ - --mode t2i --num-prompts 10 - - # I2I mode (image-to-image, downloads source images) - python benchmarks/glm_image/huggingface/inference.py \ - --model-path zai-org/GLM-Image \ - --mode i2i --num-prompts 10 - - # With custom prompt.json - python benchmarks/glm_image/huggingface/inference.py \ - --model-path zai-org/GLM-Image \ - --mode i2i --dataset-path prompts.json --num-prompts 5 -""" - -import argparse -import json -import os -import time -from pathlib import Path - -import numpy as np -import requests -import torch -from PIL import Image - -BENCHMARK_DIR = Path(__file__).resolve().parent.parent -DEFAULT_PROMPT_JSON = BENCHMARK_DIR / "prompt" / "prompt.json" -IMAGE_CACHE_DIR = BENCHMARK_DIR / "prompt" / "images" - -DATASET_REPO = "JaredforReal/glm-image-bench" -DATASET_FILE = "prompt.json" - - -def _ensure_prompt_json(dataset_path: str | None) -> str: - """Return path to prompt.json, downloading from HuggingFace if needed.""" - if dataset_path: - return dataset_path - local = DEFAULT_PROMPT_JSON - if local.exists(): - return str(local) - print(f"Downloading {DATASET_FILE} from {DATASET_REPO} ...") - try: - from huggingface_hub import hf_hub_download - - downloaded = hf_hub_download( - repo_id=DATASET_REPO, - filename=DATASET_FILE, - repo_type="dataset", - ) - local.parent.mkdir(parents=True, exist_ok=True) - import shutil - - shutil.copy2(downloaded, local) - print(f"Saved to {local}") - except ImportError: - url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{DATASET_FILE}" - import urllib.request - - local.parent.mkdir(parents=True, exist_ok=True) - 
urllib.request.urlretrieve(url, local) - print(f"Saved to {local}") - return str(local) - - -HEIGHT = 1024 -WIDTH = 1024 -SEED = 42 -NUM_INFERENCE_STEPS = 50 -GUIDANCE_SCALE = 1.5 - - -# --------------------------------------------------------------------------- -# Dataset -# --------------------------------------------------------------------------- - - -def load_dataset( - dataset_path: str | None, - mode: str, - num_prompts: int, -) -> list[dict]: - """Load prompts from prompt.json and prepare per-request data.""" - path = _ensure_prompt_json(dataset_path) - with open(path, encoding="utf-8") as f: - raw = json.load(f) - - items = [] - for entry in raw: - if mode == "t2i": - prompt_key = "t2i_prompt" - else: - prompt_key = "i2i_prompt" - - prompt_text = entry.get(prompt_key, "").strip() - if not prompt_text: - continue - - item = {"prompt": prompt_text} - if mode == "i2i": - item["image_url"] = entry.get("image_url", "") - items.append(item) - - if num_prompts and len(items) > num_prompts: - items = items[:num_prompts] - return items - - -def download_image(url: str, cache_dir: Path) -> str: - """Download an image to cache_dir and return the local path.""" - cache_dir.mkdir(parents=True, exist_ok=True) - fname = url.rsplit("/", 1)[-1] - local_path = cache_dir / fname - if local_path.exists(): - return str(local_path) - print(f" Downloading {url} ...") - resp = requests.get(url, timeout=30) - resp.raise_for_status() - local_path.write_bytes(resp.content) - return str(local_path) - - -# --------------------------------------------------------------------------- -# Benchmark -# --------------------------------------------------------------------------- - - -def benchmark(args: argparse.Namespace) -> None: - from diffusers.pipelines.glm_image import GlmImagePipeline - - print("=" * 60) - print("GLM-Image HuggingFace Baseline Benchmark") - print(f"Mode: {args.mode} | Model: {args.model_path}") - print(f"Size: {args.height}x{args.width} | Steps: 
{args.num_inference_steps}") - print("=" * 60) - - # Load dataset - items = load_dataset(args.dataset_path, args.mode, args.num_prompts) - if not items: - print("No prompts loaded. Exiting.") - return - print(f"Loaded {len(items)} prompts for {args.mode} mode") - - # Download I2I source images - if args.mode == "i2i": - print("Preparing source images...") - for item in items: - url = item.get("image_url", "") - if url: - item["image_path"] = download_image(url, IMAGE_CACHE_DIR) - else: - item["image_path"] = None - - # Load pipeline - print(f"\nLoading pipeline from {args.model_path} ...") - t0 = time.perf_counter() - pipe = GlmImagePipeline.from_pretrained( - args.model_path, - torch_dtype=torch.bfloat16, - device_map="cuda", - ) - init_time = time.perf_counter() - t0 - print(f"Pipeline loaded in {init_time:.2f}s") - - # Create output dir - os.makedirs(args.output_dir, exist_ok=True) - - # Run benchmark - generator = torch.Generator(device="cuda").manual_seed(args.seed) - latencies = [] - success = 0 - failed = 0 - - print(f"\nRunning {len(items)} requests sequentially...") - print("-" * 60) - - for i, item in enumerate(items): - prompt = item["prompt"] - gen_kwargs: dict = { - "prompt": prompt, - "height": args.height, - "width": args.width, - "num_inference_steps": args.num_inference_steps, - "guidance_scale": args.guidance_scale, - "generator": generator, - } - - if args.mode == "i2i": - img_path = item.get("image_path") - if img_path and os.path.exists(img_path): - gen_kwargs["image"] = [Image.open(img_path).convert("RGB")] - else: - print(f" [{i + 1}] SKIP: no source image") - failed += 1 - continue - - t_start = time.perf_counter() - try: - result = pipe(**gen_kwargs) - image = result.images[0] - elapsed = time.perf_counter() - t_start - latencies.append(elapsed) - success += 1 - - out_path = os.path.join(args.output_dir, f"{i:04d}.png") - image.save(out_path) - print(f" [{i + 1}/{len(items)}] {elapsed:.3f}s -> {out_path}") - except Exception as e: - elapsed 
= time.perf_counter() - t_start - failed += 1 - print(f" [{i + 1}/{len(items)}] FAILED ({elapsed:.3f}s): {e}") - - # Report - total_gen_time = sum(latencies) if latencies else 0 - print("\n" + "=" * 60) - print("HuggingFace Baseline Results") - print("=" * 60) - print(f"{'Mode:':<40} {args.mode}") - print(f"{'Model:':<40} {args.model_path}") - print(f"{'Image size:':<40} {args.height}x{args.width}") - print(f"{'Num inference steps:':<40} {args.num_inference_steps}") - print("-" * 50) - print(f"{'Pipeline init time (s):':<40} {init_time:.2f}") - print(f"{'Successful:':<40} {success}/{len(items)}") - print(f"{'Failed:':<40} {failed}") - print("-" * 50) - if latencies: - arr = np.array(latencies) - print(f"{'Total generation time (s):':<40} {total_gen_time:.2f}") - print(f"{'Throughput (img/s):':<40} {success / total_gen_time:.4f}") - print(f"{'Latency Mean (s):':<40} {arr.mean():.4f}") - print(f"{'Latency Median (s):':<40} {np.median(arr):.4f}") - print(f"{'Latency P95 (s):':<40} {np.percentile(arr, 95):.4f}") - print(f"{'Latency P99 (s):':<40} {np.percentile(arr, 99):.4f}") - - print(f"\n{'Output dir:':<40} {args.output_dir}") - print("=" * 60) - - # Save metrics JSON - metrics = { - "backend": "huggingface", - "mode": args.mode, - "model": args.model_path, - "height": args.height, - "width": args.width, - "num_inference_steps": args.num_inference_steps, - "init_time_s": init_time, - "completed_requests": success, - "failed_requests": failed, - "total_gen_time_s": total_gen_time, - "throughput_qps": success / total_gen_time if total_gen_time > 0 else 0, - "latency_mean": float(np.mean(latencies)) if latencies else 0, - "latency_median": float(np.median(latencies)) if latencies else 0, - "latency_p95": float(np.percentile(latencies, 95)) if latencies else 0, - "latency_p99": float(np.percentile(latencies, 99)) if latencies else 0, - } - if args.output_file: - with open(args.output_file, "w") as f: - json.dump(metrics, f, indent=2) - print(f"Metrics saved to 
{args.output_file}") - - -def main() -> None: - parser = argparse.ArgumentParser(description="GLM-Image HuggingFace baseline benchmark") - parser.add_argument("--model-path", type=str, default="zai-org/GLM-Image") - parser.add_argument("--mode", type=str, default="t2i", choices=["t2i", "i2i"]) - parser.add_argument("--dataset-path", type=str, default=None, help="Path to prompt.json") - parser.add_argument("--num-prompts", type=int, default=10) - parser.add_argument("--height", type=int, default=HEIGHT) - parser.add_argument("--width", type=int, default=WIDTH) - parser.add_argument("--num-inference-steps", type=int, default=NUM_INFERENCE_STEPS) - parser.add_argument("--guidance-scale", type=float, default=GUIDANCE_SCALE) - parser.add_argument("--seed", type=int, default=SEED) - parser.add_argument("--output-dir", type=str, default="benchmarks/glm_image/huggingface/outputs") - parser.add_argument("--output-file", type=str, default=None, help="JSON file for metrics") - args = parser.parse_args() - benchmark(args) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/glm_image/vllm-omni/inference.py b/benchmarks/glm_image/vllm-omni/inference.py deleted file mode 100644 index 5729da07174..00000000000 --- a/benchmarks/glm_image/vllm-omni/inference.py +++ /dev/null @@ -1,505 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -vLLM-Omni offline benchmark for GLM-Image. - -Supports T2I and I2I modes with the prompt.json dataset. -Downloads source images for I2I from image_url on first run and caches locally. 
- -Usage: - # T2I mode - python benchmarks/glm_image/vllm-omni/inference.py \ - --model-path zai-org/GLM-Image \ - --mode t2i --num-prompts 10 - - # I2I mode (downloads source images) - python benchmarks/glm_image/vllm-omni/inference.py \ - --model-path zai-org/GLM-Image \ - --mode i2i --num-prompts 10 -""" - -import argparse -import json -import math -import os -import time -from pathlib import Path - -import numpy as np -import requests -from PIL import Image -from vllm import SamplingParams - -from vllm_omni.entrypoints.omni import Omni -from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -BENCHMARK_DIR = Path(__file__).resolve().parent.parent -DEFAULT_PROMPT_JSON = BENCHMARK_DIR / "prompt" / "prompt.json" -IMAGE_CACHE_DIR = BENCHMARK_DIR / "prompt" / "images" -DEFAULT_DEPLOY_CONFIG = "vllm_omni/deploy/glm_image.yaml" - -DATASET_REPO = "JaredforReal/glm-image-bench" -DATASET_FILE = "prompt.json" - - -def _ensure_prompt_json(dataset_path: str | None) -> str: - """Return path to prompt.json, downloading from HuggingFace if needed.""" - if dataset_path: - return dataset_path - local = DEFAULT_PROMPT_JSON - if local.exists(): - return str(local) - print(f"Downloading {DATASET_FILE} from {DATASET_REPO} ...") - try: - from huggingface_hub import hf_hub_download - - downloaded = hf_hub_download( - repo_id=DATASET_REPO, - filename=DATASET_FILE, - repo_type="dataset", - ) - local.parent.mkdir(parents=True, exist_ok=True) - import shutil - - shutil.copy2(downloaded, local) - print(f"Saved to {local}") - except ImportError: - url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{DATASET_FILE}" - import urllib.request - - local.parent.mkdir(parents=True, exist_ok=True) - urllib.request.urlretrieve(url, local) - print(f"Saved to {local}") - return str(local) - - -SEED = 42 -HEIGHT = 1024 -WIDTH = 1024 -NUM_INFERENCE_STEPS = 50 -GUIDANCE_SCALE = 1.5 - -GLM_IMAGE_EOS_TOKEN_ID = 16385 -GLM_IMAGE_VISION_VOCAB_SIZE = 16512 - - -# 
--------------------------------------------------------------------------- -# Dataset -# --------------------------------------------------------------------------- - - -def load_dataset( - dataset_path: str | None, - mode: str, - num_prompts: int, -) -> list[dict]: - path = _ensure_prompt_json(dataset_path) - with open(path, encoding="utf-8") as f: - raw = json.load(f) - - items = [] - for entry in raw: - prompt_key = "t2i_prompt" if mode == "t2i" else "i2i_prompt" - prompt_text = entry.get(prompt_key, "").strip() - if not prompt_text: - continue - - item = {"prompt": prompt_text} - if mode == "i2i": - item["image_url"] = entry.get("image_url", "") - items.append(item) - - if num_prompts and len(items) > num_prompts: - items = items[:num_prompts] - return items - - -def download_image(url: str, cache_dir: Path) -> str: - cache_dir.mkdir(parents=True, exist_ok=True) - fname = url.rsplit("/", 1)[-1] - local_path = cache_dir / fname - if local_path.exists(): - return str(local_path) - print(f" Downloading {url} ...") - resp = requests.get(url, timeout=30) - resp.raise_for_status() - local_path.write_bytes(resp.content) - return str(local_path) - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def compute_max_tokens(height: int, width: int, is_i2i: bool = False) -> int: - factor = 32 - token_h = height // factor - token_w = width // factor - large_tokens = token_h * token_w - - # Small preview tokens (half resolution in each dimension) - - ratio = token_h / token_w if token_w > 0 else 1.0 - small_token_h = max(1, int(math.sqrt(ratio) * (factor // 2))) - small_token_w = max(1, int(math.sqrt(1 / ratio) * (factor // 2))) - small_tokens = small_token_h * small_token_w - - # Mode-dependent totals: - # - t2i: small + large + EOS - # - i2i: large + EOS - if is_i2i: - return large_tokens + 1 - return small_tokens + large_tokens + 1 - - -def 
build_prompt_t2i(prompt: str, height: int, width: int, **gen_kw) -> dict: - return { - "prompt": prompt, - "height": height, - "width": width, - "mm_processor_kwargs": {"target_h": height, "target_w": width}, - **gen_kw, - } - - -def build_prompt_i2i(prompt: str, image_path: str, height: int, width: int, **gen_kw) -> dict: - return { - "prompt": prompt, - "height": height, - "width": width, - "mm_processor_kwargs": {"target_h": height, "target_w": width}, - "multi_modal_data": {"image": Image.open(image_path).convert("RGB")}, - **gen_kw, - } - - -def resolve_deploy_config(args: argparse.Namespace) -> str: - if args.deploy_config: - return args.deploy_config - if os.path.exists(DEFAULT_DEPLOY_CONFIG): - return DEFAULT_DEPLOY_CONFIG - fallback = Path(__file__).resolve().parents[3] / DEFAULT_DEPLOY_CONFIG - if fallback.exists(): - return str(fallback) - raise FileNotFoundError("Deploy config not found. Specify --deploy-config.") - - -# --------------------------------------------------------------------------- -# Benchmark -# --------------------------------------------------------------------------- - - -def benchmark(args: argparse.Namespace) -> None: - is_i2i = args.mode == "i2i" - - print("=" * 60) - print("GLM-Image vLLM-Omni Benchmark") - print(f"Mode: {args.mode} | Model: {args.model_path}") - print(f"Size: {args.height}x{args.width} | Steps: {args.num_inference_steps}") - print("=" * 60) - - # Load dataset - items = load_dataset(args.dataset_path, args.mode, args.num_prompts) - if not items: - print("No prompts loaded. 
Exiting.") - return - print(f"Loaded {len(items)} prompts for {args.mode} mode") - - # Download I2I source images - if is_i2i: - print("Preparing source images...") - for item in items: - url = item.get("image_url", "") - if url: - item["image_path"] = download_image(url, IMAGE_CACHE_DIR) - else: - item["image_path"] = None - - # Init Omni - deploy_config = resolve_deploy_config(args) - print(f"\nInitializing vLLM-Omni (deploy config: {deploy_config}) ...") - t0 = time.perf_counter() - - omni = Omni( - model=args.model_path, - deploy_config=deploy_config, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, - enable_ar_profiler=args.enable_ar_profiler, - ) - - init_time = time.perf_counter() - t0 - print(f"Initialized in {init_time:.2f}s") - - # Sampling params - max_tokens = compute_max_tokens(args.height, args.width, is_i2i=is_i2i) - ar_params = SamplingParams( - temperature=0.9, - top_p=0.75, - top_k=GLM_IMAGE_VISION_VOCAB_SIZE, - max_tokens=max_tokens, - stop_token_ids=[GLM_IMAGE_EOS_TOKEN_ID], - seed=args.seed, - detokenize=False, - extra_args={"target_h": args.height, "target_w": args.width}, - ) - diff_params = OmniDiffusionSamplingParams( - num_inference_steps=args.num_inference_steps, - guidance_scale=args.guidance_scale, - height=args.height, - width=args.width, - seed=args.seed, - ) - sampling_params_list = [ar_params, diff_params] - - # Build all prompts - gen_kw = { - "seed": args.seed, - "num_inference_steps": args.num_inference_steps, - "guidance_scale": args.guidance_scale, - } - all_prompts = [] - for item in items: - if is_i2i: - img_path = item.get("image_path") - if not img_path or not os.path.exists(img_path): - continue - all_prompts.append(build_prompt_i2i(item["prompt"], img_path, args.height, args.width, **gen_kw)) - else: - all_prompts.append(build_prompt_t2i(item["prompt"], args.height, args.width, **gen_kw)) - - valid = len(all_prompts) - 
print(f"Valid prompts: {valid}") - - # Create output dir - os.makedirs(args.output_dir, exist_ok=True) - - # Warmup: run 1 request to prime caches, CUDA graphs, etc. - if all_prompts: - print("Running warmup request...") - try: - warmup_prompt = [all_prompts[0]] - omni.generate(warmup_prompt, sampling_params_list, py_generator=False) - print("Warmup done.\n") - except Exception as e: - print(f"Warmup failed (continuing): {e}") - - # Run - print(f"\nRunning {valid} requests...") - print("-" * 60) - - latencies = [] - all_stage_durations: list[dict[str, float]] = [] - success = 0 - failed = 0 - wall_start = time.perf_counter() - - try: - output_idx = 0 - for stage_outputs in omni.generate(all_prompts, sampling_params_list, py_generator=True): - if stage_outputs.final_output_type == "image": - request_output = stage_outputs.request_output - request_id = getattr(request_output, "request_id", "") - - images = getattr(request_output, "images", []) - if not images and hasattr(request_output, "multimodal_output"): - mm = request_output.multimodal_output - if isinstance(mm, dict): - images = mm.get("images", []) - - elapsed = time.perf_counter() - wall_start - if images: - for img in images: - if isinstance(img, Image.Image): - out_path = os.path.join(args.output_dir, f"{output_idx:04d}.png") - img.save(out_path) - success += 1 - latencies.append(elapsed) - stage_durations = getattr(stage_outputs, "stage_durations", {}) - if stage_durations: - all_stage_durations.append(stage_durations) - # Show wall-clock elapsed and pipeline breakdown if available - preprocess_str = "" - if "preprocess_ms" in stage_durations: - preprocess_str = f" preprocess={stage_durations['preprocess_ms'] / 1000.0:.2f}s" - print(f" [{success}/{valid}] id={request_id[:8]} {elapsed:.2f}s{preprocess_str}") - output_idx += 1 - else: - failed += 1 - except Exception as e: - print(f"Error: {e}") - failed = valid - success - - total_gen_time = time.perf_counter() - wall_start - - # Diff stage_0_gen_ms with 
previous request to remove accumulated wait time. - # stage_0_gen_ms is measured from submit_ts (same for all requests submitted - # at once), so it accumulates queue/scheduling overhead across requests. - # Other stages and pipeline timings are per-request already. - _TIMING_ORDER = [ - "preprocess_ms", - "stage_0_gen_ms", - "ar2diffusion_ms", - "stage_1_gen_ms", - "queue_wait_ms", - ] - - per_request_actual: list[dict[str, float]] = [] - prev_stage_0_ms = 0.0 - for sd in all_stage_durations: - actual = dict(sd) - s0 = sd.get("stage_0_gen_ms", 0.0) - actual["stage_0_gen_ms"] = s0 - prev_stage_0_ms - prev_stage_0_ms = s0 - per_request_actual.append(actual) - - per_request_e2e_ms: list[float] = [] - for actual in per_request_actual: - e2e_ms = sum(v for k, v in actual.items() if k in _TIMING_ORDER) - if e2e_ms > 0: - per_request_e2e_ms.append(e2e_ms) - - # Report - print("\n" + "=" * 60) - print("vLLM-Omni Benchmark Results") - print("=" * 60) - print(f"{'Mode:':<40} {args.mode}") - print(f"{'Model:':<40} {args.model_path}") - print(f"{'Image size:':<40} {args.height}x{args.width}") - print(f"{'Num inference steps:':<40} {args.num_inference_steps}") - print("-" * 50) - print(f"{'Init time (s):':<40} {init_time:.2f}") - print(f"{'Successful:':<40} {success}/{valid}") - print(f"{'Failed:':<40} {failed}") - print("-" * 50) - - if per_request_e2e_ms: - per_request_s = np.array(per_request_e2e_ms) / 1000.0 - print(f"{'Total generation time (s):':<40} {total_gen_time:.2f}") - print(f"{'Throughput (img/s):':<40} {success / total_gen_time:.4f}") - print(f"{'Latency Mean (s):':<40} {per_request_s.mean():.4f}") - print(f"{'Latency Median (s):':<40} {np.median(per_request_s):.4f}") - print(f"{'Latency P95 (s):':<40} {np.percentile(per_request_s, 95):.4f}") - print(f"{'Latency P99 (s):':<40} {np.percentile(per_request_s, 99):.4f}") - print(f"{'Latency Min (s):':<40} {per_request_s.min():.4f}") - print(f"{'Latency Max (s):':<40} {per_request_s.max():.4f}") - elif latencies: - 
per_request = np.diff([0.0] + list(latencies)) - print(f"{'Total generation time (s):':<40} {total_gen_time:.2f}") - print(f"{'Throughput (img/s):':<40} {success / total_gen_time:.4f}") - print(f"{'Latency Mean (s) [wall-clock]:':<40} {per_request.mean():.4f}") - print(f"{'Latency Median (s) [wall-clock]:':<40} {np.median(per_request):.4f}") - print(f"{'Latency P95 (s) [wall-clock]:':<40} {np.percentile(per_request, 95):.4f}") - print(f"{'Latency P99 (s) [wall-clock]:':<40} {np.percentile(per_request, 99):.4f}") - print(f"{'Latency Min (s) [wall-clock]:':<40} {per_request.min():.4f}") - print(f"{'Latency Max (s) [wall-clock]:':<40} {per_request.max():.4f}") - - if per_request_actual: - print("-" * 50) - print("Pipeline Timings Mean:") - for key in _TIMING_ORDER: - vals = [d.get(key, 0.0) for d in per_request_actual] - if any(v != 0 for v in vals): - unit = "ms" if key.endswith("_ms") else "s" - print(f" {key + ':':<38} {np.mean(vals):.4f} ({unit})") - # Show any extra keys not in the ordered list - ordered_set = set(_TIMING_ORDER) - extra_keys = sorted(k for k in per_request_actual[0].keys() if k not in ordered_set) - for key in extra_keys: - vals = [d.get(key, 0.0) for d in per_request_actual] - if any(v != 0 for v in vals): - unit = "ms" if key.endswith("_ms") else "s" - print(f" {key + ':':<38} {np.mean(vals):.4f} ({unit})") - - print(f"\n{'Output dir:':<40} {args.output_dir}") - print("=" * 60) - - # Metrics JSON - metrics = { - "backend": "vllm-omni", - "mode": args.mode, - "model": args.model_path, - "height": args.height, - "width": args.width, - "num_inference_steps": args.num_inference_steps, - "init_time_s": init_time, - "completed_requests": success, - "failed_requests": failed, - "total_gen_time_s": total_gen_time, - "throughput_qps": success / total_gen_time if total_gen_time > 0 else 0, - } - if per_request_e2e_ms: - per_request_s = np.array(per_request_e2e_ms) / 1000.0 - metrics["latency_mean"] = float(per_request_s.mean()) - 
metrics["latency_median"] = float(np.median(per_request_s)) - metrics["latency_p95"] = float(np.percentile(per_request_s, 95)) - metrics["latency_p99"] = float(np.percentile(per_request_s, 99)) - elif latencies: - per_request = np.diff([0.0] + list(latencies)) - metrics["latency_mean"] = float(per_request.mean()) - metrics["latency_median"] = float(np.median(per_request)) - metrics["latency_p95"] = float(np.percentile(per_request, 95)) - metrics["latency_p99"] = float(np.percentile(per_request, 99)) - else: - metrics["latency_mean"] = 0 - metrics["latency_median"] = 0 - metrics["latency_p95"] = 0 - metrics["latency_p99"] = 0 - if per_request_actual: - all_keys = list(_TIMING_ORDER) + sorted(k for k in per_request_actual[0].keys() if k not in set(_TIMING_ORDER)) - stage_metrics = {} - for key in all_keys: - vals = [d.get(key, 0.0) for d in per_request_actual] - stage_metrics[key] = { - "mean": float(np.mean(vals)), - "median": float(np.median(vals)), - "p95": float(np.percentile(vals, 95)), - } - metrics["stage_durations"] = stage_metrics - if args.output_file: - with open(args.output_file, "w") as f: - json.dump(metrics, f, indent=2) - print(f"Metrics saved to {args.output_file}") - - omni.close() - print("Done!") - - -def main() -> None: - parser = argparse.ArgumentParser(description="GLM-Image vLLM-Omni offline benchmark") - parser.add_argument("--model-path", type=str, default="zai-org/GLM-Image") - parser.add_argument("--deploy-config", type=str, default=None, help="Deploy config YAML") - parser.add_argument("--mode", type=str, default="t2i", choices=["t2i", "i2i"]) - parser.add_argument("--dataset-path", type=str, default=None, help="Path to prompt.json") - parser.add_argument("--num-prompts", type=int, default=10) - parser.add_argument("--height", type=int, default=HEIGHT) - parser.add_argument("--width", type=int, default=WIDTH) - parser.add_argument("--num-inference-steps", type=int, default=NUM_INFERENCE_STEPS) - parser.add_argument("--guidance-scale", 
type=float, default=GUIDANCE_SCALE) - parser.add_argument("--seed", type=int, default=SEED) - parser.add_argument("--output-dir", type=str, default="benchmarks/glm_image/vllm-omni/outputs") - parser.add_argument("--output-file", type=str, default=None, help="JSON file for metrics") - parser.add_argument("--stage-init-timeout", type=int, default=600) - parser.add_argument( - "--enable-diffusion-pipeline-profiler", - action="store_true", - help="Enable diffusion pipeline profiler for stage-level timing", - ) - parser.add_argument( - "--enable-ar-profiler", - action="store_true", - help="Enable AR stage profiler to include AR timing in stage_durations", - ) - parser.add_argument( - "--log-stats", - action="store_true", - help="Enable detailed per-request pipeline stats logging", - ) - args = parser.parse_args() - benchmark(args) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/qwen3-omni/README.md b/benchmarks/qwen3-omni/README.md new file mode 100644 index 00000000000..de27c05c2c4 --- /dev/null +++ b/benchmarks/qwen3-omni/README.md @@ -0,0 +1,86 @@ +# Benchmarks Guide + +This README explains how to (1) prepare benchmark datasets and (2) run the provided Qwen3-Omni benchmarks. + +## 1) Prepare the dataset (SeedTTS top100) + +```bash +cd benchmarks/build_dataset +pip install gdown + +# Download SeedTTS test set from Google Drive +gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP + +# Extract +tar -xf seedtts_testset.tar + +# Copy metadata and extract top-100 prompts +cp seedtts_testset/en/meta.lst meta.lst +python extract_prompts.py -i meta.lst -o top100.txt -n 100 + +# (Optional) clean up to save space +rm -rf seedtts_testset seedtts_testset.tar meta.lst +``` + +Artifacts: +- `benchmarks/build_dataset/top100.txt` — 100 text prompts (one per line). + +## 2) Run benchmarks + +All commands assume repo root (`vllm-omni`). + +### A. 
Transformers benchmark (offline, HF Transformers) + +``` +bash benchmarks/qwen3-omni/transformers/eval_qwen3_moe_omni_transformers.sh +``` + +What it does: +- Runs `qwen3_omni_moe_transformers.py` over `top100.txt` with `--num-prompts 100`. +- Outputs to `benchmarks/qwen3-omni/transformers/benchmark_results/`: + - `perf_stats.json` — aggregated & per-prompt TPS/latency (thinker/talker/code2wav/overall). + - `results.json` — per-prompt outputs and audio paths. + - `audio/` — ~100 generated `.wav` files. + +Key checks: +- `overall_tps` and `*_tps_avg` should be non-zero and reasonably stable. +- Investigate any 0/NaN or unusually low TPS / long-tail latency. + +### B. vLLM Omni end-to-end benchmark (pipeline) + +``` +bash benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh +``` + +What it does: +- Runs `examples/offline_inference/qwen3_omni/end2end.py` with `--log-stats`. +- Uses `benchmarks/build_dataset/top100.txt` and writes to: + - Logs: `benchmarks/qwen3-omni/vllm_omni/logs/` + - `omni_pipeline_text.orchestrator.stats.jsonl` — per-stage latency stats. + - `omni_pipeline_text.overall.stats.jsonl` — end-to-end latency/TPS. + - `omni_pipeline_text.stage{0,1,2}.log` — per-stage detailed logs/errors. + - Outputs: `benchmarks/qwen3-omni/vllm_omni/outputs/` — ~100 text and `.wav` files. + +Key checks: +- Overall stats: end-to-end latency/TPS should be reasonable. +- Orchestrator stats: per-stage latency should be stable; investigate long tails. +- Stage logs: ensure no errors and no unusually slow stages. + + +## Performance snapshot + +The chart below summarizes our measured Qwen3-Omni MoE end-to-end benchmark, comparing vLLM-Omni against HF Transformers. It shows the overall throughput advantage for vLLM-Omni. These are actual experiment results—please refer to this performance when evaluating or reproducing the benchmark. 
+ +![vLLM-Omni vs HF](./vllm-omni-vs-hf.png) + +## Directory layout +- `benchmarks/build_dataset/` — dataset prep utilities (e.g., SeedTTS top100). +- `benchmarks/<task>/vllm_omni/` — vLLM-Omni pipeline benchmarks, logs, outputs. +- Add new tasks under `benchmarks/<task>/...` with the same pattern: `transformers/`, `vllm_omni/`, task-specific README, and (optionally) dataset prep notes. +- `benchmarks/<task>/vllm-omni-vs-hf.png` — current performance snapshot (overall throughput comparison). +- `benchmarks/<task>/transformers/` — HF Transformers benchmarks (offline reference). + +## Troubleshooting +- Make sure GPU/driver/FlashAttention2 requirements are met for the chosen model. +- If downloads fail, confirm network access to Google Drive (`gdown`) and Hugging Face. +- If audio files are missing, check for errors in stage logs or model generation. diff --git a/benchmarks/qwen3-omni/transformers/eval_qwen3_moe_omni_transformers.sh b/benchmarks/qwen3-omni/transformers/eval_qwen3_moe_omni_transformers.sh new file mode 100644 index 00000000000..bae514fab28 --- /dev/null +++ b/benchmarks/qwen3-omni/transformers/eval_qwen3_moe_omni_transformers.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Qwen3-Omni Transformers Benchmark Evaluation Script +# This script must be run from the vllm-omni root directory + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Navigate to vllm-omni root directory (3 levels up from script location) +VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory"; exit 1; } + +echo "Working directory: $(pwd)" +# Verify we're in the correct directory and run benchmark +if [[ ! -f "benchmarks/qwen3-omni/transformers/qwen3_omni_moe_transformers.py" ]]; then + echo "Error: Not in vllm-omni root directory. Please run from vllm-omni folder." 
+else + cd benchmarks/qwen3-omni/transformers + + python qwen3_omni_moe_transformers.py --prompts-file ../../build_dataset/top100.txt --num-prompts 100 + + echo "Logs and outputs are saved to $(pwd)/benchmark_results:" + echo " - perf_stats.json Aggregated/per-prompt TPS and latency (thinker/talker/code2wav/overall)" + echo " - results.json Per-prompt outputs and audio paths" + echo " - audio/ Generated wav files, there should be 100 wav file generated" + echo "Key checks: overall_tps and *_tps_avg should be non-zero and stable; investigate 0/NaN or unusually low TPS/long-tail latency." +fi diff --git a/benchmarks/qwen3-omni/transformers/qwen3_omni_moe_model.py b/benchmarks/qwen3-omni/transformers/qwen3_omni_moe_model.py new file mode 100644 index 00000000000..43b56f3e995 --- /dev/null +++ b/benchmarks/qwen3-omni/transformers/qwen3_omni_moe_model.py @@ -0,0 +1,265 @@ +import time + +import torch +from transformers import Qwen3OmniMoeForConditionalGeneration + + +class Qwen3OmniMoeForConditionalGenerationWithLogging(Qwen3OmniMoeForConditionalGeneration): + @torch.no_grad() + def generate( + self, + input_ids: torch.Tensor | None = None, + speaker: str = "Ethan", + use_audio_in_video: bool = False, + return_audio: bool | None = None, + thinker_max_new_tokens: int = 1024, + thinker_eos_token_id: int = 151645, + talker_max_new_tokens: int = 4096, + talker_do_sample: bool = True, + talker_top_k: int = 50, + talker_top_p: float = 1.0, + talker_temperature: float = 0.9, + talker_repetition_penalty: float = 1.05, + **kwargs, + ): + total_t0 = time.time() + perf_stats = { + "thinker_tokens": 0, + "thinker_time_s": 0.0, + "thinker_tps": 0.0, + "talker_tokens": 0, + "talker_time_s": 0.0, + "talker_tps": 0.0, + "code2wav_tokens": 0, + "code2wav_time_s": 0.0, + "code2wav_tps": 0.0, + "total_tokens": 0, + "total_time_s": 0.0, + "total_tps": 0.0, + } + if return_audio and not self.has_talker: + raise ValueError( + "Cannot use talker when talker module not initialized. 
" + "Use `enable_talker` method or set enable_talker in config " + "to enable talker." + ) + if return_audio is None: + return_audio = self.has_talker + + shared_kwargs = {"use_audio_in_video": use_audio_in_video} + thinker_kwargs = { + "max_new_tokens": thinker_max_new_tokens, + "eos_token_id": thinker_eos_token_id, + } + + talker_kwargs = {} + token2wav_kwargs = {} + if return_audio: + speaker_id = self.config.talker_config.speaker_id.get(speaker.lower()) + if speaker_id is None: + raise NotImplementedError(f"Speaker {speaker} not implemented") + if input_ids.shape[0] != 1: + raise NotImplementedError("Qwen3-Omni currently does not support batched inference with audio output") + talker_suppressed_tokens = [ + i + for i in range( + self.config.talker_config.text_config.vocab_size - 1024, + self.config.talker_config.text_config.vocab_size, + ) + if i != self.config.talker_config.codec_eos_token_id + ] # Suppress additional special tokens, should not be predicted + talker_kwargs = { + "max_new_tokens": talker_max_new_tokens, + "do_sample": talker_do_sample, + "top_k": talker_top_k, + "top_p": talker_top_p, + "temperature": talker_temperature, + "eos_token_id": self.config.talker_config.codec_eos_token_id, + "repetition_penalty": talker_repetition_penalty, + "suppress_tokens": talker_suppressed_tokens, + "output_hidden_states": True, + "return_dict_in_generate": True, + } + token2wav_kwargs = {} + + for key, value in kwargs.items(): + if key.startswith("thinker_"): + thinker_kwargs[key[len("thinker_") :]] = value + elif key.startswith("talker_"): + talker_kwargs[key[len("talker_") :]] = value + elif key.startswith("token2wav_"): + token2wav_kwargs[key[len("token2wav_") :]] = value + # Process special input values + elif key == "feature_attention_mask": + thinker_kwargs[key] = value + talker_kwargs["audio_feature_lengths"] = torch.sum(value, dim=1) + elif key in ("input_features", "attention_mask"): + thinker_kwargs[key] = value + # Put other key to shared kwargs + 
else: + shared_kwargs[key] = value + + # Merge kwargs + for key, value in shared_kwargs.items(): + if key not in thinker_kwargs: + thinker_kwargs[key] = value + if key not in talker_kwargs and key in ["image_grid_thw", "video_grid_thw", "video_second_per_grid"]: + talker_kwargs[key] = value + if key not in token2wav_kwargs: + token2wav_kwargs[key] = value + + # 1. Generate from thinker module + generate_audio = return_audio and self.has_talker + if generate_audio: + thinker_kwargs["output_hidden_states"] = True + thinker_kwargs["return_dict_in_generate"] = True + + t0 = time.time() + thinker_result = self.thinker.generate(input_ids=input_ids, **thinker_kwargs) + t1 = time.time() + perf_stats["thinker_time_s"] = max(0.0, t1 - t0) + try: + prompt_len = int(input_ids.shape[1]) if input_ids is not None else 0 + total_len = int(thinker_result.sequences.shape[-1]) + thinker_out_len = max(0, total_len - prompt_len) + except Exception: + thinker_out_len = 0 + perf_stats["thinker_tokens"] = thinker_out_len + perf_stats["thinker_tps"] = ( + (thinker_out_len / perf_stats["thinker_time_s"]) if perf_stats["thinker_time_s"] > 0 else 0.0 + ) + + if not generate_audio: + perf_stats["total_tokens"] = perf_stats["thinker_tokens"] + perf_stats["total_time_s"] = time.time() - total_t0 + perf_stats["total_tps"] = ( + (perf_stats["total_tokens"] / perf_stats["total_time_s"]) if perf_stats["total_time_s"] > 0 else 0.0 + ) + # attach stats to self + setattr(self, "_perf_stats_last", perf_stats) + if not hasattr(self, "_perf_stats_history"): + setattr(self, "_perf_stats_history", []) + self._perf_stats_history.append(perf_stats) + return thinker_result, None + + # 2. 
Prepare talker input + thinker_embed = torch.cat([hidden_states[0] for hidden_states in thinker_result.hidden_states], dim=1).to( + self.talker.device + ) # [1 t d] + thinker_hidden = torch.cat( + [ + hidden_states[self.config.talker_config.accept_hidden_layer] + for hidden_states in thinker_result.hidden_states + ], + dim=1, + ).to(self.talker.device) # [1 t d] + + im_start_indexes = torch.cat( + ( + torch.nonzero(input_ids[0] == self.config.im_start_token_id).squeeze(), + torch.tensor([thinker_result.sequences.shape[-1]], device=input_ids.device, dtype=input_ids.dtype), + ), + dim=-1, + ).to(self.talker.device) # Shape [n_starts + 1]; Take batch 0 since batched inference is not supported here. + multimodal_mask = ( + (thinker_result.sequences == self.config.thinker_config.audio_token_id) | + (thinker_result.sequences == self.config.thinker_config.image_token_id) | + (thinker_result.sequences == self.config.thinker_config.video_token_id) + ).to(self.talker.device) # [1 t] # fmt: skip + + talker_special_tokens = torch.tensor( + [[self.config.tts_bos_token_id, self.config.tts_eos_token_id, self.config.tts_pad_token_id]], + device=self.thinker.device, + dtype=input_ids.dtype, + ) + tts_bos_embed, tts_eos_embed, tts_pad_embed = ( + self.talker.text_projection(self.thinker.get_input_embeddings()(talker_special_tokens)) + .to(self.talker.device) + .chunk(3, dim=1) + ) # 3 * [1 1 d] + + talker_input_embeds = [] # [1 t d] + talker_input_ids = [] + # For every chatml parts + for i in range(len(im_start_indexes) - 1): + im_start_index = im_start_indexes[i] + segment_end_index = im_start_indexes[i + 1] + role_token = input_ids[0][im_start_index + 1] + # Talker should ignore thinker system prompt + if role_token == self.config.system_token_id: + continue + # Talker takes word embeddings for tokens and hidden state from `accept_hidden_layer` for multimodal inputs + elif role_token == self.config.user_token_id: + talker_user_part = self._get_talker_user_parts( + im_start_index, 
segment_end_index, multimodal_mask, thinker_hidden, thinker_embed + ) + talker_input_embeds.append(talker_user_part) + talker_input_ids.append(thinker_result.sequences[:, im_start_index:segment_end_index]) + # Take assistant output (for now) + elif role_token == self.config.assistant_token_id and i == len(im_start_indexes) - 2: + talker_assistant_embeds, talker_assistant_ids, trailing_text_hidden = self._get_talker_assistant_parts( + im_start_index, + segment_end_index, + speaker_id, + thinker_embed, + tts_pad_embed, + tts_bos_embed, + tts_eos_embed, + ) + talker_input_embeds.append(talker_assistant_embeds) + talker_input_ids.append(talker_assistant_ids) + # History assistant output (ignore for now) + elif role_token == self.config.assistant_token_id and i != len(im_start_indexes) - 2: + continue + else: + raise AssertionError("Expect role id after <|im_start|> (assistant, user, system)") + talker_input_embed = torch.cat([embed.to(self.talker.device) for embed in talker_input_embeds], dim=1) + talker_input_id = torch.cat([embed.to(self.talker.device) for embed in talker_input_ids], dim=1) + t2 = time.time() + talker_result = self.talker.generate( + inputs_embeds=talker_input_embed, + trailing_text_hidden=trailing_text_hidden, + tts_pad_embed=tts_pad_embed, + talker_input_ids=talker_input_id, # Not use input_ids to prevent repetition penalty out of bound + **talker_kwargs, + ) + t3 = time.time() + perf_stats["talker_time_s"] = max(0.0, t3 - t2) + talker_codes = ( + torch.stack([hid[-1] for hid in talker_result.hidden_states if hid[-1] is not None], dim=1) + .transpose(1, 2) + .to(self.code2wav.device) + ) + try: + # codes shape: (B, num_quantizers, T). We log T as token length. 
+ perf_stats["talker_tokens"] = int(talker_codes.shape[-1]) + except Exception: + perf_stats["talker_tokens"] = 0 + perf_stats["talker_tps"] = ( + (perf_stats["talker_tokens"] / perf_stats["talker_time_s"]) if perf_stats["talker_time_s"] > 0 else 0.0 + ) + t4 = time.time() + talker_wavs = self.code2wav.chunked_decode(talker_codes, chunk_size=300, left_context_size=25).float() + t5 = time.time() + perf_stats["code2wav_time_s"] = max(0.0, t5 - t4) + perf_stats["code2wav_tokens"] = perf_stats["talker_tokens"] # same T, not times 16 + perf_stats["code2wav_tps"] = ( + (perf_stats["code2wav_tokens"] / perf_stats["code2wav_time_s"]) + if perf_stats["code2wav_time_s"] > 0 + else 0.0 + ) + perf_stats["total_tokens"] = perf_stats["thinker_tokens"] + perf_stats["talker_tokens"] + perf_stats["total_time_s"] = time.time() - total_t0 + perf_stats["total_tps"] = ( + (perf_stats["total_tokens"] / perf_stats["total_time_s"]) if perf_stats["total_time_s"] > 0 else 0.0 + ) + setattr(self, "_perf_stats_last", perf_stats) + if not hasattr(self, "_perf_stats_history"): + setattr(self, "_perf_stats_history", []) + self._perf_stats_history.append(perf_stats) + return thinker_result, talker_wavs.float() + + +__all__ = [ + "Qwen3OmniMoeForConditionalGenerationWithLogging", +] diff --git a/benchmarks/qwen3-omni/transformers/qwen3_omni_moe_transformers.py b/benchmarks/qwen3-omni/transformers/qwen3_omni_moe_transformers.py new file mode 100644 index 00000000000..87d3de797b8 --- /dev/null +++ b/benchmarks/qwen3-omni/transformers/qwen3_omni_moe_transformers.py @@ -0,0 +1,275 @@ +import argparse +import json +import os + +import soundfile as sf +from qwen3_omni_moe_model import Qwen3OmniMoeForConditionalGenerationWithLogging +from qwen_omni_utils import process_mm_info +from tqdm import tqdm +from transformers import Qwen3OmniMoeProcessor + +MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct" +# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking" + + +def load_prompts(prompts_file: str) -> list[str]: + 
"""Load prompts from a text file, one prompt per line.""" + prompts = [] + with open(prompts_file, encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + prompts.append(line) + return prompts + + +def run_benchmark( + model, + processor, + prompts: list[str], + output_dir: str = "benchmark_results", + speaker: str = "Ethan", + use_audio_in_video: bool = True, +): + """ + Run benchmark on a list of prompts and collect performance stats. + + Args: + model: The Qwen3OmniMoe model + processor: The Qwen3OmniMoe processor + prompts: List of text prompts to process + output_dir: Directory to save results + speaker: Speaker voice for audio output + use_audio_in_video: Whether to use audio in video + + Returns: + tuple: (aggregated_stats, results, audio_outputs) + - aggregated_stats: dict with aggregated performance statistics + - results: list of dicts with per-prompt results + - audio_outputs: list of audio tensors/arrays (or None if no audio) + """ + os.makedirs(output_dir, exist_ok=True) + audio_dir = os.path.join(output_dir, "audio") + os.makedirs(audio_dir, exist_ok=True) + + all_stats = [] + results = [] + audio_outputs = [] + + for idx, prompt in enumerate(tqdm(prompts, desc="Processing prompts")): + conversation = [ + { + "role": "user", + "content": [{"type": "text", "text": prompt}], + }, + ] + + # Preparation for inference + text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio_in_video) + inputs = processor( + text=text, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=use_audio_in_video, + ) + inputs = inputs.to(model.device).to(model.dtype) + + # Inference: Generation of the output text and audio + text_ids, audio = model.generate( + **inputs, speaker=speaker, thinker_return_dict_in_generate=True, use_audio_in_video=use_audio_in_video + ) + + # Decode 
output text + output_text = processor.batch_decode( + text_ids.sequences[:, inputs["input_ids"].shape[1] :], + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + )[0] + + # Collect performance stats + perf_stats = None + if hasattr(model, "_perf_stats_last"): + perf_stats = model._perf_stats_last.copy() + perf_stats["prompt_idx"] = idx + perf_stats["prompt"] = prompt + all_stats.append(perf_stats) + + # Save audio and collect audio output + audio_path = None + audio_data = None + if audio is not None: + audio_data = audio.reshape(-1).detach().cpu().numpy() + audio_path = os.path.join(audio_dir, f"output_{idx:04d}.wav") + sf.write( + audio_path, + audio_data, + samplerate=24000, + ) + audio_outputs.append(audio_data) + else: + audio_outputs.append(None) + + # Save result + result = { + "idx": idx, + "prompt": prompt, + "output": output_text, + "audio_path": audio_path, + "perf_stats": perf_stats, + } + results.append(result) + + # Aggregate statistics + aggregated_stats = aggregate_stats(all_stats) + + # Save all results + results_path = os.path.join(output_dir, "results.json") + with open(results_path, "w", encoding="utf-8") as f: + json.dump(results, f, ensure_ascii=False, indent=2) + + # Save aggregated stats + stats_path = os.path.join(output_dir, "perf_stats.json") + with open(stats_path, "w", encoding="utf-8") as f: + json.dump({"aggregated": aggregated_stats, "per_prompt": all_stats}, f, ensure_ascii=False, indent=2) + + # Count saved audio files + num_audio_saved = sum(1 for a in audio_outputs if a is not None) + print(f"\nSaved {num_audio_saved} audio files to {audio_dir}/") + + return aggregated_stats, results, audio_outputs + + +def aggregate_stats(all_stats: list[dict]) -> dict: + """Aggregate performance statistics from multiple runs.""" + if not all_stats: + return {} + + keys = [ + "thinker_tokens", + "thinker_time_s", + "thinker_tps", + "talker_tokens", + "talker_time_s", + "talker_tps", + "code2wav_tokens", + "code2wav_time_s", + 
"code2wav_tps", + "total_tokens", + "total_time_s", + "total_tps", + ] + + aggregated = { + "num_samples": len(all_stats), + } + + for key in keys: + values = [s.get(key, 0) for s in all_stats if key in s] + if values: + aggregated[f"{key}_sum"] = sum(values) + aggregated[f"{key}_avg"] = sum(values) / len(values) + aggregated[f"{key}_min"] = min(values) + aggregated[f"{key}_max"] = max(values) + + # Calculate overall throughput + total_tokens = aggregated.get("total_tokens_sum", 0) + total_time = aggregated.get("total_time_s_sum", 0) + if total_time > 0: + aggregated["overall_tps"] = total_tokens / total_time + + return aggregated + + +def print_stats(stats: dict): + """Print performance statistics in a formatted way.""" + print("\n" + "=" * 60) + print("Performance Statistics Summary") + print("=" * 60) + + print(f"\nNumber of samples: {stats.get('num_samples', 0)}") + + print("\n--- Thinker ---") + print(f" Total tokens: {stats.get('thinker_tokens_sum', 0):.0f}") + print(f" Total time: {stats.get('thinker_time_s_sum', 0):.2f}s") + print(f" Avg TPS: {stats.get('thinker_tps_avg', 0):.2f}") + print(f" Min TPS: {stats.get('thinker_tps_min', 0):.2f}") + print(f" Max TPS: {stats.get('thinker_tps_max', 0):.2f}") + + print("\n--- Talker ---") + print(f" Total tokens: {stats.get('talker_tokens_sum', 0):.0f}") + print(f" Total time: {stats.get('talker_time_s_sum', 0):.2f}s") + print(f" Avg TPS: {stats.get('talker_tps_avg', 0):.2f}") + print(f" Min TPS: {stats.get('talker_tps_min', 0):.2f}") + print(f" Max TPS: {stats.get('talker_tps_max', 0):.2f}") + + print("\n--- Code2Wav ---") + print(f" Total tokens: {stats.get('code2wav_tokens_sum', 0):.0f}") + print(f" Total time: {stats.get('code2wav_time_s_sum', 0):.2f}s") + print(f" Avg TPS: {stats.get('code2wav_tps_avg', 0):.2f}") + print(f" Min TPS: {stats.get('code2wav_tps_min', 0):.2f}") + print(f" Max TPS: {stats.get('code2wav_tps_max', 0):.2f}") + + print("\n--- Overall ---") + print(f" Total tokens: 
{stats.get('total_tokens_sum', 0):.0f}") + print(f" Total time: {stats.get('total_time_s_sum', 0):.2f}s") + print(f" Overall TPS: {stats.get('overall_tps', 0):.2f}") + print(f" Avg TPS: {stats.get('total_tps_avg', 0):.2f}") + print(f" Min TPS: {stats.get('total_tps_min', 0):.2f}") + print(f" Max TPS: {stats.get('total_tps_max', 0):.2f}") + + print("=" * 60 + "\n") + + +def main(): + parser = argparse.ArgumentParser(description="Qwen3-Omni Benchmark Script") + parser.add_argument( + "--prompts-file", + type=str, + default="benchmark/build_dataset/top100.txt", + help="Path to the prompts file (one prompt per line)", + ) + parser.add_argument( + "--output-dir", type=str, default="benchmark_results", help="Directory to save benchmark results" + ) + parser.add_argument("--model-path", type=str, default=MODEL_PATH, help="Path to the model") + parser.add_argument("--speaker", type=str, default="Ethan", help="Speaker voice for audio output") + parser.add_argument("--num-prompts", type=int, default=None, help="Number of prompts to process (default: all)") + args = parser.parse_args() + + # Load model and processor + print(f"Loading model from {args.model_path}...") + model = Qwen3OmniMoeForConditionalGenerationWithLogging.from_pretrained( + args.model_path, + dtype="auto", + device_map="auto", + attn_implementation="flash_attention_2", + ) + processor = Qwen3OmniMoeProcessor.from_pretrained(args.model_path) + + # Benchmark mode + print(f"Loading prompts from {args.prompts_file}...") + prompts = load_prompts(args.prompts_file) + + if args.num_prompts: + prompts = prompts[: args.num_prompts] + + print(f"Running benchmark on {len(prompts)} prompts...") + + aggregated_stats, results, audio_outputs = run_benchmark( + model=model, + processor=processor, + prompts=prompts, + output_dir=args.output_dir, + speaker=args.speaker, + ) + + print_stats(aggregated_stats) + print(f"\nResults saved to {args.output_dir}/") + + +if __name__ == "__main__": + main() diff --git 
a/benchmarks/qwen3-omni/vllm-omni-vs-hf.png b/benchmarks/qwen3-omni/vllm-omni-vs-hf.png new file mode 100644 index 00000000000..e47079335be Binary files /dev/null and b/benchmarks/qwen3-omni/vllm-omni-vs-hf.png differ diff --git a/benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh b/benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh new file mode 100644 index 00000000000..e4c83e97510 --- /dev/null +++ b/benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Qwen3-Omni Benchmark Evaluation Script +# This script changes to the vllm-omni root directory itself, so it can be invoked from anywhere + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Navigate to vllm-omni root directory (3 levels up from script location) +VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory"; exit 1; } + +echo "Working directory: $(pwd)" + +# Verify we're in the correct directory and run benchmark +if [[ ! -d "benchmarks/qwen3-omni/vllm_omni" ]]; then + echo "Error: Not in vllm-omni root directory. Please run from vllm-omni folder."
+else + log_dir=benchmarks/qwen3-omni/vllm_omni/logs + outputs_dir=benchmarks/qwen3-omni/vllm_omni/outputs + end2end_script_path=examples/offline_inference/qwen3_omni/end2end.py + build_dataset_path=benchmarks/build_dataset/top100.txt + + python $end2end_script_path --output-wav $outputs_dir \ + --query-type text \ + --txt-prompts $build_dataset_path \ + --log-stats \ + --log-dir $log_dir + echo "Logs and outputs are saved in ${log_dir} and ${outputs_dir} respectively:" + echo " - omni_pipeline_text run dir/base name" + echo " - omni_pipeline_text.orchestrator.stats.jsonl orchestrator-stage latency stats" + echo " - omni_pipeline_text.overall.stats.jsonl overall latency/TPS stats" + echo " - omni_pipeline_text.stage0.log per-stage detailed logs" + echo " - omni_pipeline_text.stage1.log" + echo " - omni_pipeline_text.stage2.log" + echo "Key checks: overall.stats.jsonl for end-to-end latency/TPS; orchestrator.stats.jsonl for stable per-stage latency; stage*.log for errors or long tails." + echo " - outputs/ Generated txt and wav files, there should be 100 text and wav files generated respectively" +fi diff --git a/benchmarks/qwen3-tts/README.md b/benchmarks/qwen3-tts/README.md new file mode 100644 index 00000000000..9c01f29aa9f --- /dev/null +++ b/benchmarks/qwen3-tts/README.md @@ -0,0 +1,103 @@ +# Qwen3-TTS Benchmark + +Benchmarks for Qwen3-TTS text-to-speech models, comparing vLLM-Omni streaming serving against HuggingFace Transformers offline inference. + +## Prerequisites + +```bash +pip install matplotlib aiohttp soundfile numpy tqdm +pip install qwen_tts # for HF baseline +``` + +## Quick Start + +Run the full benchmark (vllm-omni + HF baseline) with a single command: + +```bash +cd benchmarks/qwen3-tts +bash run_benchmark.sh +``` + +Results (JSON + PNG plots) are saved to `results/`. 
+ +### Common options + +```bash +# Only vllm-omni (skip HF baseline) +bash run_benchmark.sh --async-only + +# Only HF baseline +bash run_benchmark.sh --hf-only + +# Use a different model (e.g. 1.7B) +MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only + +# Use a Voice Clone model +MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only + +# Use bs16 config for higher throughput +STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs16.yaml bash run_benchmark.sh --async-only + +# Custom GPU, prompt count, concurrency levels +GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh +``` + +## Manual Steps + +### 1) Start the vLLM-Omni server + +```bash +CUDA_VISIBLE_DEVICES=0 python -m vllm_omni.entrypoints.cli.main serve \ + "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ + --omni --host 127.0.0.1 --port 8000 \ + --stage-configs-path benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml \ + --trust-remote-code +``` + +### 2) Run online serving benchmark + +```bash +python benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py \ + --port 8000 \ + --num-prompts 50 \ + --max-concurrency 1 4 10 \ + --config-name "async_chunk" \ + --result-dir results/ +``` + +### 3) Run HuggingFace baseline + +```bash +python benchmarks/qwen3-tts/transformers/bench_tts_hf.py \ + --model "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ + --num-prompts 50 \ + --gpu-device 0 \ + --result-dir results/ +``` + +### 4) Generate comparison plots + +```bash +python benchmarks/qwen3-tts/plot_results.py \ + --results results/bench_async_chunk_*.json results/bench_hf_transformers_*.json \ + --labels "vllm-omni" "hf_transformers" \ + --output results/comparison.png +``` + +## Stage Configs + +| Config | max_num_seqs | Description | +|--------|:------------:|-------------| +| `vllm_omni/configs/qwen3_tts_bs1.yaml` | 1 | Single-request processing (lowest latency) | +| `vllm_omni/configs/qwen3_tts_bs16.yaml` | 16 | High-throughput concurrent processing | + +All 
configs use a 2-stage pipeline (Talker -> Code2Wav) with `async_chunk` streaming enabled. The `SharedMemoryConnector` streams codec frames (25-frame chunks with 25-frame context overlap) between stages. + +The model is specified via the CLI `--model` flag (or `MODEL` env var), so the same configs work for both the 0.6B and 1.7B model variants. + +## Metrics + +- **TTFP (Time to First Audio Packet)**: Time from request to first audio chunk (streaming latency) +- **E2E (End-to-End Latency)**: Total time from request to complete audio response +- **RTF (Real-Time Factor)**: E2E latency / audio duration. RTF < 1.0 means faster-than-real-time synthesis +- **Throughput**: Total audio seconds generated per wall-clock second diff --git a/benchmarks/qwen3-tts/plot_results.py b/benchmarks/qwen3-tts/plot_results.py new file mode 100644 index 00000000000..e750101e324 --- /dev/null +++ b/benchmarks/qwen3-tts/plot_results.py @@ -0,0 +1,254 @@ +"""Plot Qwen3-TTS benchmark results. + +Generates comparison bar charts similar to the async_chunk design doc: +- TTFP (Time-to-First-Packet) across concurrency levels +- E2E latency across concurrency levels +- RTF (Real-Time Factor) across concurrency levels + +Usage: + # Compare two configs (async_chunk vs no_async_chunk): + python plot_results.py \ + --results results/bench_async_chunk_*.json results/bench_no_async_chunk_*.json \ + --labels "async_chunk" "no_async_chunk" \ + --output results/qwen3_tts_benchmark.png + + # Single config: + python plot_results.py \ + --results results/bench_async_chunk_*.json \ + --labels "async_chunk" \ + --output results/qwen3_tts_benchmark.png +""" + +import argparse +import json +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +def load_results(result_files: list[str]) -> list[list[dict]]: + """Load benchmark results from JSON files.""" + all_results = [] + for f in result_files: + with open(f) as fh: + data = json.load(fh) + all_results.append(data) + return 
all_results + + +def plot_comparison( + all_results: list[list[dict]], + labels: list[str], + output_path: str, + title_prefix: str = "Qwen3-TTS", +): + """Generate comparison bar charts.""" + n_configs = len(all_results) + + # Collect concurrency levels present in ANY config (union); gaps are filled with None below + all_concurrencies = [set(r["concurrency"] for r in results) for results in all_results] + concurrencies = sorted(set.union(*all_concurrencies)) + + # Build data arrays, using None for missing concurrency levels + ttfp_data = {label: [] for label in labels} + e2e_data = {label: [] for label in labels} + rtf_data = {label: [] for label in labels} + throughput_data = {label: [] for label in labels} + + for results, label in zip(all_results, labels): + conc_map = {r["concurrency"]: r for r in results} + for c in concurrencies: + r = conc_map.get(c) + ttfp_data[label].append(r["mean_ttfp_ms"] if r else None) + e2e_data[label].append(r["mean_e2e_ms"] if r else None) + rtf_data[label].append(r["mean_rtf"] if r else None) + throughput_data[label].append(r["audio_throughput"] if r else None) + + fig, axes = plt.subplots(2, 2, figsize=(14, 10)) + fig.suptitle(f"{title_prefix} Performance Benchmark", fontsize=16, fontweight="bold") + + x = np.arange(len(concurrencies)) + width = 0.35 if n_configs == 2 else 0.5 + if n_configs > 1: + offsets = np.linspace(-width / 2 * (n_configs - 1), width / 2 * (n_configs - 1), n_configs) + else: + offsets = [0] + + colors = ["#2196F3", "#FF5722", "#4CAF50", "#FFC107"] + + def plot_metric(ax, data_dict, ylabel, title, fmt=".1f"): + bars = [] + for i, (label, values) in enumerate(data_dict.items()): + # Replace None with 0 for plotting, but track which are missing + plot_values = [v if v is not None else 0 for v in values] + color = colors[i % len(colors)] + bar = ax.bar(x + offsets[i], plot_values, width, label=label, color=color, alpha=0.85) + bars.append(bar) + # Add value labels on bars (skip None/missing data) + max_val = max((v for v in values
if v is not None), default=1) + for rect, val in zip(bar, values): + if val is not None and val > 0: + ax.text( + rect.get_x() + rect.get_width() / 2, + rect.get_height() + max_val * 0.02, + f"{val:{fmt}}", + ha="center", + va="bottom", + fontsize=9, + fontweight="bold", + ) + ax.set_xlabel("Concurrency", fontsize=12) + ax.set_ylabel(ylabel, fontsize=12) + ax.set_title(title, fontsize=13, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.legend(fontsize=10) + ax.grid(axis="y", alpha=0.3) + ax.set_axisbelow(True) + + plot_metric(axes[0, 0], ttfp_data, "TTFP (ms)", "Time to First Audio Packet (TTFP)") + plot_metric(axes[0, 1], e2e_data, "E2E Latency (ms)", "End-to-End Latency (E2E)") + plot_metric(axes[1, 0], rtf_data, "RTF", "Real-Time Factor (RTF)", fmt=".3f") + plot_metric(axes[1, 1], throughput_data, "Audio-sec / Wall-sec", "Audio Throughput", fmt=".2f") + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Plot saved to {output_path}") + plt.close() + + +def plot_single_summary(results: list[dict], label: str, output_path: str): + """Generate a single-config summary with percentile breakdown.""" + concurrencies = [r["concurrency"] for r in results] + + fig, axes = plt.subplots(1, 3, figsize=(16, 5)) + fig.suptitle(f"Qwen3-TTS Benchmark - {label}", fontsize=15, fontweight="bold") + + # TTFP breakdown + ax = axes[0] + means = [r["mean_ttfp_ms"] for r in results] + medians = [r["median_ttfp_ms"] for r in results] + p90s = [r["p90_ttfp_ms"] for r in results] + p99s = [r["p99_ttfp_ms"] for r in results] + x = np.arange(len(concurrencies)) + w = 0.2 + ax.bar(x - 1.5 * w, means, w, label="mean", color="#2196F3") + ax.bar(x - 0.5 * w, medians, w, label="median", color="#4CAF50") + ax.bar(x + 0.5 * w, p90s, w, label="p90", color="#FF9800") + ax.bar(x + 1.5 * w, p99s, w, label="p99", color="#F44336") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + 
ax.set_xlabel("Concurrency") + ax.set_ylabel("TTFP (ms)") + ax.set_title("Time to First Audio Packet") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + # E2E breakdown + ax = axes[1] + means = [r["mean_e2e_ms"] for r in results] + medians = [r["median_e2e_ms"] for r in results] + p90s = [r["p90_e2e_ms"] for r in results] + p99s = [r["p99_e2e_ms"] for r in results] + ax.bar(x - 1.5 * w, means, w, label="mean", color="#2196F3") + ax.bar(x - 0.5 * w, medians, w, label="median", color="#4CAF50") + ax.bar(x + 0.5 * w, p90s, w, label="p90", color="#FF9800") + ax.bar(x + 1.5 * w, p99s, w, label="p99", color="#F44336") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_xlabel("Concurrency") + ax.set_ylabel("E2E Latency (ms)") + ax.set_title("End-to-End Latency") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + # RTF + ax = axes[2] + means = [r["mean_rtf"] for r in results] + medians = [r["median_rtf"] for r in results] + ax.bar(x - 0.15, means, 0.3, label="mean", color="#2196F3") + ax.bar(x + 0.15, medians, 0.3, label="median", color="#4CAF50") + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_xlabel("Concurrency") + ax.set_ylabel("RTF") + ax.set_title("Real-Time Factor") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3) + + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Plot saved to {output_path}") + plt.close() + + +def print_comparison_table(all_results: list[list[dict]], labels: list[str]): + """Print a markdown-formatted comparison table.""" + concurrencies = sorted(set(r["concurrency"] for r in all_results[0])) + + print("\n## Benchmark Results\n") + header = "| Metric | Concurrency |" + sep = "| --- | --- |" + for label in labels: + header += f" {label} |" + sep += " --- |" + print(header) + print(sep) + + for metric, key, fmt in [ + ("TTFP (ms)", "mean_ttfp_ms", ".1f"), + ("E2E (ms)", "mean_e2e_ms", ".1f"), + ("RTF", "mean_rtf", ".3f"), + 
("Throughput (audio-s/s)", "audio_throughput", ".2f"), + ]: + for c in concurrencies: + row = f"| {metric} | {c} |" + for results in all_results: + conc_map = {r["concurrency"]: r for r in results} + val = conc_map.get(c, {}).get(key, 0) + row += f" {val:{fmt}} |" + print(row) + + # Improvement calculation (only if 2 configs) + if len(all_results) == 2: + print(f"\n## Improvement ({labels[0]} vs {labels[1]})\n") + print("| Metric | Concurrency | Improvement |") + print("| --- | --- | --- |") + for metric, key in [("TTFP", "mean_ttfp_ms"), ("E2E", "mean_e2e_ms"), ("RTF", "mean_rtf")]: + for c in concurrencies: + m0 = {r["concurrency"]: r for r in all_results[0]} + m1 = {r["concurrency"]: r for r in all_results[1]} + v0 = m0.get(c, {}).get(key, 0) + v1 = m1.get(c, {}).get(key, 0) + if v1 > 0: + pct = (v1 - v0) / v1 * 100 + print(f"| {metric} | {c} | {pct:+.1f}% |") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Plot Qwen3-TTS benchmark results") + parser.add_argument( + "--results", type=str, nargs="+", required=True, help="Path(s) to result JSON files (one per config)" + ) + parser.add_argument( + "--labels", type=str, nargs="+", required=True, help="Labels for each config (must match --results count)" + ) + parser.add_argument("--output", type=str, default="results/qwen3_tts_benchmark.png", help="Output image path") + parser.add_argument("--title", type=str, default="Qwen3-TTS", help="Title prefix for the plot") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + assert len(args.results) == len(args.labels), "--results and --labels must have the same count" + + all_results = load_results(args.results) + print_comparison_table(all_results, args.labels) + + Path(args.output).parent.mkdir(parents=True, exist_ok=True) + + if len(all_results) == 1: + plot_single_summary(all_results[0], args.labels[0], args.output) + else: + plot_comparison(all_results, args.labels, args.output, title_prefix=args.title) diff --git 
a/benchmarks/qwen3-tts/results/.gitignore b/benchmarks/qwen3-tts/results/.gitignore new file mode 100644 index 00000000000..5b6759ef717 --- /dev/null +++ b/benchmarks/qwen3-tts/results/.gitignore @@ -0,0 +1,3 @@ +# Benchmark results are machine-specific - do not commit +* +!.gitignore diff --git a/benchmarks/qwen3-tts/run_benchmark.sh b/benchmarks/qwen3-tts/run_benchmark.sh new file mode 100755 index 00000000000..283b6b844c1 --- /dev/null +++ b/benchmarks/qwen3-tts/run_benchmark.sh @@ -0,0 +1,280 @@ +#!/bin/bash +# Qwen3-TTS Benchmark Runner +# +# Compares vllm-omni streaming serving vs HuggingFace transformers offline inference. +# Produces JSON results and comparison plots. +# +# Usage: +# # Full comparison (vllm-omni + HF): +# bash run_benchmark.sh +# +# # Only vllm-omni async_chunk config: +# bash run_benchmark.sh --async-only +# +# # Only HuggingFace baseline: +# bash run_benchmark.sh --hf-only +# +# # vllm-omni only (skip HF): +# bash run_benchmark.sh --skip-hf +# +# # Custom settings: +# GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh +# +# # Use 1.7B model: +# MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only +# +# # Use Voice Clone model: +# MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only +# +# # Use batch_size=4 config: +# STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs4.yaml bash run_benchmark.sh --async-only +# +# Environment variables: +# GPU_DEVICE - GPU index to use (default: 0) +# NUM_PROMPTS - Number of prompts per concurrency level (default: 50) +# CONCURRENCY - Space-separated concurrency levels (default: "1 4 10") +# MODEL - Model name (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice) +# PORT - Server port (default: 8000) +# GPU_MEM_TALKER - gpu_memory_utilization for talker stage (default: 0.3) +# GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.2) +# STAGE_CONFIG - Path to stage config YAML, relative to this script's directory (default: vllm_omni/configs/qwen3_tts_bs1.yaml) +# TASK_TYPE -
Task type: CustomVoice, VoiceDesign, Base (default: CustomVoice) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# Defaults +GPU_DEVICE="${GPU_DEVICE:-0}" +NUM_PROMPTS="${NUM_PROMPTS:-50}" +CONCURRENCY="${CONCURRENCY:-1 4 10}" +MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice}" +PORT="${PORT:-8000}" +GPU_MEM_TALKER="${GPU_MEM_TALKER:-0.3}" +GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV:-0.2}" +NUM_WARMUPS="${NUM_WARMUPS:-3}" +STAGE_CONFIG="${STAGE_CONFIG:-vllm_omni/configs/qwen3_tts_bs1.yaml}" +RESULT_DIR="${SCRIPT_DIR}/results" +TIMESTAMP="$(date +%Y%m%d_%H%M%S)" +TASK_TYPE="${TASK_TYPE:-CustomVoice}" + +# Parse args +RUN_ASYNC=true +RUN_HF=true +for arg in "$@"; do + case "$arg" in + --async-only) RUN_HF=false ;; + --hf-only) RUN_ASYNC=false ;; + --skip-hf) RUN_HF=false ;; + esac +done + +mkdir -p "${RESULT_DIR}" + +echo "============================================================" +echo " Qwen3-TTS Benchmark" +echo "============================================================" +echo " GPU: ${GPU_DEVICE}" +echo " Model: ${MODEL}" +echo " Prompts: ${NUM_PROMPTS}" +echo " Concurrency: ${CONCURRENCY}" +echo " Port: ${PORT}" +echo " Stage config: ${STAGE_CONFIG}" +echo " Results: ${RESULT_DIR}" +echo " Task type: ${TASK_TYPE}" +echo "============================================================" + +# Prepare stage config with correct GPU device and memory settings +prepare_config() { + local config_template="$1" + local config_name="$2" + local output_path="${RESULT_DIR}/${config_name}_stage_config.yaml" + + # Use sed to patch GPU device and memory utilization + sed \ + -e "s/devices: \"0\"/devices: \"${GPU_DEVICE}\"/g" \ + -e "s/gpu_memory_utilization: 0.3/gpu_memory_utilization: ${GPU_MEM_TALKER}/g" \ + -e "s/gpu_memory_utilization: 0.2/gpu_memory_utilization: ${GPU_MEM_CODE2WAV}/g" \ + "${config_template}" > "${output_path}" + + echo "${output_path}" +} + +# Start server and wait 
for it to be ready +start_server() { + local stage_config="$1" + local config_name="$2" + local log_file="${RESULT_DIR}/server_${config_name}_${TIMESTAMP}.log" + + echo "" + echo "Starting server with config: ${config_name}" + echo " Stage config: ${stage_config}" + echo " Log file: ${log_file}" + + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + CUDA_VISIBLE_DEVICES="${GPU_DEVICE}" \ + python -m vllm_omni.entrypoints.cli.main serve "${MODEL}" \ + --omni \ + --host 127.0.0.1 \ + --port "${PORT}" \ + --stage-configs-path "${stage_config}" \ + --stage-init-timeout 120 \ + --trust-remote-code \ + --disable-log-stats \ + > "${log_file}" 2>&1 & + + SERVER_PID=$! + echo " Server PID: ${SERVER_PID}" + + # Wait for server to be ready + echo " Waiting for server to be ready..." + local max_wait=300 + local waited=0 + while [ ${waited} -lt ${max_wait} ]; do + if curl -sf "http://127.0.0.1:${PORT}/v1/models" > /dev/null 2>&1; then + echo " Server is ready! (waited ${waited}s)" + return 0 + fi + # Check if process is still alive + if ! kill -0 ${SERVER_PID} 2>/dev/null; then + echo " ERROR: Server process died. Check log: ${log_file}" + tail -20 "${log_file}" + return 1 + fi + sleep 2 + waited=$((waited + 2)) + done + + echo " ERROR: Server did not start within ${max_wait}s. Check log: ${log_file}" + kill ${SERVER_PID} 2>/dev/null || true + return 1 +} + +# Stop the server +stop_server() { + if [ -n "${SERVER_PID:-}" ]; then + echo " Stopping server (PID: ${SERVER_PID})..." + kill ${SERVER_PID} 2>/dev/null || true + wait ${SERVER_PID} 2>/dev/null || true + # Kill any remaining child processes on the port + local pids + pids=$(lsof -ti:${PORT} 2>/dev/null || true) + if [ -n "${pids}" ]; then + echo " Cleaning up remaining processes on port ${PORT}..." + echo "${pids}" | xargs kill -9 2>/dev/null || true + fi + echo " Server stopped." 
+ SERVER_PID="" + fi +} + +# Cleanup on exit +trap 'stop_server' EXIT + +# Run benchmark for a given config +run_bench() { + local config_name="$1" + local config_template="$2" + + echo "" + echo "============================================================" + echo " Benchmarking: ${config_name}" + echo "============================================================" + + local stage_config + stage_config=$(prepare_config "${config_template}" "${config_name}") + + start_server "${stage_config}" "${config_name}" + + # Convert concurrency string to args + local conc_args="" + for c in ${CONCURRENCY}; do + conc_args="${conc_args} ${c}" + done + + cd "${PROJECT_ROOT}" + python "${SCRIPT_DIR}/vllm_omni/bench_tts_serve.py" \ + --host 127.0.0.1 \ + --port "${PORT}" \ + --num-prompts "${NUM_PROMPTS}" \ + --max-concurrency ${conc_args} \ + --num-warmups "${NUM_WARMUPS}" \ + --config-name "${config_name}" \ + --result-dir "${RESULT_DIR}" \ + --task-type "${TASK_TYPE}" + + stop_server + + # Allow GPU memory to settle + sleep 5 +} + +# Run vllm-omni benchmark +if [ "${RUN_ASYNC}" = true ]; then + run_bench "async_chunk" "${SCRIPT_DIR}/${STAGE_CONFIG}" +fi + +# Run HuggingFace baseline benchmark +if [ "${RUN_HF}" = true ]; then + echo "" + echo "============================================================" + echo " Benchmarking: HuggingFace transformers (offline)" + echo "============================================================" + + cd "${PROJECT_ROOT}" + python "${SCRIPT_DIR}/transformers/bench_tts_hf.py" \ + --model "${MODEL}" \ + --num-prompts "${NUM_PROMPTS}" \ + --num-warmups "${NUM_WARMUPS}" \ + --gpu-device "${GPU_DEVICE}" \ + --config-name "hf_transformers" \ + --result-dir "${RESULT_DIR}" \ + --task-type "${TASK_TYPE}" + + # Allow GPU memory to settle + sleep 5 +fi + +# Plot results +echo "" +echo "============================================================" +echo " Generating plots..." 
+echo "============================================================" + +RESULT_FILES="" +LABELS="" + +if [ "${RUN_ASYNC}" = true ]; then + ASYNC_FILE=$(ls -t "${RESULT_DIR}"/bench_async_chunk_*.json 2>/dev/null | head -1) + if [ -n "${ASYNC_FILE}" ]; then + RESULT_FILES="${ASYNC_FILE}" + LABELS="async_chunk" + fi +fi + +if [ "${RUN_HF}" = true ]; then + HF_FILE=$(ls -t "${RESULT_DIR}"/bench_hf_transformers_*.json 2>/dev/null | head -1) + if [ -n "${HF_FILE}" ]; then + if [ -n "${RESULT_FILES}" ]; then + RESULT_FILES="${RESULT_FILES} ${HF_FILE}" + LABELS="${LABELS} hf_transformers" + else + RESULT_FILES="${HF_FILE}" + LABELS="hf_transformers" + fi + fi +fi + +if [ -n "${RESULT_FILES}" ]; then + python "${SCRIPT_DIR}/plot_results.py" \ + --results ${RESULT_FILES} \ + --labels ${LABELS} \ + --output "${RESULT_DIR}/qwen3_tts_benchmark_${TIMESTAMP}.png" +fi + +echo "" +echo "============================================================" +echo " Benchmark complete!" +echo " Results: ${RESULT_DIR}" +echo "============================================================" diff --git a/benchmarks/qwen3-tts/transformers/bench_tts_hf.py b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py new file mode 100644 index 00000000000..ed04ee264c4 --- /dev/null +++ b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py @@ -0,0 +1,301 @@ +"""Benchmark Qwen3-TTS using HuggingFace transformers (qwen_tts library). + +Measures E2E latency, RTF, and audio duration for offline (non-serving) inference. +Results are saved in the same JSON format as bench_tts_serve.py for unified plotting. 
+ +Usage: + python bench_tts_hf.py \ + --model Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice \ + --num-prompts 50 \ + --num-warmups 3 \ + --gpu-device 0 \ + --result-dir results/ +""" + +import argparse +import json +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +import numpy as np +import soundfile as sf +import torch + +PROMPTS = [ + "Hello, welcome to the voice synthesis benchmark test.", + "She said she would be here by noon, but nobody showed up.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "I can't believe how beautiful the sunset looks from up here on the mountain.", + "Please remember to bring your identification documents to the appointment tomorrow morning.", + "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", + "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", + "After the meeting, we should discuss the quarterly results and plan for the next phase.", + "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", + "The train leaves at half past seven, so we need to arrive at the station before then.", + "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", + "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", +] + +REF_AUDIO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav" +REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." +INSTRUCT = "Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice." 
+ + +@dataclass +class BenchmarkResult: + config_name: str = "" + concurrency: int = 1 # always 1 for offline + num_prompts: int = 0 + completed: int = 0 + failed: int = 0 + duration_s: float = 0.0 + # TTFP stats - not applicable for HF offline, set to E2E for compatibility + mean_ttfp_ms: float = 0.0 + median_ttfp_ms: float = 0.0 + std_ttfp_ms: float = 0.0 + p90_ttfp_ms: float = 0.0 + p95_ttfp_ms: float = 0.0 + p99_ttfp_ms: float = 0.0 + # E2E stats (ms) + mean_e2e_ms: float = 0.0 + median_e2e_ms: float = 0.0 + std_e2e_ms: float = 0.0 + p90_e2e_ms: float = 0.0 + p95_e2e_ms: float = 0.0 + p99_e2e_ms: float = 0.0 + # RTF stats + mean_rtf: float = 0.0 + median_rtf: float = 0.0 + std_rtf: float = 0.0 + p99_rtf: float = 0.0 + # Audio stats + mean_audio_duration_s: float = 0.0 + total_audio_duration_s: float = 0.0 + audio_throughput: float = 0.0 + request_throughput: float = 0.0 + # Per-request details + per_request: list = field(default_factory=list) + + +def generate_audio(model, prompt: str, args): + if args.task_type == "Base": + return model.generate_voice_clone( + text=prompt, + language=args.language, + ref_audio=REF_AUDIO, + ref_text=REF_TEXT, + ) + + if args.task_type == "VoiceDesign": + return model.generate_voice_design( + text=prompt, + language=args.language, + instruct=INSTRUCT, + ) + + return model.generate_custom_voice( + text=prompt, + language=args.language, + speaker=args.voice, + ) + + +def run_benchmark(args): + from qwen_tts import Qwen3TTSModel + + device = f"cuda:{args.gpu_device}" + print(f"Loading model: {args.model} on {device}") + model = Qwen3TTSModel.from_pretrained( + args.model, + device_map=device, + dtype=torch.bfloat16, + ) + print("Model loaded.") + + # Build prompt list + prompts = [PROMPTS[i % len(PROMPTS)] for i in range(args.num_prompts)] + + # Warmup + if args.num_warmups > 0: + print(f"Warming up with {args.num_warmups} requests...") + for i in range(args.num_warmups): + p = PROMPTS[i % len(PROMPTS)] + wavs, sr = 
generate_audio(model, p, args) + # Sync GPU + torch.cuda.synchronize(device) + print("Warmup done.") + + # Benchmark + print(f"Running {args.num_prompts} requests sequentially...") + e2e_times = [] + rtfs = [] + audio_durations = [] + per_request = [] + failed = 0 + + audio_dir = None + if args.save_audio: + audio_dir = Path(args.result_dir) / "audio_hf" + audio_dir.mkdir(parents=True, exist_ok=True) + + total_start = time.perf_counter() + + for i, prompt in enumerate(prompts): + try: + torch.cuda.synchronize(device) + st = time.perf_counter() + + wavs, sr = generate_audio(model, prompt, args) + + torch.cuda.synchronize(device) + elapsed = time.perf_counter() - st + + # Compute audio duration + audio_samples = wavs[0] + if isinstance(audio_samples, torch.Tensor): + audio_samples = audio_samples.cpu().numpy() + audio_dur = len(audio_samples) / sr + + rtf = elapsed / audio_dur if audio_dur > 0 else 0.0 + + e2e_times.append(elapsed) + rtfs.append(rtf) + audio_durations.append(audio_dur) + per_request.append( + { + "e2e_ms": elapsed * 1000, + "ttfp_ms": elapsed * 1000, # no streaming, TTFP = E2E + "rtf": rtf, + "audio_duration_s": audio_dur, + "prompt": prompt, + } + ) + + if audio_dir: + sf.write(str(audio_dir / f"output_{i:04d}.wav"), audio_samples, sr) + + if (i + 1) % 10 == 0 or i == 0: + print( + f" [{i + 1}/{args.num_prompts}] e2e={elapsed * 1000:.0f}ms rtf={rtf:.3f} audio={audio_dur:.2f}s" + ) + + except Exception as e: + print(f" [{i + 1}/{args.num_prompts}] FAILED: {e}") + failed += 1 + + total_duration = time.perf_counter() - total_start + completed = len(e2e_times) + + # Compute stats + result = BenchmarkResult( + config_name=args.config_name, + concurrency=1, + num_prompts=args.num_prompts, + completed=completed, + failed=failed, + duration_s=total_duration, + ) + + if e2e_times: + e2e_ms = [t * 1000 for t in e2e_times] + + result.mean_e2e_ms = float(np.mean(e2e_ms)) + result.median_e2e_ms = float(np.median(e2e_ms)) + result.std_e2e_ms = 
float(np.std(e2e_ms)) + result.p90_e2e_ms = float(np.percentile(e2e_ms, 90)) + result.p95_e2e_ms = float(np.percentile(e2e_ms, 95)) + result.p99_e2e_ms = float(np.percentile(e2e_ms, 99)) + + # For HF offline, TTFP = E2E (no streaming) + result.mean_ttfp_ms = result.mean_e2e_ms + result.median_ttfp_ms = result.median_e2e_ms + result.std_ttfp_ms = result.std_e2e_ms + result.p90_ttfp_ms = result.p90_e2e_ms + result.p95_ttfp_ms = result.p95_e2e_ms + result.p99_ttfp_ms = result.p99_e2e_ms + + result.mean_rtf = float(np.mean(rtfs)) + result.median_rtf = float(np.median(rtfs)) + result.std_rtf = float(np.std(rtfs)) + result.p99_rtf = float(np.percentile(rtfs, 99)) + + result.mean_audio_duration_s = float(np.mean(audio_durations)) + result.total_audio_duration_s = float(np.sum(audio_durations)) + result.audio_throughput = result.total_audio_duration_s / total_duration + result.request_throughput = completed / total_duration + result.per_request = per_request + + # Print summary in standardized performance template + W = 50 + print("") + print(f"{'=' * W}") + print(f"{'Serving Benchmark Result':^{W}}") + print(f"{'=' * W}") + print(f"{'Successful requests:':<40}{completed:<10}") + print(f"{'Failed requests:':<40}{failed:<10}") + print(f"{'Maximum request concurrency:':<40}{1:<10}") + print(f"{'Benchmark duration (s):':<40}{total_duration:<10.2f}") + print(f"{'Request throughput (req/s):':<40}{result.request_throughput:<10.2f}") + print(f"{'-' * W}") + print(f"{'End-to-end Latency':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean E2EL (ms):':<40}{result.mean_e2e_ms:<10.2f}") + print(f"{'Median E2EL (ms):':<40}{result.median_e2e_ms:<10.2f}") + print(f"{'P99 E2EL (ms):':<40}{result.p99_e2e_ms:<10.2f}") + print(f"{'=' * W}") + print(f"{'Audio Result':^{W}}") + print(f"{'=' * W}") + print(f"{'Total audio duration generated (s):':<40}{result.total_audio_duration_s:<10.2f}") + print(f"{'Audio throughput (audio duration/s):':<40}{result.audio_throughput:<10.2f}") + print(f"{'-' * 
W}") + print(f"{'Time to First Packet':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean AUDIO_TTFP (ms):':<40}{result.mean_ttfp_ms:<10.2f}") + print(f"{'Median AUDIO_TTFP (ms):':<40}{result.median_ttfp_ms:<10.2f}") + print(f"{'P99 AUDIO_TTFP (ms):':<40}{result.p99_ttfp_ms:<10.2f}") + print(f"{'-' * W}") + print(f"{'Real Time Factor':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean AUDIO_RTF:':<40}{result.mean_rtf:<10.3f}") + print(f"{'Median AUDIO_RTF:':<40}{result.median_rtf:<10.3f}") + print(f"{'P99 AUDIO_RTF:':<40}{result.p99_rtf:<10.3f}") + print(f"{'=' * W}") + print("") + + # Save results (as a list with single concurrency=1 entry, matching serve format) + result_dir = Path(args.result_dir) + result_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" + + with open(result_file, "w") as f: + json.dump([asdict(result)], f, indent=2) + print(f"Results saved to {result_file}") + + return result + + +def parse_args(): + parser = argparse.ArgumentParser(description="Qwen3-TTS HuggingFace Benchmark") + parser.add_argument( + "--model", type=str, default="Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice", help="HuggingFace model name or path" + ) + parser.add_argument("--num-prompts", type=int, default=50) + parser.add_argument("--num-warmups", type=int, default=3) + parser.add_argument("--gpu-device", type=int, default=0) + parser.add_argument("--voice", type=str, default="Vivian") + parser.add_argument("--language", type=str, default="English") + parser.add_argument("--task-type", type=str, default="CustomVoice", choices=["CustomVoice", "VoiceDesign", "Base"]) + parser.add_argument( + "--config-name", type=str, default="hf_transformers", help="Label for this config (used in filenames)" + ) + parser.add_argument("--result-dir", type=str, default="results") + parser.add_argument("--save-audio", action="store_true", help="Save generated audio files") + return 
parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + run_benchmark(args) diff --git a/benchmarks/qwen3-tts/vllm_omni/bench_async_chunk.py b/benchmarks/qwen3-tts/vllm_omni/bench_async_chunk.py new file mode 100644 index 00000000000..3497ae82152 --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/bench_async_chunk.py @@ -0,0 +1,301 @@ +"""Benchmark comparing async_chunk on vs off for Qwen3-TTS. + +Measures TTFP (Time-to-First-Packet), E2E latency, and RTF across +concurrency levels for both async_chunk modes. Saves results as JSON. + +Usage: + # Run against a server already serving with a given config: + python bench_async_chunk.py \ + --host 127.0.0.1 --port 8000 \ + --config-name async_chunk_on \ + --num-prompts 50 \ + --max-concurrency 1 10 \ + --result-dir results/ +""" + +import argparse +import asyncio +import json +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm + +PROMPTS = [ + "Hello, welcome to the voice synthesis benchmark test.", + "She said she would be here by noon, but nobody showed up.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "I can't believe how beautiful the sunset looks from up here on the mountain.", + "Please remember to bring your identification documents to the appointment tomorrow morning.", + "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", + "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", + "After the meeting, we should discuss the quarterly results and plan for the next phase.", + "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", + "The train leaves at half past seven, so we need to arrive at the station before then.", + "Could you please turn down the music a little bit, I'm trying to concentrate on my 
work.", + "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", +] + + +@dataclass +class RequestResult: + success: bool = False + ttfp: float = 0.0 + e2e: float = 0.0 + audio_bytes: int = 0 + audio_duration: float = 0.0 + rtf: float = 0.0 + prompt: str = "" + error: str = "" + + +@dataclass +class BenchmarkResult: + config_name: str = "" + concurrency: int = 0 + num_prompts: int = 0 + completed: int = 0 + failed: int = 0 + duration_s: float = 0.0 + mean_ttfp_ms: float = 0.0 + median_ttfp_ms: float = 0.0 + std_ttfp_ms: float = 0.0 + p90_ttfp_ms: float = 0.0 + p95_ttfp_ms: float = 0.0 + p99_ttfp_ms: float = 0.0 + mean_e2e_ms: float = 0.0 + median_e2e_ms: float = 0.0 + std_e2e_ms: float = 0.0 + p90_e2e_ms: float = 0.0 + p95_e2e_ms: float = 0.0 + p99_e2e_ms: float = 0.0 + mean_rtf: float = 0.0 + median_rtf: float = 0.0 + std_rtf: float = 0.0 + mean_audio_duration_s: float = 0.0 + total_audio_duration_s: float = 0.0 + audio_throughput: float = 0.0 + request_throughput: float = 0.0 + per_request: list = field(default_factory=list) + + +def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = 24000, sample_width: int = 2) -> float: + return num_bytes / sample_width / sample_rate + + +async def send_tts_request( + session: aiohttp.ClientSession, + api_url: str, + prompt: str, + voice: str = "vivian", + language: str = "English", + stream: bool = True, + pbar: tqdm | None = None, +) -> RequestResult: + payload = { + "input": prompt, + "voice": voice, + "language": language, + "stream": stream, + "response_format": "pcm", + } + + result = RequestResult(prompt=prompt) + st = time.perf_counter() + + try: + async with session.post(api_url, json=payload) as response: + if response.status != 200: + result.error = f"HTTP {response.status}: {await response.text()}" + return result + + first_chunk = True + total_bytes = 0 + + async for chunk in response.content.iter_any(): + if first_chunk and len(chunk) > 0: + result.ttfp = 
time.perf_counter() - st + first_chunk = False + total_bytes += len(chunk) + + result.e2e = time.perf_counter() - st + result.audio_bytes = total_bytes + result.audio_duration = pcm_bytes_to_duration(total_bytes) + if result.audio_duration > 0: + result.rtf = result.e2e / result.audio_duration + result.success = True + + except Exception as e: + result.error = str(e) + result.e2e = time.perf_counter() - st + + if pbar: + pbar.update(1) + return result + + +async def run_benchmark( + host: str, + port: int, + num_prompts: int, + max_concurrency: int, + num_warmups: int = 3, + voice: str = "vivian", + language: str = "English", + stream: bool = True, +) -> BenchmarkResult: + api_url = f"http://{host}:{port}/v1/audio/speech" + + connector = aiohttp.TCPConnector(limit=max_concurrency, limit_per_host=max_concurrency, keepalive_timeout=60) + session = aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) + + if num_warmups > 0: + print(f" Warming up with {num_warmups} requests...") + warmup_tasks = [ + send_tts_request(session, api_url, PROMPTS[i % len(PROMPTS)], voice, language, stream) + for i in range(num_warmups) + ] + await asyncio.gather(*warmup_tasks) + print(" Warmup done.") + + request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] + + print(f" Running {num_prompts} requests with concurrency={max_concurrency}...") + semaphore = asyncio.Semaphore(max_concurrency) + pbar = tqdm(total=num_prompts, desc=f" concurrency={max_concurrency}") + + async def limited_request(prompt): + async with semaphore: + return await send_tts_request(session, api_url, prompt, voice, language, stream, pbar) + + start_time = time.perf_counter() + tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] + results: list[RequestResult] = await asyncio.gather(*tasks) + duration = time.perf_counter() - start_time + pbar.close() + + await session.close() + + successful = [r for r in results if r.success] + failed = [r for r in 
results if not r.success] + + bench = BenchmarkResult( + concurrency=max_concurrency, + num_prompts=num_prompts, + completed=len(successful), + failed=len(failed), + duration_s=duration, + ) + + if successful: + ttfps = [r.ttfp * 1000 for r in successful] + e2es = [r.e2e * 1000 for r in successful] + rtfs = [r.rtf for r in successful] + audio_durs = [r.audio_duration for r in successful] + + bench.mean_ttfp_ms = float(np.mean(ttfps)) + bench.median_ttfp_ms = float(np.median(ttfps)) + bench.std_ttfp_ms = float(np.std(ttfps)) + bench.p90_ttfp_ms = float(np.percentile(ttfps, 90)) + bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) + bench.p99_ttfp_ms = float(np.percentile(ttfps, 99)) + + bench.mean_e2e_ms = float(np.mean(e2es)) + bench.median_e2e_ms = float(np.median(e2es)) + bench.std_e2e_ms = float(np.std(e2es)) + bench.p90_e2e_ms = float(np.percentile(e2es, 90)) + bench.p95_e2e_ms = float(np.percentile(e2es, 95)) + bench.p99_e2e_ms = float(np.percentile(e2es, 99)) + + bench.mean_rtf = float(np.mean(rtfs)) + bench.median_rtf = float(np.median(rtfs)) + bench.std_rtf = float(np.std(rtfs)) + + bench.mean_audio_duration_s = float(np.mean(audio_durs)) + bench.total_audio_duration_s = float(np.sum(audio_durs)) + bench.audio_throughput = bench.total_audio_duration_s / duration + bench.request_throughput = len(successful) / duration + + bench.per_request = [ + { + "ttfp_ms": r.ttfp * 1000, + "e2e_ms": r.e2e * 1000, + "rtf": r.rtf, + "audio_duration_s": r.audio_duration, + "prompt": r.prompt, + } + for r in successful + ] + + print(f"\n{'=' * 60}") + print(f" Concurrency: {max_concurrency} | Completed: {bench.completed} | Failed: {bench.failed}") + print(f" Duration: {duration:.2f}s | Throughput: {bench.request_throughput:.2f} req/s") + print( + f" TTFP (ms): mean={bench.mean_ttfp_ms:.1f} median={bench.median_ttfp_ms:.1f}" + f" p90={bench.p90_ttfp_ms:.1f} p99={bench.p99_ttfp_ms:.1f}" + ) + print( + f" E2E (ms): mean={bench.mean_e2e_ms:.1f} median={bench.median_e2e_ms:.1f}" 
+ f" p90={bench.p90_e2e_ms:.1f} p99={bench.p99_e2e_ms:.1f}" + ) + print(f" RTF: mean={bench.mean_rtf:.3f} median={bench.median_rtf:.3f}") + print(f" Throughput: {bench.audio_throughput:.2f} audio-sec/wall-sec") + print(f"{'=' * 60}\n") + + if failed: + for r in failed[:3]: + print(f" [ERROR] {r.error[:200]}") + + return bench + + +async def main(args): + all_results = [] + + for concurrency in args.max_concurrency: + result = await run_benchmark( + host=args.host, + port=args.port, + num_prompts=args.num_prompts, + max_concurrency=concurrency, + num_warmups=args.num_warmups, + voice=args.voice, + language=args.language, + stream=args.stream, + ) + result.config_name = args.config_name + all_results.append(asdict(result)) + + result_dir = Path(args.result_dir) + result_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" + + with open(result_file, "w") as f: + json.dump(all_results, f, indent=2) + print(f"Results saved to {result_file}") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Qwen3-TTS async_chunk benchmark client") + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--num-prompts", type=int, default=50) + parser.add_argument("--max-concurrency", type=int, nargs="+", default=[1, 10]) + parser.add_argument("--num-warmups", type=int, default=3) + parser.add_argument("--voice", type=str, default="vivian") + parser.add_argument("--language", type=str, default="English") + parser.add_argument("--stream", action="store_true", default=True) + parser.add_argument("--no-stream", dest="stream", action="store_false") + parser.add_argument("--config-name", type=str, default="async_chunk_on") + parser.add_argument("--result-dir", type=str, default="results") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + 
asyncio.run(main(args)) diff --git a/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py new file mode 100644 index 00000000000..96b904b0174 --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py @@ -0,0 +1,371 @@ +"""Benchmark client for Qwen3-TTS via /v1/audio/speech endpoint. + +Measures TTFP (Time-to-First-Packet), E2E latency, and RTF (Real-Time Factor) +across configurable concurrency levels. Saves results as JSON for plotting. + +Usage: + python bench_tts_serve.py \ + --host 127.0.0.1 --port 8000 \ + --num-prompts 50 \ + --max-concurrency 1 4 10 \ + --result-dir results/ +""" + +import argparse +import asyncio +import json +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm + +PROMPTS = [ + "Hello, welcome to the voice synthesis benchmark test.", + "She said she would be here by noon, but nobody showed up.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "I can't believe how beautiful the sunset looks from up here on the mountain.", + "Please remember to bring your identification documents to the appointment tomorrow morning.", + "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", + "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", + "After the meeting, we should discuss the quarterly results and plan for the next phase.", + "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", + "The train leaves at half past seven, so we need to arrive at the station before then.", + "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", + "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", +] +REF_AUDIO = 
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav" +REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." +INSTRUCT = "Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice." + + +@dataclass +class RequestResult: + success: bool = False + ttfp: float = 0.0 # Time to first audio packet (seconds) + e2e: float = 0.0 # End-to-end latency (seconds) + audio_bytes: int = 0 # Total audio bytes received + audio_duration: float = 0.0 # Audio duration in seconds (estimated from PCM) + rtf: float = 0.0 # Real-time factor = e2e / audio_duration + prompt: str = "" + error: str = "" + + +@dataclass +class BenchmarkResult: + config_name: str = "" + concurrency: int = 0 + num_prompts: int = 0 + completed: int = 0 + failed: int = 0 + duration_s: float = 0.0 + # TTFP stats (ms) + mean_ttfp_ms: float = 0.0 + median_ttfp_ms: float = 0.0 + std_ttfp_ms: float = 0.0 + p90_ttfp_ms: float = 0.0 + p95_ttfp_ms: float = 0.0 + p99_ttfp_ms: float = 0.0 + # E2E stats (ms) + mean_e2e_ms: float = 0.0 + median_e2e_ms: float = 0.0 + std_e2e_ms: float = 0.0 + p90_e2e_ms: float = 0.0 + p95_e2e_ms: float = 0.0 + p99_e2e_ms: float = 0.0 + # RTF stats + mean_rtf: float = 0.0 + median_rtf: float = 0.0 + std_rtf: float = 0.0 + p99_rtf: float = 0.0 + # Audio stats + mean_audio_duration_s: float = 0.0 + total_audio_duration_s: float = 0.0 + audio_throughput: float = 0.0 # audio_duration / wall_time + request_throughput: float = 0.0 # requests / second + # Per-request details + per_request: list = field(default_factory=list) + + +def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = 24000, sample_width: int = 2) -> float: + """Convert raw PCM byte count to duration in seconds.""" + num_samples = num_bytes / sample_width + return num_samples / sample_rate + + +def create_payload( + prompt: str, task_type: str = "CustomVoice", voice: str = "vivian", language: str = "English" +) -> 
dict: + payload = { + "input": prompt, + "language": language, + "stream": True, + "response_format": "pcm", + "task_type": task_type, + } + + if task_type == "Base": + payload["ref_audio"] = REF_AUDIO + payload["ref_text"] = REF_TEXT + elif task_type == "CustomVoice": + payload["voice"] = voice + elif task_type == "VoiceDesign": + payload["instructions"] = INSTRUCT + + return payload + + +async def send_tts_request( + session: aiohttp.ClientSession, + api_url: str, + prompt: str, + task_type: str = "CustomVoice", + voice: str = "vivian", + language: str = "English", + pbar: tqdm | None = None, +) -> RequestResult: + """Send a streaming TTS request and measure latency metrics.""" + payload = create_payload(prompt, task_type, voice, language) + + result = RequestResult(prompt=prompt) + st = time.perf_counter() + + try: + async with session.post(api_url, json=payload) as response: + if response.status != 200: + result.error = f"HTTP {response.status}: {await response.text()}" + result.success = False + return result + + first_chunk = True + total_bytes = 0 + + async for chunk in response.content.iter_any(): + if first_chunk and len(chunk) > 0: + result.ttfp = time.perf_counter() - st + first_chunk = False + total_bytes += len(chunk) + + result.e2e = time.perf_counter() - st + result.audio_bytes = total_bytes + result.audio_duration = pcm_bytes_to_duration(total_bytes) + + if result.audio_duration > 0: + result.rtf = result.e2e / result.audio_duration + result.success = True + + except Exception as e: + result.error = str(e) + result.success = False + result.e2e = time.perf_counter() - st + + if pbar: + pbar.update(1) + return result + + +async def run_benchmark( + host: str, + port: int, + num_prompts: int, + max_concurrency: int, + num_warmups: int = 3, + task_type: str = "CustomVoice", + voice: str = "vivian", + language: str = "English", +) -> BenchmarkResult: + """Run benchmark at a given concurrency level.""" + api_url = f"http://{host}:{port}/v1/audio/speech" + 
+ connector = aiohttp.TCPConnector( + limit=max_concurrency, + limit_per_host=max_concurrency, + keepalive_timeout=60, + ) + session = aiohttp.ClientSession( + connector=connector, + timeout=aiohttp.ClientTimeout(total=600), + ) + + # Warmup + if num_warmups > 0: + print(f" Warming up with {num_warmups} requests...") + warmup_tasks = [] + for i in range(num_warmups): + prompt = PROMPTS[i % len(PROMPTS)] + warmup_tasks.append(send_tts_request(session, api_url, prompt, task_type, voice, language)) + await asyncio.gather(*warmup_tasks) + print(" Warmup done.") + + # Build request list + request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] + + # Run benchmark + print(f" Running {num_prompts} requests with concurrency={max_concurrency}...") + semaphore = asyncio.Semaphore(max_concurrency) + pbar = tqdm(total=num_prompts, desc=f" concurrency={max_concurrency}") + + async def limited_request(prompt): + async with semaphore: + return await send_tts_request(session, api_url, prompt, task_type, voice, language, pbar) + + start_time = time.perf_counter() + tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] + results: list[RequestResult] = await asyncio.gather(*tasks) + duration = time.perf_counter() - start_time + pbar.close() + + await session.close() + + # Compute stats + successful = [r for r in results if r.success] + failed = [r for r in results if not r.success] + + bench = BenchmarkResult( + concurrency=max_concurrency, + num_prompts=num_prompts, + completed=len(successful), + failed=len(failed), + duration_s=duration, + ) + + if successful: + ttfps = [r.ttfp * 1000 for r in successful] # convert to ms + e2es = [r.e2e * 1000 for r in successful] + rtfs = [r.rtf for r in successful] + audio_durs = [r.audio_duration for r in successful] + + bench.mean_ttfp_ms = float(np.mean(ttfps)) + bench.median_ttfp_ms = float(np.median(ttfps)) + bench.std_ttfp_ms = float(np.std(ttfps)) + bench.p90_ttfp_ms = float(np.percentile(ttfps, 90)) 
+ bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) + bench.p99_ttfp_ms = float(np.percentile(ttfps, 99)) + + bench.mean_e2e_ms = float(np.mean(e2es)) + bench.median_e2e_ms = float(np.median(e2es)) + bench.std_e2e_ms = float(np.std(e2es)) + bench.p90_e2e_ms = float(np.percentile(e2es, 90)) + bench.p95_e2e_ms = float(np.percentile(e2es, 95)) + bench.p99_e2e_ms = float(np.percentile(e2es, 99)) + + bench.mean_rtf = float(np.mean(rtfs)) + bench.median_rtf = float(np.median(rtfs)) + bench.std_rtf = float(np.std(rtfs)) + bench.p99_rtf = float(np.percentile(rtfs, 99)) + + bench.mean_audio_duration_s = float(np.mean(audio_durs)) + bench.total_audio_duration_s = float(np.sum(audio_durs)) + bench.audio_throughput = bench.total_audio_duration_s / duration + bench.request_throughput = len(successful) / duration + + bench.per_request = [ + { + "ttfp_ms": r.ttfp * 1000, + "e2e_ms": r.e2e * 1000, + "rtf": r.rtf, + "audio_duration_s": r.audio_duration, + "prompt": r.prompt, + } + for r in successful + ] + + # Print summary in standardized performance template + W = 50 + print("") + print(f"{'=' * W}") + print(f"{'Serving Benchmark Result':^{W}}") + print(f"{'=' * W}") + print(f"{'Successful requests:':<40}{bench.completed:<10}") + print(f"{'Failed requests:':<40}{bench.failed:<10}") + print(f"{'Maximum request concurrency:':<40}{max_concurrency:<10}") + print(f"{'Benchmark duration (s):':<40}{duration:<10.2f}") + print(f"{'Request throughput (req/s):':<40}{bench.request_throughput:<10.2f}") + print(f"{'-' * W}") + print(f"{'End-to-end Latency':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean E2EL (ms):':<40}{bench.mean_e2e_ms:<10.2f}") + print(f"{'Median E2EL (ms):':<40}{bench.median_e2e_ms:<10.2f}") + print(f"{'P99 E2EL (ms):':<40}{bench.p99_e2e_ms:<10.2f}") + print(f"{'=' * W}") + print(f"{'Audio Result':^{W}}") + print(f"{'=' * W}") + print(f"{'Total audio duration generated (s):':<40}{bench.total_audio_duration_s:<10.2f}") + print(f"{'Audio throughput (audio 
duration/s):':<40}{bench.audio_throughput:<10.2f}") + print(f"{'-' * W}") + print(f"{'Time to First Packet':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean AUDIO_TTFP (ms):':<40}{bench.mean_ttfp_ms:<10.2f}") + print(f"{'Median AUDIO_TTFP (ms):':<40}{bench.median_ttfp_ms:<10.2f}") + print(f"{'P99 AUDIO_TTFP (ms):':<40}{bench.p99_ttfp_ms:<10.2f}") + print(f"{'-' * W}") + print(f"{'Real Time Factor':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean AUDIO_RTF:':<40}{bench.mean_rtf:<10.3f}") + print(f"{'Median AUDIO_RTF:':<40}{bench.median_rtf:<10.3f}") + print(f"{'P99 AUDIO_RTF:':<40}{bench.p99_rtf:<10.3f}") + print(f"{'=' * W}") + print("") + + if failed: + for r in failed[:3]: + print(f" [ERROR] {r.error[:200]}") + + return bench + + +async def main(args): + all_results = [] + + for concurrency in args.max_concurrency: + result = await run_benchmark( + host=args.host, + port=args.port, + num_prompts=args.num_prompts, + max_concurrency=concurrency, + num_warmups=args.num_warmups, + task_type=args.task_type, + voice=args.voice, + language=args.language, + ) + result.config_name = args.config_name + all_results.append(asdict(result)) + + # Save results + result_dir = Path(args.result_dir) + result_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_file = result_dir / f"bench_{args.config_name}_{timestamp}.json" + + with open(result_file, "w") as f: + json.dump(all_results, f, indent=2) + print(f"Results saved to {result_file}") + + return all_results + + +def parse_args(): + parser = argparse.ArgumentParser(description="Qwen3-TTS Benchmark Client") + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--num-prompts", type=int, default=50, help="Number of prompts per concurrency level") + parser.add_argument( # noqa: E501 + "--max-concurrency", type=int, nargs="+", default=[1, 4, 10], help="Concurrency levels to test" + ) + 
parser.add_argument("--num-warmups", type=int, default=3) + parser.add_argument("--task-type", type=str, default="CustomVoice", choices=["CustomVoice", "VoiceDesign", "Base"]) + parser.add_argument("--voice", type=str, default="vivian") + parser.add_argument("--language", type=str, default="English") + parser.add_argument( + "--config-name", type=str, default="async_chunk", help="Label for this config (used in filenames)" + ) + parser.add_argument("--result-dir", type=str, default="results") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + asyncio.run(main(args)) diff --git a/vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml similarity index 64% rename from vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml rename to benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml index 87843634cb7..ca441d286dd 100644 --- a/vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml +++ b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml @@ -1,3 +1,5 @@ +# Qwen3-TTS batch_size=1 config (streaming with async_chunk) +# 2-stage pipeline: Talker -> Code2Wav async_chunk: true stage_args: - stage_id: 0 @@ -5,85 +7,87 @@ stage_args: is_comprehension: true runtime: devices: "0" - max_batch_size: 1 engine_args: - dtype: bfloat16 - model_stage: latent_generator - model_arch: VoxCPMForConditionalGeneration - # Optional persistent HF-compatible config dir for native VoxCPM models. 
- hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + max_num_seqs: 1 + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: true + enforce_eager: false trust_remote_code: true - async_scheduling: false + async_scheduling: true enable_prefix_caching: false engine_output_type: latent - gpu_memory_utilization: 0.75 + gpu_memory_utilization: 0.3 distributed_executor_backend: "mp" - max_num_batched_tokens: 4096 + max_num_batched_tokens: 512 max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae_async_chunk + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk output_connectors: to_stage_1: connector_of_shared_memory default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 + temperature: 0.9 + top_k: 50 max_tokens: 4096 seed: 42 detokenize: false - repetition_penalty: 1.0 - final_output: false + repetition_penalty: 1.05 + stop_token_ids: [2150] - stage_id: 1 stage_type: llm runtime: devices: "0" - max_batch_size: 1 engine_args: - dtype: float32 - model_stage: vae - model_arch: VoxCPMForConditionalGeneration - hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + max_num_seqs: 1 + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav worker_type: generation scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler enforce_eager: true trust_remote_code: true - async_scheduling: false + async_scheduling: true enable_prefix_caching: false engine_output_type: audio - gpu_memory_utilization: 0.1 + gpu_memory_utilization: 0.3 distributed_executor_backend: "mp" max_num_batched_tokens: 8192 - max_model_len: 4096 + max_model_len: 32768 engine_input_source: [0] - input_connectors: - from_stage_0: connector_of_shared_memory final_output: true final_output_type: 
audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 default_sampling_params: temperature: 0.0 top_p: 1.0 top_k: -1 - max_tokens: 1 + max_tokens: 65536 seed: 42 detokenize: true repetition_penalty: 1.0 runtime: enabled: true + defaults: + window_size: -1 + max_inflight: 1 connectors: connector_of_shared_memory: name: SharedMemoryConnector extra: shm_threshold_bytes: 65536 - codec_streaming: false + codec_streaming: true connector_get_sleep_s: 0.01 connector_get_max_wait_first_chunk: 3000 connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_left_context_frames: 25 edges: - from: 0 to: 1 + window_size: -1 diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml new file mode 100644 index 00000000000..2cc5cf53532 --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml @@ -0,0 +1,94 @@ +# Qwen3-TTS max_num_seqs=16 config (streaming with async_chunk) +# High-throughput concurrent request processing +# 2-stage pipeline: Talker -> Code2Wav +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + engine_args: + max_num_seqs: 16 + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + 
repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + engine_args: + max_num_seqs: 16 + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.2 + distributed_executor_backend: "mp" + max_num_batched_tokens: 16384 + max_model_len: 32768 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 16 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml new file mode 100644 index 00000000000..5de107d4976 --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml @@ -0,0 +1,94 @@ +# Qwen3-TTS batch_size=4 config (streaming with async_chunk) +# Enables concurrent request processing +# 2-stage pipeline: Talker -> Code2Wav +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + engine_args: + max_num_seqs: 4 + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration + worker_type: ar + 
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + distributed_executor_backend: "mp" + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + engine_args: + max_num_seqs: 4 + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.2 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 32768 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 4 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git 
a/benchmarks/qwen3-tts/vllm_omni/plot_async_chunk.py b/benchmarks/qwen3-tts/vllm_omni/plot_async_chunk.py new file mode 100644 index 00000000000..dd03d9626d9 --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/plot_async_chunk.py @@ -0,0 +1,249 @@ +"""Plot TTFP comparison: async_chunk off vs on. + +Generates a bar chart with improvement arrows, matching the Qwen3-Omni +async_chunk benchmark figure style. + +Usage: + python plot_async_chunk.py \ + --off results/bench_async_chunk_off_*.json \ + --on results/bench_async_chunk_on_*.json \ + --output results/qwen3_tts_async_chunk_ttfp.png + + # Also supports E2E and RTF metrics: + python plot_async_chunk.py \ + --off results/bench_async_chunk_off_*.json \ + --on results/bench_async_chunk_on_*.json \ + --metric e2e \ + --output results/qwen3_tts_async_chunk_e2e.png +""" + +import argparse +import json +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + +METRIC_CONFIG = { + "ttfp": { + "key": "mean_ttfp_ms", + "ylabel": "TTFP (s)", + "title": "TTFP (Time to First Audio Packet) - Qwen3-TTS, by concurrency", + "to_seconds": True, + }, + "e2e": { + "key": "mean_e2e_ms", + "ylabel": "E2E (s)", + "title": "E2E Latency - Qwen3-TTS, by concurrency", + "to_seconds": True, + }, + "rtf": { + "key": "mean_rtf", + "ylabel": "RTF", + "title": "Real-Time Factor - Qwen3-TTS, by concurrency", + "to_seconds": False, + }, +} + + +def load_results(path: str) -> list[dict]: + with open(path) as f: + return json.load(f) + + +def plot_ttfp_comparison( + off_results: list[dict], + on_results: list[dict], + metric: str, + output_path: str, + title_override: str | None = None, +): + cfg = METRIC_CONFIG[metric] + key = cfg["key"] + to_seconds = cfg["to_seconds"] + + off_map = {r["concurrency"]: r for r in off_results} + on_map = {r["concurrency"]: r for r in on_results} + concurrencies = sorted(set(off_map.keys()) & set(on_map.keys())) + + off_vals = [] + on_vals = [] + for c in concurrencies: + v_off = 
off_map[c][key] + v_on = on_map[c][key] + if to_seconds: + v_off /= 1000.0 + v_on /= 1000.0 + off_vals.append(v_off) + on_vals.append(v_on) + + fig, ax = plt.subplots(figsize=(8, 6)) + + x = np.arange(len(concurrencies)) + width = 0.3 + + ax.bar(x - width / 2, off_vals, width, label="async_chunk off", color="#87CEEB", edgecolor="none") + ax.bar(x + width / 2, on_vals, width, label="async_chunk on", color="#FFF8DC", edgecolor="#DDD8B8") + + # Draw improvement arrows and labels + for i in range(len(concurrencies)): + v_off = off_vals[i] + v_on = on_vals[i] + if v_on > 0: + improvement = v_off / v_on + else: + improvement = float("inf") + + # Arrow from top of off-bar to top of on-bar + arrow_start_x = x[i] - width / 2 + arrow_start_y = v_off * 0.95 + arrow_end_x = x[i] + width / 2 + arrow_end_y = v_on * 1.05 + + ax.annotate( + "", + xy=(arrow_end_x, arrow_end_y), + xytext=(arrow_start_x, arrow_start_y), + arrowprops=dict(arrowstyle="->", color="red", lw=1.5), + ) + + # Improvement label + label_x = (arrow_start_x + arrow_end_x) / 2 + label_y = arrow_start_y + (v_off - v_on) * 0.15 + ax.text( + label_x, + label_y, + f"{improvement:.1f}x improvement", + ha="center", + va="bottom", + fontsize=10, + color="red", + fontweight="bold", + ) + + title = title_override or cfg["title"] + ax.set_title(title, fontsize=13, fontweight="bold") + ax.set_ylabel(cfg["ylabel"], fontsize=12) + ax.set_xlabel("Max concurrency", fontsize=12) + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + ax.set_yscale("log") + ax.legend(loc="upper left", fontsize=11) + ax.grid(axis="y", alpha=0.3, linestyle="--") + ax.set_axisbelow(True) + + plt.tight_layout() + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Plot saved to {output_path}") + plt.close() + + +def plot_all_metrics(off_results: list[dict], on_results: list[dict], output_path: str): + """Generate a 1x3 subplot with TTFP, E2E, and RTF 
comparisons.""" + off_map = {r["concurrency"]: r for r in off_results} + on_map = {r["concurrency"]: r for r in on_results} + concurrencies = sorted(set(off_map.keys()) & set(on_map.keys())) + + fig, axes = plt.subplots(1, 3, figsize=(18, 6)) + fig.suptitle("Qwen3-TTS: async_chunk on vs off", fontsize=15, fontweight="bold") + + for ax, metric in zip(axes, ["ttfp", "e2e", "rtf"]): + cfg = METRIC_CONFIG[metric] + key = cfg["key"] + to_seconds = cfg["to_seconds"] + + off_vals = [] + on_vals = [] + for c in concurrencies: + v_off = off_map[c][key] + v_on = on_map[c][key] + if to_seconds: + v_off /= 1000.0 + v_on /= 1000.0 + off_vals.append(v_off) + on_vals.append(v_on) + + x = np.arange(len(concurrencies)) + width = 0.3 + ax.bar(x - width / 2, off_vals, width, label="async_chunk off", color="#87CEEB") + ax.bar(x + width / 2, on_vals, width, label="async_chunk on", color="#FFF8DC", edgecolor="#DDD8B8") + + for i in range(len(concurrencies)): + if on_vals[i] > 0: + improvement = off_vals[i] / on_vals[i] + ax.annotate( + "", + xy=(x[i] + width / 2, on_vals[i] * 1.05), + xytext=(x[i] - width / 2, off_vals[i] * 0.95), + arrowprops=dict(arrowstyle="->", color="red", lw=1.5), + ) + label_y = off_vals[i] * 0.85 + ax.text(x[i], label_y, f"{improvement:.1f}x", ha="center", fontsize=10, color="red", fontweight="bold") + + ax.set_title(cfg["title"].split(" - ")[0], fontsize=12, fontweight="bold") + ax.set_ylabel(cfg["ylabel"], fontsize=11) + ax.set_xlabel("Max concurrency", fontsize=11) + ax.set_xticks(x) + ax.set_xticklabels([str(c) for c in concurrencies]) + if metric != "rtf": + ax.set_yscale("log") + ax.legend(fontsize=9) + ax.grid(axis="y", alpha=0.3, linestyle="--") + ax.set_axisbelow(True) + + plt.tight_layout() + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Plot saved to {output_path}") + plt.close() + + +def print_table(off_results: list[dict], on_results: list[dict]): + off_map = 
{r["concurrency"]: r for r in off_results} + on_map = {r["concurrency"]: r for r in on_results} + concurrencies = sorted(set(off_map.keys()) & set(on_map.keys())) + + print("\n## Benchmark Results: async_chunk off vs on\n") + print("| Metric | Concurrency | async_chunk off | async_chunk on | Improvement |") + print("| --- | --- | --- | --- | --- |") + + for name, key, fmt in [ + ("TTFP (ms)", "mean_ttfp_ms", ".1f"), + ("E2E (ms)", "mean_e2e_ms", ".1f"), + ("RTF", "mean_rtf", ".3f"), + ("Throughput", "audio_throughput", ".2f"), + ]: + for c in concurrencies: + v_off = off_map[c].get(key, 0) + v_on = on_map[c].get(key, 0) + if v_on > 0 and key != "audio_throughput": + ratio = f"{v_off / v_on:.1f}x" + elif v_off > 0 and key == "audio_throughput": + ratio = f"{v_on / v_off:.1f}x" + else: + ratio = "N/A" + print(f"| {name} | {c} | {v_off:{fmt}} | {v_on:{fmt}} | {ratio} |") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Plot async_chunk comparison for Qwen3-TTS") + parser.add_argument("--off", type=str, required=True, help="JSON results for async_chunk off") + parser.add_argument("--on", type=str, required=True, help="JSON results for async_chunk on") + parser.add_argument("--metric", type=str, default="ttfp", choices=["ttfp", "e2e", "rtf", "all"]) + parser.add_argument("--output", type=str, default="results/qwen3_tts_async_chunk.png") + parser.add_argument("--title", type=str, default=None, help="Custom title override") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + off_results = load_results(args.off) + on_results = load_results(args.on) + + print_table(off_results, on_results) + + if args.metric == "all": + plot_all_metrics(off_results, on_results, args.output) + else: + plot_ttfp_comparison(off_results, on_results, args.metric, args.output, args.title) diff --git a/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh b/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh new file mode 100755 
index 00000000000..61cf7757a9b --- /dev/null +++ b/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# Qwen3-TTS async_chunk on vs off Benchmark +# +# Starts two servers (async_chunk on and off), benchmarks both, +# and generates comparison plots. +# +# Usage: +# bash run_async_chunk_benchmark.sh +# +# Environment variables: +# GPU_DEVICE - GPU index (default: 0) +# MODEL - Model path (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice) +# NUM_PROMPTS - Prompts per concurrency level (default: 50) +# CONCURRENCY - Space-separated concurrency levels (default: "1 10") +# PORT_ON - Port for async_chunk on server (default: 8000) +# PORT_OFF - Port for async_chunk off server (default: 8001) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" +cd "$PROJECT_ROOT" + +GPU_DEVICE="${GPU_DEVICE:-0}" +MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice}" +NUM_PROMPTS="${NUM_PROMPTS:-50}" +CONCURRENCY="${CONCURRENCY:-1 10}" +NUM_WARMUPS="${NUM_WARMUPS:-3}" +PORT_ON="${PORT_ON:-8000}" +PORT_OFF="${PORT_OFF:-8001}" +RESULT_DIR="${SCRIPT_DIR}/results" +TIMESTAMP="$(date +%Y%m%d_%H%M%S)" + +STAGE_CONFIG_ON="vllm_omni/model_executor/stage_configs/qwen3_tts.yaml" +STAGE_CONFIG_OFF="vllm_omni/model_executor/stage_configs/qwen3_tts_no_async_chunk.yaml" + +mkdir -p "${RESULT_DIR}" + +echo "============================================================" +echo " Qwen3-TTS async_chunk Benchmark" +echo "============================================================" +echo " GPU: ${GPU_DEVICE}" +echo " Model: ${MODEL}" +echo " Prompts: ${NUM_PROMPTS}" +echo " Concurrency: ${CONCURRENCY}" +echo " Port (on/off): ${PORT_ON} / ${PORT_OFF}" +echo " Results: ${RESULT_DIR}" +echo "============================================================" + +cleanup() { + echo "Cleaning up servers..." 
+ kill "$PID_ON" 2>/dev/null || true + kill "$PID_OFF" 2>/dev/null || true + wait "$PID_ON" 2>/dev/null || true + wait "$PID_OFF" 2>/dev/null || true +} +trap cleanup EXIT + +wait_for_server() { + local port=$1 + local name=$2 + local max_wait=300 + local elapsed=0 + echo "Waiting for ${name} server on port ${port}..." + while ! curl -s "http://localhost:${port}/health" >/dev/null 2>&1; do + sleep 5 + elapsed=$((elapsed + 5)) + if [ $elapsed -ge $max_wait ]; then + echo "ERROR: ${name} server failed to start within ${max_wait}s" + exit 1 + fi + done + echo "${name} server ready (${elapsed}s)" +} + +# ---- Phase 1: Start async_chunk ON server ---- +echo "" +echo "[Phase 1] Starting async_chunk ON server on port ${PORT_ON}..." +CUDA_VISIBLE_DEVICES=${GPU_DEVICE} vllm-omni serve "${MODEL}" \ + --stage-configs-path "${STAGE_CONFIG_ON}" \ + --host 0.0.0.0 --port "${PORT_ON}" \ + --trust-remote-code --enforce-eager --omni \ + > "${RESULT_DIR}/server_on_${TIMESTAMP}.log" 2>&1 & +PID_ON=$! + +wait_for_server "${PORT_ON}" "async_chunk_on" + +echo "[Phase 1] Benchmarking async_chunk ON..." +# shellcheck disable=SC2086 +python "${SCRIPT_DIR}/bench_async_chunk.py" \ + --host 127.0.0.1 --port "${PORT_ON}" \ + --config-name "async_chunk_on" \ + --num-prompts "${NUM_PROMPTS}" \ + --max-concurrency ${CONCURRENCY} \ + --num-warmups "${NUM_WARMUPS}" \ + --result-dir "${RESULT_DIR}" + +echo "[Phase 1] Stopping async_chunk ON server..." +kill "$PID_ON" 2>/dev/null || true +wait "$PID_ON" 2>/dev/null || true +sleep 5 + +# ---- Phase 2: Start async_chunk OFF server ---- +echo "" +echo "[Phase 2] Starting async_chunk OFF server on port ${PORT_OFF}..." +CUDA_VISIBLE_DEVICES=${GPU_DEVICE} vllm-omni serve "${MODEL}" \ + --stage-configs-path "${STAGE_CONFIG_OFF}" \ + --host 0.0.0.0 --port "${PORT_OFF}" \ + --trust-remote-code --enforce-eager --omni \ + > "${RESULT_DIR}/server_off_${TIMESTAMP}.log" 2>&1 & +PID_OFF=$! 
+ +wait_for_server "${PORT_OFF}" "async_chunk_off" + +echo "[Phase 2] Benchmarking async_chunk OFF (non-streaming)..." +# shellcheck disable=SC2086 +python "${SCRIPT_DIR}/bench_async_chunk.py" \ + --host 127.0.0.1 --port "${PORT_OFF}" \ + --config-name "async_chunk_off" \ + --num-prompts "${NUM_PROMPTS}" \ + --max-concurrency ${CONCURRENCY} \ + --num-warmups "${NUM_WARMUPS}" \ + --no-stream \ + --result-dir "${RESULT_DIR}" + +echo "[Phase 2] Stopping async_chunk OFF server..." +kill "$PID_OFF" 2>/dev/null || true +wait "$PID_OFF" 2>/dev/null || true + +# ---- Phase 3: Plot results ---- +echo "" +echo "[Phase 3] Generating plots..." + +# Find the latest result files +RESULT_ON=$(ls -t "${RESULT_DIR}"/bench_async_chunk_on_*.json 2>/dev/null | head -1) +RESULT_OFF=$(ls -t "${RESULT_DIR}"/bench_async_chunk_off_*.json 2>/dev/null | head -1) + +if [ -z "$RESULT_ON" ] || [ -z "$RESULT_OFF" ]; then + echo "ERROR: Could not find result files. Check logs in ${RESULT_DIR}/" + exit 1 +fi + +echo " ON results: ${RESULT_ON}" +echo " OFF results: ${RESULT_OFF}" + +# TTFP comparison (main figure) +python "${SCRIPT_DIR}/plot_async_chunk.py" \ + --off "${RESULT_OFF}" \ + --on "${RESULT_ON}" \ + --metric ttfp \ + --output "${RESULT_DIR}/qwen3_tts_async_chunk_ttfp.png" + +# All metrics comparison +python "${SCRIPT_DIR}/plot_async_chunk.py" \ + --off "${RESULT_OFF}" \ + --on "${RESULT_ON}" \ + --metric all \ + --output "${RESULT_DIR}/qwen3_tts_async_chunk_all.png" + +echo "" +echo "============================================================" +echo " Benchmark complete!" 
+echo " Results: ${RESULT_DIR}/" +echo " Plots:" +echo " - ${RESULT_DIR}/qwen3_tts_async_chunk_ttfp.png" +echo " - ${RESULT_DIR}/qwen3_tts_async_chunk_all.png" +echo "============================================================" diff --git a/benchmarks/tts/README.md b/benchmarks/tts/README.md deleted file mode 100644 index 9e2fd35b1a5..00000000000 --- a/benchmarks/tts/README.md +++ /dev/null @@ -1,227 +0,0 @@ -# TTS Universal Benchmark - -A model-agnostic serving benchmark for TTS models in vllm-omni. One CLI -(`bench_tts.py`) + one YAML registry (`model_configs.yaml`) drive perf and -quality runs for every registered checkpoint: **Qwen3-TTS** (Base / CustomVoice) -and **VoxCPM2** today, more to come. - -The same three task types — `voice_clone`, `default_voice`, `voice_design` — -are wired into both the manual CLI and the DFX nightly CI matrix -(`tests/dfx/perf/tests/test_tts.json`). - -## Quick start - -### 1. Start the server - -```bash -vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base --omni --port 8000 -``` - -The server auto-loads its Deploy YAML from `vllm_omni/deploy/qwen3_tts.yaml` -(Pipeline + Deploy schema introduced in #2383). No `--stage-configs-path` or -`--deploy-config` flag is needed for any registered model. - -### 2. Run the benchmark (`vllm bench serve --omni`) - -The primary, directly-controllable path. 
Copy-paste one of these and tweak -any bench flag (sampling params, endpoint, extra body, warmups, etc.): - -#### voice_clone (Qwen3-TTS-Base, seed-tts dataset) - -```bash -vllm bench serve --omni \ - --host 127.0.0.1 --port 8000 \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --backend openai-audio-speech \ - --endpoint /v1/audio/speech \ - --dataset-name seed-tts \ - --dataset-path /path/to/seed-tts-eval \ - --seed-tts-locale en \ - --num-prompts 20 --num-warmups 2 \ - --extra-body '{"task_type":"Base"}' \ - --max-concurrency 1 --request-rate inf \ - --percentile-metrics ttft,e2el,audio_rtf,audio_ttfp,audio_duration \ - --save-result --result-dir ./results -``` - -#### default_voice (Qwen3-TTS-CustomVoice, bundled seed_tts_smoke) - -```bash -vllm bench serve --omni \ - --host 127.0.0.1 --port 8000 \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --backend openai-audio-speech \ - --endpoint /v1/audio/speech \ - --dataset-name seed-tts-text \ - --dataset-path benchmarks/build_dataset/seed_tts_smoke \ - --seed-tts-locale en \ - --num-prompts 20 --num-warmups 2 \ - --extra-body '{"voice":"Vivian","language":"English","task_type":"CustomVoice"}' \ - --max-concurrency 1 --request-rate inf \ - --percentile-metrics ttft,e2el,audio_rtf,audio_ttfp,audio_duration \ - --save-result --result-dir ./results -``` - -#### voice_design (Qwen3-TTS-CustomVoice, bundled seed_tts_design) - -```bash -vllm bench serve --omni \ - --host 127.0.0.1 --port 8000 \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --backend openai-audio-speech \ - --endpoint /v1/audio/speech \ - --dataset-name seed-tts-design \ - --dataset-path benchmarks/build_dataset/seed_tts_design \ - --seed-tts-locale en \ - --num-prompts 20 --num-warmups 2 \ - --extra-body '{"task_type":"VoiceDesign","language":"English"}' \ - --max-concurrency 1 --request-rate inf \ - --percentile-metrics ttft,e2el,audio_rtf,audio_ttfp,audio_duration \ - --save-result --result-dir ./results -``` - -#### Add WER / SIM / UTMOS to any 
of the above - -Append `--seed-tts-wer-eval` (and optionally `SEED_TTS_EVAL_DEVICE=cuda:0` -in the env, per PR #2558). This triggers the seed-tts-eval protocol: -Whisper-large-v3 ASR → WER, WavLM embeddings → SIM, balacoon/utmos → UTMOS. - -### 3. Convenience wrapper (`bench_tts.py`) - -If you're running the **canonical** configuration for a registered model, -`bench_tts.py` loads the right defaults from `model_configs.yaml` and -emits the exact `vllm bench serve --omni` command above — useful for -concurrency sweeps and multi-task runs: - -```bash -# Smallest smoke — 5 prompts, concurrency=1 -python benchmarks/tts/bench_tts.py \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --task voice_clone \ - --dataset-path /path/to/seed-tts-eval \ - --concurrency 1 --num-prompts 5 \ - --output-dir ./results - -# Full concurrency sweep -python benchmarks/tts/bench_tts.py \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --task voice_clone \ - --dataset-path /path/to/seed-tts-eval \ - --concurrency 1 2 4 8 16 32 \ - --num-prompts 20 \ - --output-dir ./results - -# With WER / SIM / UTMOS quality eval (adds ASR + embedding compute) -python benchmarks/tts/bench_tts.py \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --task voice_clone \ - --dataset-path /path/to/seed-tts-eval \ - --wer-eval \ - --concurrency 4 --num-prompts 200 \ - --output-dir ./results -``` - -### 4. Plot a sweep - -```bash -python benchmarks/tts/plot_results.py \ - --results ./results/*.json \ - --output ./results/curve.png -``` - -Outputs TTFP / RTF / throughput curves (and a markdown table) for every -`(task, concurrency)` combination in the result set. 
- -## Task types - -| Task | Dataset | Request body | Checkpoints that support it | -|-----------------|-------------------|-----------------------------------------------------|------------------------------------------| -| `voice_clone` | `seed-tts` | `ref_audio` + `ref_text` + `task_type=Base` | `Qwen3-TTS-*-Base`, `VoxCPM2` | -| `default_voice` | `seed-tts-text` | `voice=Vivian` + `task_type=CustomVoice` | `Qwen3-TTS-*-CustomVoice` | -| `voice_design` | `seed-tts-design` | `instructions=` + `task_type=VoiceDesign` | `Qwen3-TTS-*-CustomVoice` | - -**`-CustomVoice` checkpoints do NOT ship `speaker_encoder` weights**, so -voice_clone requests raise `ValueError` at model runtime. Use `-Base` for -voice_clone. - -## Adding a new TTS model - -Drop an entry into `model_configs.yaml` — no Python changes required: - -```yaml -models: - /: - supported_tasks: [voice_clone] # or default_voice / voice_design - backend: openai-audio-speech # vllm bench serve backend - endpoint: /v1/audio/speech # OpenAI-compatible endpoint - task_extra_body: # merged into every request's body - voice_clone: - task_type: Base -``` - -Then add the model's Deploy YAML under `vllm_omni/deploy/.yaml` -(Pipeline + Deploy schema) and it's immediately benchable. - -## Datasets - -| Dataset | Bundled? 
| Format | Source | -|--------------------|----------|-------------------|----------------------------------------------------------------| -| `seed-tts-design` | ✅ | 5-field meta.lst | `benchmarks/build_dataset/seed_tts_design/en/meta.lst` (20 prompts) | -| `seed_tts_smoke` | ✅ | 4-field meta.lst | `benchmarks/build_dataset/seed_tts_smoke/en/meta.lst` (20 text-only) | -| `seed-tts` | ❌ | 4-field meta.lst + WAVs | Google-Drive: [BytedanceSpeech/seed-tts-eval][seedtts] (~1.2 GB) | -| `seed-tts-text` | ❌ | 4-field meta.lst | Same archive as `seed-tts` (wav column unused) | - -[seedtts]: https://github.com/BytedanceSpeech/seed-tts-eval - -For manual voice_clone / default_voice runs against the full corpus, follow -`benchmarks/build_dataset/download_process_data_seedtts.md` and point -`--dataset-path` at the extracted `seedtts_testset` directory. - -## DFX nightly CI - -`tests/dfx/perf/tests/test_tts.json` wires three perf regimes plus quality: - -| eval_phase | concurrency | purpose | Baseline metrics | -|---------------|-------------|---------------------------------------------------------|-----------------------------------------| -| `latency` | 1 | Single-request TTFP / RTF SLO | `median_audio_ttfp_ms`, `median_audio_rtf` | -| `throughput` | 8 | Codec-batching cliff sentinel (PDF #272 concurrency≥8) | `median_audio_ttfp_ms`, `median_audio_rtf` | -| `quality` | 4 | WER / SIM / UTMOS regression (disabled in CI by default)| `mean_audio_rtf` | - -Why `median_*` for latency/throughput and `mean_*` for quality: latency -distributions have cold-start tails that drag the mean; quality aggregates -over 200 prompts so single-request outliers don't matter. - -Quality entries are `enabled: false` in CI because seed-tts-eval is not -staged in the Buildkite container (matches the precedent in -PR #2558 — quality runs are manual / release-validation, not nightly). 
- -## Concurrency cliff regression sentinel - -Observed on H20-3e, Qwen3-TTS-1.7B (measured pre-merge on this branch): - -| Task | Model | c=1 | c=4 | **c=8** | c=16 | c=32 | -|---------------|---------------|--------|--------|------------|--------|--------| -| voice_clone | 1.7B-Base | RTF 0.15 / TTFP 165ms | 0.28 / 412ms | **0.49 / 1701ms** | 0.72 / 3355ms | 0.77 / 3772ms | -| voice_design | 1.7B-CustomVoice | RTF 0.08 / TTFP 53ms | 0.11 / 154ms | **0.21 / 872ms** | 0.33 / 1801ms | 0.38 / 1989ms | - -Both models show a **4–6× TTFP jump from c=4 to c=8** while audio throughput -saturates around c=4–8 — the codec-bs=1 bottleneck documented in -vllm-project/vllm-omni#272. The `throughput` CI regime at c=8 is the -sentinel for regressions in this area. - -## File layout - -``` -benchmarks/tts/ -├── README.md (this file) -├── bench_tts.py CLI — serve-mode benchmark driver -├── bench_voxcpm_offline.py CLI — offline VoxCPM benchmark (sync + streaming) -├── plot_results.py Generate per-task / per-concurrency curves -└── model_configs.yaml Model registry (supported tasks + extra body) -``` - -## Related - -- Upstream seed-tts-eval integration: vllm-project/vllm-omni#2558 -- Pipeline + Deploy schema: vllm-project/vllm-omni#2383 -- Concurrency cliff RFC: vllm-project/vllm-omni#272 diff --git a/benchmarks/tts/bench_tts.py b/benchmarks/tts/bench_tts.py deleted file mode 100644 index ba82b1c9b7b..00000000000 --- a/benchmarks/tts/bench_tts.py +++ /dev/null @@ -1,308 +0,0 @@ -#!/usr/bin/env python3 -"""Universal TTS benchmark CLI for vllm-omni. - -Runs ``vllm bench serve --omni`` with model-aware defaults loaded from -``model_configs.yaml``. Supports Qwen3-TTS, VoxCPM2, and any future TTS -model registered in the config file -- no code changes needed to add models. 
- -Usage:: - - python benchmarks/tts/bench_tts.py \\ - --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \\ - --task voice_clone \\ - --locale en \\ - --concurrency 1 4 \\ - --num-prompts 20 \\ - --dataset-path /path/to/seed-tts-eval \\ - --host localhost --port 8000 - -See ``--help`` for full option list. -""" - -from __future__ import annotations - -import argparse -import json -import math -import os -import subprocess -import sys -from datetime import datetime -from pathlib import Path -from typing import Any - -import yaml - - -def _vllm_omni_bin() -> str: - """Return the vllm-omni (or vllm) binary co-located with the current Python.""" - bin_dir = Path(sys.executable).parent - for candidate in ("vllm-omni", "vllm"): - p = bin_dir / candidate - if p.is_file(): - return str(p) - return "vllm-omni" # fall back and let the shell resolve it - - -_REPO_ROOT = Path(__file__).resolve().parent.parent.parent -_SCRIPT_DIR = Path(__file__).resolve().parent -_DEFAULT_MODEL_CONFIGS = _SCRIPT_DIR / "model_configs.yaml" - -# Maps task name to the dataset_name used with vllm bench serve -_TASK_TO_DATASET: dict[str, str] = { - "voice_clone": "seed-tts", - "default_voice": "seed-tts-text", - "voice_design": "seed-tts-design", -} - -# Default design dataset path (bundled with the repo) -_DEFAULT_DESIGN_DATASET_PATH = str(_REPO_ROOT / "benchmarks" / "build_dataset" / "seed_tts_design") - - -def load_model_configs(path: Path) -> dict[str, Any]: - """Load model registry from YAML file.""" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - return data.get("models", {}) - - -def build_bench_args( - *, - host: str, - port: int, - model: str, - task: str, - model_cfg: dict[str, Any], - locale: str, - num_prompts: int, - concurrency: int | None, - dataset_path: str | None, - wer_eval: bool, - output_dir: str | None, - result_filename: str | None, - extra_cli_args: list[str], -) -> list[str]: - """Build the ``vllm bench serve --omni`` command for one (task, concurrency) 
run.""" - dataset_name = _TASK_TO_DATASET[task] - backend: str = model_cfg["backend"] - endpoint: str = model_cfg["endpoint"] - task_extra_body: dict[str, Any] = (model_cfg.get("task_extra_body") or {}).get(task) or {} - - # Resolve dataset path - if dataset_path: - resolved_dataset_path = dataset_path - elif task == "voice_design": - resolved_dataset_path = _DEFAULT_DESIGN_DATASET_PATH - else: - resolved_dataset_path = None - - cmd = [ - _vllm_omni_bin(), - "bench", - "serve", - "--omni", - "--host", - host, - "--port", - str(port), - "--model", - model, - "--backend", - backend, - "--endpoint", - endpoint, - "--dataset-name", - dataset_name, - "--num-prompts", - str(num_prompts), - "--num-warmups", - "2", - "--percentile-metrics", - "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - ] - - if resolved_dataset_path: - cmd += ["--dataset-path", resolved_dataset_path] - - if locale: - cmd += ["--seed-tts-locale", locale] - - if task_extra_body: - cmd += ["--extra-body", json.dumps(task_extra_body, separators=(",", ":"))] - - if concurrency is not None: - cmd += ["--max-concurrency", str(concurrency), "--request-rate", "inf"] - - if wer_eval: - cmd.append("--seed-tts-wer-eval") - - if output_dir or result_filename: - out_dir = output_dir or "." 
- os.makedirs(out_dir, exist_ok=True) - cmd += ["--save-result", "--result-dir", out_dir] - if result_filename: - cmd += ["--result-filename", result_filename] - - cmd += extra_cli_args - return cmd - - -def run_one_benchmark(cmd: list[str]) -> dict[str, Any] | None: - """Run a single benchmark subprocess and return parsed JSON result if available.""" - print(f"\n{'=' * 60}") - print("Running:", " ".join(cmd)) - print("=" * 60) - result = subprocess.run(cmd, check=False) - if result.returncode != 0: - print(f"[bench_tts] WARNING: benchmark exited with code {result.returncode}") - return None - # If --save-result was used, find the result file - try: - result_dir_idx = cmd.index("--result-dir") - result_dir = Path(cmd[result_dir_idx + 1]) - if "--result-filename" in cmd: - fname_idx = cmd.index("--result-filename") - result_file = result_dir / cmd[fname_idx + 1] - else: - # find most recently modified json - jsons = sorted(result_dir.glob("result_*.json"), key=lambda p: p.stat().st_mtime) - result_file = jsons[-1] if jsons else None - if result_file and result_file.is_file(): - return json.loads(result_file.read_text(encoding="utf-8")) - except (ValueError, IndexError, OSError): - pass - return None - - -def print_summary_table(results: list[dict[str, Any]]) -> None: - """Print a unified metrics table across all (task, concurrency) runs.""" - if not results: - return - header = ( - f"{'Task':<16} {'Concurrency':>11} {'RTF mean':>10} " - f"{'TTFP (ms)':>10} {'Throughput':>12} {'WER':>7} {'SIM':>7} {'UTMOS':>7}" - ) - print(f"\n{'=' * len(header)}") - print("BENCHMARK SUMMARY") - print("=" * len(header)) - print(header) - print("-" * len(header)) - for r in results: - task = r.get("_task", "?") - conc = r.get("_concurrency", "?") - rtf = r.get("mean_audio_rtf", float("nan")) - ttfp = r.get("mean_audio_ttfp_ms", float("nan")) - throughput = r.get("audio_throughput", float("nan")) - wer = r.get("seed_tts_mean_wer", float("nan")) - sim = r.get("seed_tts_mean_sim", 
float("nan")) - utmos = r.get("seed_tts_mean_utmos", float("nan")) - - def fmt(v: float, digits: int = 3) -> str: - return f"{v:.{digits}f}" if not math.isnan(v) else " n/a" - - print( - f"{task:<16} {str(conc):>11} {fmt(rtf):>10} {fmt(ttfp, 0):>10} " - f"{fmt(throughput):>12} {fmt(wer):>7} {fmt(sim):>7} {fmt(utmos):>7}" - ) - print("=" * len(header)) - - -def main() -> None: - """Entry point for the universal TTS benchmark CLI.""" - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--model", required=True, help="HuggingFace model ID (e.g. Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)" - ) - parser.add_argument("--task", default="all", help="Task type: voice_clone | default_voice | voice_design | all") - parser.add_argument("--locale", default="en", choices=["en", "zh"]) - parser.add_argument("--concurrency", type=int, nargs="+", default=[1, 4], metavar="N") - parser.add_argument( - "--num-prompts", - type=int, - nargs="+", - default=[20], - metavar="N", - help="Number of prompts per run. 
If one value, applied to all concurrency levels.", - ) - parser.add_argument( - "--dataset-path", default=None, help="Root of seed-tts-eval dataset (required for voice_clone/default_voice)" - ) - parser.add_argument("--wer-eval", action="store_true", help="Enable WER/SIM/UTMOS quality eval") - parser.add_argument("--output-dir", default=None, help="Directory to save result JSON files") - parser.add_argument("--host", default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--model-configs", default=str(_DEFAULT_MODEL_CONFIGS), help="Path to model_configs.yaml") - parser.add_argument("extra", nargs=argparse.REMAINDER, help="Extra args passed directly to vllm bench serve") - args = parser.parse_args() - - model_configs = load_model_configs(Path(args.model_configs)) - if args.model not in model_configs: - known = "\n ".join(model_configs.keys()) - print(f"[bench_tts] ERROR: model '{args.model}' not in model_configs.yaml.\nKnown models:\n {known}") - sys.exit(1) - - model_cfg = model_configs[args.model] - supported_tasks: list[str] = model_cfg.get("supported_tasks", []) - - tasks_to_run: list[str] - if args.task == "all": - tasks_to_run = supported_tasks - elif args.task in supported_tasks: - tasks_to_run = [args.task] - else: - print( - f"[bench_tts] ERROR: task '{args.task}' not supported by {args.model}.\nSupported tasks: {supported_tasks}" - ) - sys.exit(1) - - # Align num_prompts list with concurrency list - num_prompts_list: list[int] = args.num_prompts - if len(num_prompts_list) == 1: - num_prompts_list = num_prompts_list * len(args.concurrency) - elif len(num_prompts_list) != len(args.concurrency): - print( - f"[bench_tts] ERROR: --num-prompts ({len(num_prompts_list)} values) must be " - f"length 1 or match --concurrency ({len(args.concurrency)} values)." 
- ) - sys.exit(1) - - all_results: list[dict[str, Any]] = [] - - for task in tasks_to_run: - for concurrency, num_prompts in zip(args.concurrency, num_prompts_list): - ts = datetime.now().strftime("%Y%m%d-%H%M%S") - result_filename = f"bench_tts_{args.model.replace('/', '_')}_{task}_c{concurrency}_{ts}.json" - cmd = build_bench_args( - host=args.host, - port=args.port, - model=args.model, - task=task, - model_cfg=model_cfg, - locale=args.locale, - num_prompts=num_prompts, - concurrency=concurrency, - dataset_path=args.dataset_path, - wer_eval=args.wer_eval, - output_dir=args.output_dir, - result_filename=result_filename, - extra_cli_args=args.extra or [], - ) - result = run_one_benchmark(cmd) - if result is not None: - result["_task"] = task - result["_concurrency"] = concurrency - all_results.append(result) - # Persist the metadata so plot_results.py can pick it up. - if args.output_dir and result_filename: - result_path = Path(args.output_dir) / result_filename - if result_path.is_file(): - result_path.write_text(json.dumps(result, indent=2), encoding="utf-8") - - print_summary_table(all_results) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/tts/bench_voxcpm_offline.py b/benchmarks/tts/bench_voxcpm_offline.py deleted file mode 100644 index 672b77f1495..00000000000 --- a/benchmarks/tts/bench_voxcpm_offline.py +++ /dev/null @@ -1,922 +0,0 @@ -"""Offline VoxCPM benchmark for vLLM Omni. 
- -Supports both: -- sync one-shot (Omni.generate) -- streaming (AsyncOmni.generate with async_chunk config) -- text-only synthesis -- voice cloning -- text/clone batch inputs from txt or jsonl - -Usage:: - - # Sync (default voice) - python benchmarks/tts/bench_voxcpm_offline.py \\ - --model /path/to/VoxCPM \\ - --text "Hello world" \\ - --output-dir results/audio/ - - # Streaming (async_chunk) - python benchmarks/tts/bench_voxcpm_offline.py \\ - --model /path/to/VoxCPM \\ - --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \\ - --txt-prompts prompts.txt \\ - --output-dir results/audio/ - - # Voice cloning batch via JSONL - python benchmarks/tts/bench_voxcpm_offline.py \\ - --model /path/to/VoxCPM \\ - --jsonl-prompts prompts.jsonl \\ - --output-dir results/audio/ -""" - -from __future__ import annotations - -import asyncio -import json -import logging -import os -import tempfile -import time -import uuid -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import torch -from vllm.utils.argparse_utils import FlexibleArgumentParser - -from vllm_omni import AsyncOmni, Omni - - -def _find_repo_root(start: Path) -> Path: - """Walk up from ``start`` until a repo marker is found. - - Falls back to ``parents[2]`` for backwards compatibility if no marker hits - (which can only happen in unusual checkouts — the tree should always have - pyproject.toml + vllm_omni/ at the top level). 
- """ - for candidate in [start, *start.parents]: - if (candidate / "pyproject.toml").is_file() and (candidate / "vllm_omni").is_dir(): - return candidate - return start.parents[2] - - -REPO_ROOT = _find_repo_root(Path(__file__).resolve()) -DEFAULT_STAGE_ASYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm_async_chunk.yaml" -DEFAULT_STAGE_SYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" - -logger = logging.getLogger(__name__) - - -@dataclass(frozen=True, slots=True) -class PromptSpec: - text: str - label: str - ref_audio: str | None = None - ref_text: str | None = None - - -def _require_soundfile(): - try: - import soundfile as sf # type: ignore - except ModuleNotFoundError as exc: - raise RuntimeError( - "soundfile is required to write VoxCPM benchmark WAV outputs. Install it with: pip install soundfile" - ) from exc - return sf - - -def _build_prompt( - args, - *, - text: str, - ref_audio: str | None = None, - ref_text: str | None = None, - global_request_id: str | None = None, -) -> dict[str, Any]: - additional_information: dict[str, list[Any]] = { - "text": [text], - "cfg_value": [args.cfg_value], - "inference_timesteps": [args.inference_timesteps], - "min_len": [args.min_len], - "max_new_tokens": [args.max_new_tokens], - } - if args.streaming_prefix_len is not None: - additional_information["streaming_prefix_len"] = [args.streaming_prefix_len] - - if ref_audio: - additional_information["ref_audio"] = [ref_audio] - if ref_text: - additional_information["ref_text"] = [ref_text] - if global_request_id is not None: - additional_information["global_request_id"] = [global_request_id] - - return { - "prompt_token_ids": [1], - "additional_information": additional_information, - } - - -def _extract_audio_tensor(mm: dict[str, Any]) -> torch.Tensor: - audio = mm.get("audio", mm.get("model_outputs")) - if audio is None: - raise ValueError("No audio output found in multimodal output.") - if isinstance(audio, 
list): - parts = [torch.as_tensor(a).float().cpu().reshape(-1) for a in audio] - audio = torch.cat(parts, dim=-1) if parts else torch.zeros(0) - if not isinstance(audio, torch.Tensor): - audio = torch.as_tensor(audio) - return audio.float().cpu().reshape(-1) - - -def _extract_sample_rate(mm: dict[str, Any]) -> int: - sr_raw = mm.get("sr", 24000) - if isinstance(sr_raw, list) and sr_raw: - sr_raw = sr_raw[-1] - if hasattr(sr_raw, "item"): - return int(sr_raw.item()) - return int(sr_raw) - - -def _emit_offline_metrics( - *, - request_id: str, - elapsed_s: float, - first_audio_elapsed: float | None, - audio_duration_s: float, -) -> None: - metrics = { - "request_id": request_id, - "ttfp_ms": round(first_audio_elapsed * 1000.0, 3) if first_audio_elapsed is not None else None, - "audio_duration_s": round(audio_duration_s, 6), - "rtf": round(elapsed_s / audio_duration_s, 6) if audio_duration_s > 0 else None, - } - print(f"[OfflineMetrics] {metrics}") - - -def _write_audio_tensor(output_path: Path, audio_tensor: Any, sample_rate: int) -> None: - sf = _require_soundfile() - if isinstance(audio_tensor, torch.Tensor): - audio_np = audio_tensor.float().cpu().clamp(-1.0, 1.0).numpy() - else: - audio_np = torch.as_tensor(audio_tensor).float().cpu().clamp(-1.0, 1.0).numpy() - sf.write( - output_path, - audio_np, - sample_rate, - format="WAV", - subtype="PCM_16", - ) - - -def _save_wav(mm: dict[str, Any], output_dir: Path, request_id: str) -> Path: - output_dir.mkdir(parents=True, exist_ok=True) - output_path = output_dir / f"output_{request_id}.wav" - _write_audio_tensor(output_path, _extract_audio_tensor(mm), _extract_sample_rate(mm)) - return output_path - - -def _iter_request_multimodal_outputs(request_output: Any): - outputs = getattr(request_output, "outputs", None) - if outputs: - for output in outputs: - mm = getattr(output, "multimodal_output", None) - if isinstance(mm, dict): - yield mm - - mm = getattr(request_output, "multimodal_output", None) - if isinstance(mm, 
dict): - yield mm - - -def _read_non_empty_lines(path: str) -> list[str]: - with open(path, encoding="utf-8") as f: - return [line.strip() for line in f if line.strip()] - - -def _load_prompt_specs(args) -> list[PromptSpec]: - specs: list[PromptSpec] = [] - - if args.txt_prompts is not None: - texts = _read_non_empty_lines(args.txt_prompts) - if not texts: - raise ValueError(f"No prompts found in {args.txt_prompts}") - for idx, text in enumerate(texts, start=1): - specs.append( - PromptSpec( - text=text, - label=f"item{idx:03d}", - ref_audio=args.ref_audio, - ref_text=args.ref_text, - ) - ) - return specs - - if args.jsonl_prompts is not None: - with open(args.jsonl_prompts, encoding="utf-8") as f: - for line_no, raw_line in enumerate(f, start=1): - line = raw_line.strip() - if not line: - continue - try: - item = json.loads(line) - except json.JSONDecodeError as exc: - raise ValueError(f"{args.jsonl_prompts}:{line_no} is not valid JSON: {exc}") from exc - if not isinstance(item, dict): - raise ValueError(f"{args.jsonl_prompts}:{line_no} must be a JSON object") - - text = item.get("text") - if not isinstance(text, str) or not text.strip(): - raise ValueError(f"{args.jsonl_prompts}:{line_no} requires non-empty string field 'text'") - - ref_audio = item.get("ref_audio", args.ref_audio) - ref_text = item.get("ref_text", args.ref_text) - if (ref_audio is None) != (ref_text is None): - raise ValueError( - f"{args.jsonl_prompts}:{line_no} must provide both 'ref_audio' and 'ref_text' together" - ) - - specs.append( - PromptSpec( - text=text.strip(), - label=f"item{len(specs) + 1:03d}", - ref_audio=ref_audio, - ref_text=ref_text, - ) - ) - - if not specs: - raise ValueError(f"No prompts found in {args.jsonl_prompts}") - return specs - - specs.append( - PromptSpec( - text=args.text, - label="item001", - ref_audio=args.ref_audio, - ref_text=args.ref_text, - ) - ) - return specs - - -def _build_prompt_for_spec(args, spec: PromptSpec, *, global_request_id: str | None = None) 
-> dict[str, Any]: - return _build_prompt( - args, - text=spec.text, - ref_audio=spec.ref_audio, - ref_text=spec.ref_text, - global_request_id=global_request_id, - ) - - -def _count_voice_clone_prompts(prompt_specs: list[PromptSpec]) -> int: - return sum(1 for spec in prompt_specs if spec.ref_audio is not None) - - -def _get_warmup_specs(prompt_specs: list[PromptSpec]) -> list[PromptSpec]: - return prompt_specs[:1] - - -def _extract_stream_finished(stage_output: Any) -> bool: - request_output = getattr(stage_output, "request_output", None) - request_finished = getattr(request_output, "finished", None) - if request_finished is not None: - return bool(request_finished) - return bool(getattr(stage_output, "finished", False)) - - -def _build_profiled_stage_config( - stage_configs_path: str, - profiler_dir: str, -) -> str: - stage_config_path = Path(stage_configs_path) - yaml_text = stage_config_path.read_text(encoding="utf-8") - injected_lines: list[str] = [] - injected_count = 0 - - for line in yaml_text.splitlines(): - injected_lines.append(line) - if line.strip() != "engine_args:": - continue - indent = line[: len(line) - len(line.lstrip())] - child_indent = indent + " " - grandchild_indent = child_indent + " " - injected_lines.extend( - [ - f"{child_indent}profiler_config:", - f'{grandchild_indent}profiler: "torch"', - f'{grandchild_indent}torch_profiler_dir: "{profiler_dir}"', - f"{grandchild_indent}torch_profiler_with_stack: true", - ] - ) - injected_count += 1 - - if injected_count == 0: - raise ValueError(f"No engine_args block found in stage config: {stage_configs_path}") - - tmp = tempfile.NamedTemporaryFile( - mode="w", - encoding="utf-8", - delete=False, - suffix=".yaml", - prefix=f"{stage_config_path.stem}_profile_", - ) - tmp.write("\n".join(injected_lines) + "\n") - tmp.close() - return tmp.name - - -def parse_args(): - parser = FlexibleArgumentParser( - description="Offline split-stage VoxCPM inference with vLLM Omni (auto sync/streaming by stage 
config)" - ) - parser.add_argument( - "--model", - type=str, - default=os.environ.get("VOXCPM_MODEL"), - help="Local VoxCPM model directory. Defaults to $VOXCPM_MODEL.", - ) - parser.add_argument( - "--text", - type=str, - default="This is a split-stage VoxCPM synthesis example running on vLLM Omni.", - help="Text to synthesize. Ignored when --txt-prompts or --jsonl-prompts is used.", - ) - parser.add_argument( - "--txt-prompts", - type=str, - default=None, - help="Path to a .txt file with one synthesis text per line.", - ) - parser.add_argument( - "--jsonl-prompts", - type=str, - default=None, - help=( - "Path to a .jsonl file. Each line must contain at least {'text': ...}; " - "clone rows can also set ref_audio/ref_text, and ref_text must be the " - "real transcript of ref_audio." - ), - ) - parser.add_argument( - "--ref-audio", - type=str, - default=None, - help=( - "Optional reference audio path for voice cloning. With --txt-prompts, " - "the same reference is applied to every line." - ), - ) - parser.add_argument( - "--ref-text", - type=str, - default=None, - help=( - "Real transcript of the reference audio. Placeholder text or mismatched " - "text will usually produce noisy/electronic clone audio." - ), - ) - parser.add_argument( - "--stage-configs-path", - type=str, - default=str(DEFAULT_STAGE_SYNC), - help="Stage config YAML path. 
Routing is selected only from this path.", - ) - parser.add_argument( - "--cfg-value", - type=float, - default=2.0, - help="Classifier-free guidance value for VoxCPM.", - ) - parser.add_argument( - "--inference-timesteps", - type=int, - default=10, - help="Number of inference timesteps.", - ) - parser.add_argument( - "--min-len", - type=int, - default=2, - help="Minimum generated token length.", - ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=4096, - help="Maximum generated token length.", - ) - parser.add_argument( - "--streaming-prefix-len", - type=int, - default=None, - help="VoxCPM streaming window (optional, streaming mode only).", - ) - parser.add_argument( - "--output-dir", - type=str, - default=None, - help="Directory for output WAV files.", - ) - parser.add_argument( - "--stage-init-timeout", - type=int, - default=600, - help="Stage initialization timeout in seconds.", - ) - parser.add_argument( - "--log-stats", - dest="log_stats", - action="store_true", - help="Enable vLLM Omni stats logging.", - ) - parser.add_argument( - "--no-log-stats", - dest="log_stats", - action="store_false", - help="Disable vLLM Omni stats logging.", - ) - parser.set_defaults(log_stats=True) - parser.add_argument( - "--num-runs", - type=int, - default=1, - help="Number of full inference runs (same prompt each time). Default 1.", - ) - parser.add_argument( - "--warmup-runs", - type=int, - default=0, - help=( - "Optional number of warmup passes before measured runs. Warmup uses only " - "the first prompt and does not save outputs." - ), - ) - parser.add_argument( - "--enable-profiler", - action="store_true", - help=( - "Enable torch profiler for the configured stages. A temporary profiled " - "stage config is generated automatically." - ), - ) - parser.add_argument( - "--profiler-dir", - type=str, - default=None, - help="Directory for profiler traces. 
Defaults to /profiler when profiling is enabled.", - ) - parser.add_argument( - "--profiler-stages", - type=int, - nargs="*", - default=None, - help="Optional stage ids to profile. Defaults to all stages that have profiler_config.", - ) - parser.add_argument( - "--profiler-wait-seconds", - type=float, - default=30.0, - help="Seconds to wait after stop_profile for trace files to flush.", - ) - args = parser.parse_args() - - if not args.model: - parser.error("--model is required unless $VOXCPM_MODEL is set") - if args.txt_prompts is not None and args.jsonl_prompts is not None: - parser.error("--txt-prompts and --jsonl-prompts are mutually exclusive") - if (args.ref_audio is None) != (args.ref_text is None): - parser.error("--ref-audio and --ref-text must be provided together") - if args.num_runs < 1: - parser.error("--num-runs must be >= 1") - if args.warmup_runs < 0: - parser.error("--warmup-runs must be >= 0") - if args.output_dir is None: - args.output_dir = ( - "output_audio_streaming" if _is_streaming_stage_config(args.stage_configs_path) else "output_audio" - ) - if args.enable_profiler and args.profiler_dir is None: - args.profiler_dir = str(Path(args.output_dir) / "profiler") - try: - args.prompt_specs = _load_prompt_specs(args) - except ValueError as exc: - parser.error(str(exc)) - - return args - - -def _is_streaming_stage_config(stage_configs_path: str) -> bool: - cfg_name = Path(stage_configs_path).name.lower() - return "async_chunk" in cfg_name - - -async def _collect_streaming_audio( - omni: AsyncOmni, - args: Any, - spec: PromptSpec, - request_id: str, - *, - phase_label: str, - prompt_index: int, - prompt_count: int, - print_prompt: bool = False, -) -> tuple[torch.Tensor, int, float, float | None]: - prompt = _build_prompt_for_spec(args, spec, global_request_id=request_id) - delta_chunks: list[torch.Tensor] = [] - sample_rate = 24000 - chunk_i = 0 - prev_total_samples = 0 - t_start = time.perf_counter() - first_audio_elapsed: float | None = None - - 
if print_prompt: - print(f"---prompt---:{prompt}") - - async for stage_output in omni.generate(prompt, request_id=request_id): - mm = getattr(stage_output, "multimodal_output", None) - if not isinstance(mm, dict): - ro = getattr(stage_output, "request_output", None) - if ro is None: - continue - mm = getattr(ro, "multimodal_output", None) - if not isinstance(mm, dict) and getattr(ro, "outputs", None): - seq = ro.outputs[0] - mm = getattr(seq, "multimodal_output", None) - if not isinstance(mm, dict): - continue - sample_rate = _extract_sample_rate(mm) - try: - w = _extract_audio_tensor(mm) - n = int(w.numel()) - if n == 0: - continue - finished = _extract_stream_finished(stage_output) - if n > prev_total_samples: - delta = w.reshape(-1)[prev_total_samples:] - prev_total_samples = n - elif finished and n == prev_total_samples: - delta = w.reshape(-1)[:0] - else: - delta = w.reshape(-1) - prev_total_samples += int(delta.numel()) - if int(delta.numel()) > 0: - delta_chunks.append(delta) - if first_audio_elapsed is None and int(delta.numel()) > 0: - first_audio_elapsed = time.perf_counter() - t_start - logger.info( - "%s prompt=%d/%d chunk=%d delta_samples=%d buf_len=%d finished=%s", - phase_label, - prompt_index + 1, - prompt_count, - chunk_i, - int(delta.numel()), - n, - finished, - ) - chunk_i += 1 - except ValueError: - if not _extract_stream_finished(stage_output): - logger.debug("skip non-audio partial output chunk=%d", chunk_i) - - if not delta_chunks: - raise RuntimeError("No audio chunks received; check stage config and logs.") - - audio_cat = torch.cat([c.reshape(-1) for c in delta_chunks], dim=0) - elapsed = time.perf_counter() - t_start - return audio_cat, sample_rate, elapsed, first_audio_elapsed - - -async def _abort_streaming_residual_work( - omni: AsyncOmni, - request_id: str, - *, - settle_seconds: float = 0.1, -) -> None: - """Stop any late stage-0 work once the final audio has been collected.""" - await omni.engine.abort_async([request_id]) - if 
settle_seconds > 0: - await asyncio.sleep(settle_seconds) - - -async def _run_streaming_single( - omni: AsyncOmni, - args: Any, - spec: PromptSpec, - output_dir: Path, - request_id: str, - *, - run_index: int, - num_runs: int, - prompt_index: int, - prompt_count: int, -) -> Path: - audio_cat, sample_rate, elapsed, first_audio_elapsed = await _collect_streaming_audio( - omni, - args, - spec, - request_id, - phase_label=f"run={run_index + 1}/{num_runs}", - prompt_index=prompt_index, - prompt_count=prompt_count, - print_prompt=(run_index == 0 and prompt_index == 0), - ) - await _abort_streaming_residual_work(omni, request_id) - output_path = output_dir / f"output_run{run_index + 1}_{spec.label}.wav" - _write_audio_tensor(output_path, audio_cat, sample_rate) - audio_duration_s = float(audio_cat.numel()) / float(sample_rate) if sample_rate > 0 else 0.0 - ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" - rtf_text = f", rtf={elapsed / audio_duration_s:.3f}" if audio_duration_s > 0 else "" - print( - f"Saved (streaming) run {run_index + 1}/{num_runs}, " - f"prompt {prompt_index + 1}/{prompt_count}: {output_path} ({elapsed:.2f}s{ttfp_text}{rtf_text})" - ) - _emit_offline_metrics( - request_id=request_id, - elapsed_s=elapsed, - first_audio_elapsed=first_audio_elapsed, - audio_duration_s=audio_duration_s, - ) - return output_path - - -async def _run_streaming_warmup(args, omni: AsyncOmni) -> None: - if args.warmup_runs == 0: - return - - warmup_specs = _get_warmup_specs(args.prompt_specs) - print( - f"Warmup: {args.warmup_runs} run(s) using the first prompt " - f"({len(warmup_specs)} prompt(s)); outputs will be discarded." 
- ) - for warmup_index in range(args.warmup_runs): - t_warmup = time.perf_counter() - tasks = [] - request_ids: list[str] = [] - for prompt_index, spec in enumerate(warmup_specs): - request_id = f"warmup_stream_{warmup_index + 1}_{spec.label}_{uuid.uuid4().hex[:8]}" - request_ids.append(request_id) - tasks.append( - _collect_streaming_audio( - omni, - args, - spec, - request_id, - phase_label=f"warmup={warmup_index + 1}/{args.warmup_runs}", - prompt_index=prompt_index, - prompt_count=len(warmup_specs), - ) - ) - results = await asyncio.gather(*tasks) - for request_id in request_ids: - await _abort_streaming_residual_work(omni, request_id) - total_samples = sum(int(audio.numel()) for audio, _, _, _ in results) - warmup_ttfps = [ttfp for _, _, _, ttfp in results if ttfp is not None] - ttfp_text = f", ttfp={min(warmup_ttfps):.2f}s" if warmup_ttfps else "" - print( - f"Warmup (streaming) {warmup_index + 1}/{args.warmup_runs} finished: " - f"{len(results)} prompt(s), {total_samples} sample(s) " - f"({time.perf_counter() - t_warmup:.2f}s{ttfp_text})" - ) - - -async def _run_streaming(args) -> list[Path]: - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - omni = AsyncOmni( - model=args.model, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - ) - - await _run_streaming_warmup(args, omni) - profiler_started = False - if args.enable_profiler: - profile_prefix = f"voxcpm_streaming_{int(time.time())}" - stages_text = args.profiler_stages if args.profiler_stages is not None else "all-configured" - print(f"Starting profiler (streaming): stages={stages_text}, dir={args.profiler_dir}") - await omni.start_profile(profile_prefix=profile_prefix, stages=args.profiler_stages) - profiler_started = True - t_total = time.perf_counter() - total_elapsed = 0.0 - paths: list[Path] = [] - prompt_specs: list[PromptSpec] = args.prompt_specs - try: - for run in range(args.num_runs): - for 
prompt_index, spec in enumerate(prompt_specs): - request_id = f"stream_{run + 1}_{spec.label}_{uuid.uuid4().hex[:8]}" - paths.append( - await _run_streaming_single( - omni, - args, - spec, - output_dir, - request_id, - run_index=run, - num_runs=args.num_runs, - prompt_index=prompt_index, - prompt_count=len(prompt_specs), - ) - ) - total_elapsed = time.perf_counter() - t_total - finally: - if profiler_started: - print("Stopping profiler (streaming)...") - await omni.stop_profile(stages=args.profiler_stages) - if args.profiler_wait_seconds > 0: - print(f"Waiting {args.profiler_wait_seconds:.1f}s for profiler traces to flush...") - await asyncio.sleep(args.profiler_wait_seconds) - - print( - f"All streaming runs finished: {args.num_runs} run(s), " - f"{len(prompt_specs)} prompt(s), {len(paths)} file(s) in {total_elapsed:.2f}s total" - ) - return paths - - -def _run_sync(args) -> list[Path]: - output_dir = Path(args.output_dir) - - omni = Omni( - model=args.model, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - ) - - def _run_sync_single( - spec: PromptSpec, - *, - request_prefix: str, - save_outputs: bool, - run_index: int | None = None, - ) -> tuple[list[Path], int, float | None, float, float, str]: - global_request_id = f"{request_prefix}_{spec.label}" - prompt = _build_prompt_for_spec(args, spec, global_request_id=global_request_id) - if save_outputs and run_index == 0 and spec.label == "item001": - print(f"---prompt---:{prompt}") - - saved_paths: list[Path] = [] - output_count = 0 - first_audio_elapsed: float | None = None - total_audio_duration_s = 0.0 - metrics_request_id = global_request_id - t_start = time.perf_counter() - for stage_outputs in omni.generate(prompt): - request_output = stage_outputs.request_output - if request_output is None: - continue - request_output_id = getattr(request_output, "request_id", None) - if isinstance(request_output_id, str) and request_output_id: - 
metrics_request_id = request_output_id - for j, mm in enumerate(_iter_request_multimodal_outputs(request_output)): - output_count += 1 - if first_audio_elapsed is None: - try: - audio_tensor = _extract_audio_tensor(mm) - if int(audio_tensor.numel()) > 0: - first_audio_elapsed = time.perf_counter() - t_start - total_audio_duration_s += float(audio_tensor.numel()) / float(_extract_sample_rate(mm)) - except ValueError: - pass - else: - try: - audio_tensor = _extract_audio_tensor(mm) - total_audio_duration_s += float(audio_tensor.numel()) / float(_extract_sample_rate(mm)) - except ValueError: - pass - if not save_outputs: - continue - save_stem = f"run{run_index + 1}_{spec.label}" if j == 0 else f"run{run_index + 1}_{spec.label}_{j}" - saved_paths.append(_save_wav(mm, output_dir, save_stem)) - - if output_count == 0: - raise RuntimeError("No output from Omni.generate") - elapsed_s = time.perf_counter() - t_start - return saved_paths, output_count, first_audio_elapsed, elapsed_s, total_audio_duration_s, metrics_request_id - - if args.warmup_runs: - warmup_specs = _get_warmup_specs(args.prompt_specs) - print( - f"Warmup: {args.warmup_runs} run(s) using the first prompt " - f"({len(warmup_specs)} prompt(s)); outputs will be discarded." 
- ) - for warmup_index in range(args.warmup_runs): - t_warmup = time.perf_counter() - _, output_count, first_audio_elapsed, elapsed_s, audio_duration_s, _ = _run_sync_single( - warmup_specs[0], - request_prefix=f"warmup_sync{warmup_index + 1}", - save_outputs=False, - ) - ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" - rtf_text = f", rtf={elapsed_s / audio_duration_s:.3f}" if audio_duration_s > 0 else "" - print( - f"Warmup (sync) {warmup_index + 1}/{args.warmup_runs} finished: " - f"{output_count} output(s) ({time.perf_counter() - t_warmup:.2f}s{ttfp_text}{rtf_text})" - ) - - profiler_started = False - if args.enable_profiler: - profile_prefix = f"voxcpm_sync_{int(time.time())}" - stages_text = args.profiler_stages if args.profiler_stages is not None else "all-configured" - print(f"Starting profiler (sync): stages={stages_text}, dir={args.profiler_dir}") - omni.start_profile(profile_prefix=profile_prefix, stages=args.profiler_stages) - profiler_started = True - - t_total = time.perf_counter() - total_elapsed = 0.0 - saved_paths: list[Path] = [] - prompt_specs: list[PromptSpec] = args.prompt_specs - try: - for run in range(args.num_runs): - t_run = time.perf_counter() - run_paths: list[Path] = [] - for prompt_index, spec in enumerate(prompt_specs): - prompt_paths, _, first_audio_elapsed, elapsed_s, audio_duration_s, metrics_request_id = ( - _run_sync_single( - spec, - request_prefix=f"sync_run{run + 1}_{prompt_index + 1:03d}", - save_outputs=True, - run_index=run, - ) - ) - run_paths.extend(prompt_paths) - ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" - rtf_text = f", rtf={elapsed_s / audio_duration_s:.3f}" if audio_duration_s > 0 else "" - print( - f"Saved (sync) run {run + 1}/{args.num_runs}, " - f"prompt {prompt_index + 1}/{len(prompt_specs)}: {len(prompt_paths)} file(s){ttfp_text}{rtf_text}" - ) - _emit_offline_metrics( - request_id=metrics_request_id, - 
elapsed_s=elapsed_s, - first_audio_elapsed=first_audio_elapsed, - audio_duration_s=audio_duration_s, - ) - - saved_paths.extend(run_paths) - print( - f"Run {run + 1}/{args.num_runs} finished: {len(run_paths)} file(s) ({time.perf_counter() - t_run:.2f}s)" - ) - for path in run_paths: - print(f" {path}") - - total_elapsed = time.perf_counter() - t_total - finally: - if profiler_started: - print("Stopping profiler (sync)...") - omni.stop_profile(stages=args.profiler_stages) - if args.profiler_wait_seconds > 0: - print(f"Waiting {args.profiler_wait_seconds:.1f}s for profiler traces to flush...") - time.sleep(args.profiler_wait_seconds) - - print( - f"All sync runs finished: {args.num_runs} run(s), " - f"{len(prompt_specs)} prompt(s), {len(saved_paths)} file(s) in {total_elapsed:.2f}s total" - ) - return saved_paths - - -def main(args) -> int: - logging.basicConfig(level=logging.INFO) - profiled_stage_config_path: str | None = None - original_stage_config_path = args.stage_configs_path - if args.enable_profiler: - Path(args.profiler_dir).mkdir(parents=True, exist_ok=True) - profiled_stage_config_path = _build_profiled_stage_config( - args.stage_configs_path, - str(Path(args.profiler_dir).resolve()), - ) - args.stage_configs_path = profiled_stage_config_path - - is_streaming = _is_streaming_stage_config(args.stage_configs_path) - voice_clone_count = _count_voice_clone_prompts(args.prompt_specs) - print(f"Model: {args.model}") - print(f"Stage config: {original_stage_config_path}") - print(f"Route: {'streaming' if is_streaming else 'sync'} (from stage-configs-path)") - print(f"Prompt count: {len(args.prompt_specs)}") - print("Batch mode: sequential (aligned with native VoxCPM)") - print(f"Warmup runs: {args.warmup_runs}") - print(f"Voice cloning prompts: {voice_clone_count}/{len(args.prompt_specs)}") - if args.enable_profiler: - print(f"Profiler: enabled (dir={args.profiler_dir}, stages={args.profiler_stages or 'all-configured'})") - print(f"Profiled stage config: 
{args.stage_configs_path}") - if voice_clone_count: - print("Voice cloning note: --ref-text/ref_text must match the spoken content of the reference audio.") - print(f"Num runs: {args.num_runs}") - try: - if is_streaming: - asyncio.run(_run_streaming(args)) - else: - _run_sync(args) - finally: - if profiled_stage_config_path is not None and os.path.exists(profiled_stage_config_path): - os.unlink(profiled_stage_config_path) - return 0 - - -if __name__ == "__main__": - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - raise SystemExit(main(parse_args())) diff --git a/benchmarks/tts/model_configs.yaml b/benchmarks/tts/model_configs.yaml deleted file mode 100644 index 83b25370538..00000000000 --- a/benchmarks/tts/model_configs.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Universal TTS benchmark model registry. -# Maps HuggingFace model ID → supported tasks + per-task extra body fields. -# To add a new TTS model: add an entry here. No code changes required. -# -# The server auto-loads its Deploy YAML from vllm_omni/deploy/.yaml via -# the Pipeline + Deploy schema introduced in #2383, so no stage_config path -# is tracked here. - -models: - # -CustomVoice checkpoints lack speaker_encoder weights, so voice_clone is - # NOT supported (an attempt raises ValueError from _extract_speaker_embedding - # at model runtime). Use -Base for voice_clone. 
- Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice: - supported_tasks: [default_voice, voice_design] - backend: openai-audio-speech - endpoint: /v1/audio/speech - task_extra_body: - default_voice: - voice: Vivian - language: English - task_type: CustomVoice - voice_design: - task_type: VoiceDesign - language: English - - Qwen/Qwen3-TTS-12Hz-1.7B-Base: - supported_tasks: [voice_clone] - backend: openai-audio-speech - endpoint: /v1/audio/speech - task_extra_body: - voice_clone: - task_type: Base - - openbmb/VoxCPM2: - supported_tasks: [voice_clone] - backend: openai-audio-speech - endpoint: /v1/audio/speech - task_extra_body: - voice_clone: {} diff --git a/benchmarks/tts/plot_results.py b/benchmarks/tts/plot_results.py deleted file mode 100644 index f19c613209a..00000000000 --- a/benchmarks/tts/plot_results.py +++ /dev/null @@ -1,324 +0,0 @@ -"""Plot universal TTS benchmark results. - -Reads JSON files saved by ``bench_tts.py`` (via ``vllm bench serve --omni``) -and generates comparison bar charts grouped by task type. - -Metrics plotted: -- AUDIO_TTFP (mean audio time-to-first-packet, ms) -- E2EL (mean end-to-end latency, ms) -- Audio RTF (mean real-time factor) -- Audio throughput (audio-seconds / wall-second) - -Quality metrics (WER / SIM / UTMOS) are printed in a table when present. - -Usage:: - - # Single run — one JSON per task, all in results/ - python benchmarks/tts/plot_results.py \\ - --results results/bench_tts_*.json \\ - --output results/tts_benchmark.png - - # Compare two runs (e.g. 
async_chunk on vs off) - python benchmarks/tts/plot_results.py \\ - --results run_a/bench_tts_*.json \\ - --results run_b/bench_tts_*.json \\ - --labels "async_chunk_on" "async_chunk_off" \\ - --output results/comparison.png -""" - -from __future__ import annotations - -import argparse -import json -import math -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np - -# --------------------------------------------------------------------------- -# JSON loading -# --------------------------------------------------------------------------- - - -def load_run(paths: list[str]) -> list[dict]: - """Load and merge all JSON files for one run into a flat list of records. - - Each record is expected to have at least ``_concurrency`` (int) and - ``_task`` (str) keys injected by ``bench_tts.py``. Records that come - from a file that contains a list are flattened. - """ - records: list[dict] = [] - for p in paths: - raw = json.loads(Path(p).read_text(encoding="utf-8")) - if isinstance(raw, list): - records.extend(raw) - elif isinstance(raw, dict): - records.append(raw) - return records - - -def _get(record: dict, key: str) -> float: - v = record.get(key, float("nan")) - if v is None or (isinstance(v, float) and math.isnan(v)): - return float("nan") - try: - return float(v) - except (TypeError, ValueError): - return float("nan") - - -# --------------------------------------------------------------------------- -# Plotting helpers -# --------------------------------------------------------------------------- - - -def _bar_group( - ax: plt.Axes, - x: np.ndarray, - data_per_label: dict[str, list[float]], - width: float, - colors: list[str], - ylabel: str, - title: str, - concurrency_labels: list[str], - fmt: str = ".1f", -) -> None: - n = len(data_per_label) - offsets = np.linspace(-(n - 1) * width / 2, (n - 1) * width / 2, n) if n > 1 else [0.0] - - for i, (label, values) in enumerate(data_per_label.items()): - plot_vals = [0.0 if math.isnan(v) else v for 
v in values] - bar = ax.bar(x + offsets[i], plot_vals, width, label=label, color=colors[i % len(colors)], alpha=0.85) - max_val = max((v for v in values if not math.isnan(v)), default=1.0) - for rect, val in zip(bar, values): - if not math.isnan(val) and val > 0: - ax.text( - rect.get_x() + rect.get_width() / 2, - rect.get_height() + max_val * 0.02, - f"{val:{fmt}}", - ha="center", - va="bottom", - fontsize=8, - fontweight="bold", - ) - - ax.set_xlabel("Concurrency", fontsize=11) - ax.set_ylabel(ylabel, fontsize=11) - ax.set_title(title, fontsize=12, fontweight="bold") - ax.set_xticks(x) - ax.set_xticklabels(concurrency_labels) - ax.legend(fontsize=9) - ax.grid(axis="y", alpha=0.3) - ax.set_axisbelow(True) - - -COLORS = ["#2196F3", "#FF5722", "#4CAF50", "#FFC107", "#9C27B0"] - - -# --------------------------------------------------------------------------- -# Comparison plot (multiple labels / runs) -# --------------------------------------------------------------------------- - - -def plot_comparison( - all_runs: list[list[dict]], - labels: list[str], - output_path: str, - task_filter: str | None = None, - title_prefix: str = "TTS", -) -> None: - """One 2×2 subplot per task found in the data.""" - # Determine tasks to plot - tasks: list[str] = [] - for run in all_runs: - for r in run: - t = r.get("_task", "unknown") - if t not in tasks: - tasks.append(t) - if task_filter: - tasks = [t for t in tasks if t == task_filter] - - n_tasks = len(tasks) - if n_tasks == 0: - print("[plot_results] No tasks found in data.") - return - - fig, axes_grid = plt.subplots(n_tasks, 4, figsize=(18, 4.5 * n_tasks)) - fig.suptitle(f"{title_prefix} Benchmark", fontsize=15, fontweight="bold") - - # Ensure axes_grid is always 2D - if n_tasks == 1: - axes_grid = [axes_grid] - - for row_idx, task in enumerate(tasks): - # Collect concurrencies across all runs for this task - all_concs: set[int] = set() - for run in all_runs: - for r in run: - if r.get("_task") == task: - c = 
r.get("_concurrency") - if c is not None: - all_concs.add(int(c)) - concurrencies = sorted(all_concs) - x = np.arange(len(concurrencies)) - conc_labels = [str(c) for c in concurrencies] - - def _series(run: list[dict], metric_key: str) -> list[float]: - conc_map = {int(r["_concurrency"]): r for r in run if r.get("_task") == task and "_concurrency" in r} - return [_get(conc_map.get(c, {}), metric_key) for c in concurrencies] - - metrics = [ - ("mean_audio_ttfp_ms", "TTFP (ms)", "Time-to-First-Packet", ".0f"), - ("mean_e2el_ms", "E2E Latency (ms)", "End-to-End Latency", ".0f"), - ("mean_audio_rtf", "RTF", "Real-Time Factor (RTF)", ".3f"), - ("audio_throughput", "audio-s / wall-s", "Audio Throughput", ".2f"), - ] - - axes_row = axes_grid[row_idx] - for col_idx, (key, ylabel, subtitle, fmt) in enumerate(metrics): - data_per_label = {lbl: _series(run, key) for lbl, run in zip(labels, all_runs)} - _bar_group( - axes_row[col_idx], - x, - data_per_label, - width=0.3 if len(labels) > 1 else 0.5, - colors=COLORS, - ylabel=ylabel, - title=f"{task} — {subtitle}", - concurrency_labels=conc_labels, - fmt=fmt, - ) - - plt.tight_layout() - Path(output_path).parent.mkdir(parents=True, exist_ok=True) - plt.savefig(output_path, dpi=150, bbox_inches="tight") - print(f"Plot saved to {output_path}") - plt.close() - - -# --------------------------------------------------------------------------- -# Markdown comparison table -# --------------------------------------------------------------------------- - - -def print_comparison_table(all_runs: list[list[dict]], labels: list[str]) -> None: - tasks: list[str] = [] - for run in all_runs: - for r in run: - t = r.get("_task", "unknown") - if t not in tasks: - tasks.append(t) - - perf_metrics = [ - ("TTFP (ms)", "mean_audio_ttfp_ms", ".1f"), - ("E2E (ms)", "mean_e2el_ms", ".1f"), - ("RTF", "mean_audio_rtf", ".3f"), - ("Throughput (a-s/s)", "audio_throughput", ".2f"), - ] - quality_metrics = [ - ("WER (%)", "seed_tts_mean_wer", ".1f"), - ("SIM", 
"seed_tts_mean_sim", ".3f"), - ("UTMOS", "seed_tts_mean_utmos", ".2f"), - ] - - for task in tasks: - all_concs: set[int] = set() - for run in all_runs: - for r in run: - if r.get("_task") == task: - c = r.get("_concurrency") - if c is not None: - all_concs.add(int(c)) - concurrencies = sorted(all_concs) - - print(f"\n## {task}\n") - col_header = "| Metric | Concurrency |" + "".join(f" {lbl} |" for lbl in labels) - sep = "| --- | --- |" + " --- |" * len(labels) - print(col_header) - print(sep) - - for metric, key, fmt in perf_metrics + quality_metrics: - for c in concurrencies: - row = f"| {metric} | {c} |" - for run in all_runs: - conc_map = { - int(r["_concurrency"]): r for r in run if r.get("_task") == task and "_concurrency" in r - } - val = _get(conc_map.get(c, {}), key) - row += f" {val:{fmt}} |" if not math.isnan(val) else " n/a |" - print(row) - - # Improvement column (2-run comparison only) - if len(all_runs) == 2: - print(f"\n### Improvement ({labels[0]} vs {labels[1]})\n") - print("| Metric | Concurrency | Change |") - print("| --- | --- | --- |") - for metric, key, _ in perf_metrics: - for c in concurrencies: - conc_map0 = { - int(r["_concurrency"]): r for r in all_runs[0] if r.get("_task") == task and "_concurrency" in r - } - conc_map1 = { - int(r["_concurrency"]): r for r in all_runs[1] if r.get("_task") == task and "_concurrency" in r - } - v0 = _get(conc_map0.get(c, {}), key) - v1 = _get(conc_map1.get(c, {}), key) - if not math.isnan(v0) and not math.isnan(v1) and v1 > 0: - pct = (v1 - v0) / v1 * 100 - print(f"| {metric} | {c} | {pct:+.1f}% |") - - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--results", - type=str, - nargs="+", - 
action="append", - required=True, - metavar="FILE", - help="JSON result file(s) for one run. Repeat --results for multiple runs to compare.", - ) - parser.add_argument( - "--labels", - type=str, - nargs="+", - default=None, - help="Label for each --results group (must match the number of --results groups).", - ) - parser.add_argument("--output", type=str, default="results/tts_benchmark.png", help="Output image path.") - parser.add_argument("--title", type=str, default="TTS", help="Title prefix for the plot.") - parser.add_argument("--task", type=str, default=None, help="Filter to a single task (e.g. voice_clone).") - return parser.parse_args() - - -def main() -> None: - args = parse_args() - - # args.results is a list-of-lists due to action="append" - all_runs: list[list[dict]] = [load_run(group) for group in args.results] - n_runs = len(all_runs) - - labels: list[str] - if args.labels: - if len(args.labels) != n_runs: - raise SystemExit(f"--labels count ({len(args.labels)}) must match --results groups ({n_runs})") - labels = args.labels - else: - labels = [f"run{i + 1}" for i in range(n_runs)] - - print_comparison_table(all_runs, labels) - plot_comparison(all_runs, labels, args.output, task_filter=args.task, title_prefix=args.title) - - -if __name__ == "__main__": - main() diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 9cbf89d0b79..24ce39bafd7 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -7,7 +7,7 @@ COPY . . 
# Install system dependencies RUN apt-get update && \ - apt-get install -y espeak-ng git jq && \ + apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda deleted file mode 100644 index 28e10f4fb85..00000000000 --- a/docker/Dockerfile.cuda +++ /dev/null @@ -1,22 +0,0 @@ -ARG BASE_IMAGE=vllm/vllm-openai:v0.19.0 -FROM ${BASE_IMAGE} - -ARG COMMON_WORKDIR=/app - -WORKDIR ${COMMON_WORKDIR} - -# Step 1: Setup - Install system dependencies -RUN apt-get update && \ - apt-get install -y git jq && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni - -# Step 2: Copy vllm-omni code and install -COPY . ${COMMON_WORKDIR}/vllm-omni -RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir "." - -RUN ln -sf /usr/bin/python3 /usr/bin/python - -ENTRYPOINT [] diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index a54aa3b7933..bfbb060bcb5 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -18,10 +18,8 @@ ARG COMMON_WORKDIR=/app WORKDIR ${COMMON_WORKDIR} # Step 1: Setup - Install system dependencies -# Need to include ffmpeg because vllm rocm upstream docker image -# does not include it. RUN apt-get update && \ - apt-get install -y espeak-ng ffmpeg git jq && \ + apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -41,24 +39,6 @@ RUN if [ "${USE_NIGHTLY_BUILD}" = "1" ]; then \ # Step 3: Copy vllm-omni code and install without uv RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni COPY . ${COMMON_WORKDIR}/vllm-omni - -# This is a workaround to ensure pytest exits with the correct status code in CI tests. 
-RUN printf '%s\n' \ - 'import os' \ - '' \ - '_exit_code = 1' \ - '' \ - 'def pytest_sessionfinish(session, exitstatus):' \ - ' global _exit_code' \ - ' _exit_code = int(exitstatus)' \ - '' \ - 'def pytest_unconfigure(config):' \ - ' import sys' \ - ' sys.stdout.flush()' \ - ' sys.stderr.flush()' \ - ' os._exit(_exit_code)' \ - > ${COMMON_WORKDIR}/vllm-omni/conftest.py - RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]" --no-build-isolation RUN ln -sf /usr/bin/python3 /usr/bin/python diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 25d5d0c800e..17f1aebf0d0 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -15,7 +15,9 @@ RUN apt clean && apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ curl \ espeak-ng \ + ffmpeg \ git \ + libsndfile1 \ libsm6 \ libxext6 \ libgl1 \ diff --git a/docs/.nav.yml b/docs/.nav.yml index b44e8e6b5a8..a4939961e89 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -10,7 +10,6 @@ nav: - Image Generation: serving/image_generation_api.md - Image Edit: serving/image_edit_api.md - Text to Speech: serving/speech_api.md - - Streaming Video Input: serving/video_stream_api.md - Examples: - examples/README.md - Offline Inference: @@ -35,7 +34,6 @@ nav: - Online Serving: - BAGEL-7B-MoT: user_guide/examples/online_serving/bagel.md - vLLM-Omni Helm Chart: user_guide/examples/online_serving/chart-helm.md - - Diffusers Backend Adapter Example: user_guide/examples/online_serving/diffusers_pipeline_adapter.md - Fish Speech S2 Pro: user_guide/examples/online_serving/fish_speech.md - GLM-Image Online Serving: user_guide/examples/online_serving/glm_image.md - Image-To-Image: user_guide/examples/online_serving/image_to_image.md @@ -66,8 +64,6 @@ nav: - FP8: user_guide/diffusion/quantization/fp8.md - Int8: user_guide/diffusion/quantization/int8.md - GGUF: user_guide/diffusion/quantization/gguf.md - - Attention 
Backends: user_guide/diffusion/attention_backends.md - - Frame Interpolation: user_guide/diffusion/frame_interpolation.md - Parallelism: - Overview: user_guide/diffusion/parallelism/overview.md - CFG Parallel: user_guide/diffusion/parallelism/cfg_parallel.md @@ -84,6 +80,7 @@ nav: - Developer Guide: - General: - contributing/README.md + - pr_reviewer.md - glob: contributing/* flatten_single_child_sections: true - Model Implementation: @@ -100,16 +97,14 @@ nav: - design/feature/disaggregated_inference.md - design/feature/ray_based_execution.md - design/feature/omni_connectors/ - - design/feature/prefix_caching.md - design/feature/cfg_parallel.md - - design/feature/expert_parallel.md - design/feature/sequence_parallel.md - design/feature/tensor_parallel.md - design/feature/vae_parallel.md - design/feature/hsdp.md - design/feature/cache_dit.md - design/feature/teacache.md - - design/feature/async_chunk.md + - design/feature/async_chunk_design.md - design/feature/vae_parallel.md - design/feature/diffusion_step_execution.md - Module Design: diff --git a/docs/api/README.md b/docs/api/README.md index 0147f19e126..f65cbb525d9 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -5,7 +5,7 @@ Main entry points for vLLM-Omni inference and serving. 
- [vllm_omni.entrypoints.async_omni.AsyncOmni][] -- [vllm_omni.engine.cfg_companion_tracker.CfgCompanionTracker][] +- [vllm_omni.entrypoints.cfg_companion_tracker.CfgCompanionTracker][] - [vllm_omni.entrypoints.cli.benchmark.base.OmniBenchmarkSubcommandBase][] - [vllm_omni.entrypoints.cli.benchmark.main.OmniBenchmarkSubcommand][] - [vllm_omni.entrypoints.cli.benchmark.serve.OmniBenchmarkServingSubcommand][] diff --git a/docs/assets/WeChat.jpg b/docs/assets/WeChat.jpg index 83252b7569d..28956a12099 100644 Binary files a/docs/assets/WeChat.jpg and b/docs/assets/WeChat.jpg differ diff --git a/docs/cli/serve.md b/docs/cli/serve.md index 035fa056731..47a873b7211 100644 --- a/docs/cli/serve.md +++ b/docs/cli/serve.md @@ -1,59 +1,5 @@ # vllm-omni serve -## Stage-based CLI quickstart - -The stage-based CLI is designed for deployments that require launching each pipeline stage in an isolated process -(e.g., across separate operating system processes, distinct GPUs, or distributed hosts). - -- For **migrated models** that utilize the bundled deployment YAML configurations located in - `vllm_omni/deploy/`, the `--deploy-config` flag is only required to override the default configuration. By default, executing `vllm serve MODEL --omni ...` - automatically loads the bundled deployment configuration. -- For **legacy models** utilizing configuration files located in - `vllm_omni/model_executor/stage_configs/`, the `--stage-configs-path` parameter remains mandatory. 
- -Example: Initializing Stage 0 (Orchestrator and API Server): - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --port 8091 \ - --stage-id 0 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -Example: Initializing a Headless Worker Stage (Stage 1): - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 1 \ - --headless \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -When utilizing a custom deployment YAML based on the new schema, append `--deploy-config /path/to/override.yaml` to each command execution. Conversely, for legacy models, substitute this parameter with `--stage-configs-path /path/to/stage_configs.yaml`. - -In the standard execution paradigm, the `--stage-overrides` argument is utilized to apply stage-specific configurations from a single CLI command. -However, under the **stage-based CLI** paradigm, where each process strictly encapsulates a single stage, it is recommended to specify tuning parameters directly via discrete command-line flags for the respective stage, rather than constructing a composite `--stage-overrides` JSON string. - -For example, as an alternative to the following composite configuration: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --stage-overrides '{"1": {"gpu_memory_utilization": 0.5}}' -``` - -the stage-based CLI permits the direct initialization of Stage 1 with explicit parameters: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 1 \ - --headless \ - --gpu-memory-utilization 0.5 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - ## JSON CLI Arguments --8<-- "docs/cli/json_tip.inc.md" diff --git a/docs/configuration/README.md b/docs/configuration/README.md index 390176e9cea..b5761a7f1bc 100644 --- a/docs/configuration/README.md +++ b/docs/configuration/README.md @@ -6,7 +6,7 @@ For options within a vLLM Engine. 
Please refer to [vLLM Configuration](https://d Currently, the main options are maintained by stage configs for each model. -For a specific example, see the [Qwen2.5-Omni deploy config](gh-file:vllm_omni/deploy/qwen2_5_omni.yaml). The matching frozen pipeline topology lives at [vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py](gh-file:vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py). +For specific example, please refer to [Qwen2.5-omni stage config](stage_configs/qwen2_5_omni.yaml) For introduction, please check [Introduction for stage config](./stage_configs.md) diff --git a/docs/configuration/pd_disaggregation.md b/docs/configuration/pd_disaggregation.md index 9196bdb0240..1cf6189e603 100644 --- a/docs/configuration/pd_disaggregation.md +++ b/docs/configuration/pd_disaggregation.md @@ -11,7 +11,7 @@ deployment-specific values usually change per environment: - connector backend and connector ports - connector IPs or bootstrap addresses -Start from the [default Qwen3-Omni stage config](gh-file:vllm_omni/deploy/qwen3_omni_moe.yaml) +Start from the [default Qwen3-Omni stage config](gh-file:vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml) and copy it to your own file, for example `qwen3_omni_pd.yaml`. Then apply the changes below. @@ -145,13 +145,19 @@ Compared with the default Qwen3-Omni config: ```yaml runtime: enabled: true + defaults: + window_size: -1 + max_inflight: 1 edges: - from: 0 to: 1 + window_size: -1 - from: 1 to: 2 + window_size: -1 - from: 2 to: 3 + window_size: -1 ``` ## 4. Launch with your custom config diff --git a/docs/configuration/stage_configs.md b/docs/configuration/stage_configs.md index 45bacfb7893..95c42afcc70 100644 --- a/docs/configuration/stage_configs.md +++ b/docs/configuration/stage_configs.md @@ -3,210 +3,7 @@ In vLLM-Omni, the target model is separated into multiple stages, which are processed by different LLMEngines, DiffusionEngines or other types of engines. 
Depending on different types of stages, such as Autoregressive (AR) stage or Diffusion transformer (DiT) stage, each can choose corresponding schedulers, model workers to load with the Engines in a plug-in fashion. !!! note - Default deploy config YAMLs (for example, `vllm_omni/deploy/qwen2_5_omni.yaml`, `vllm_omni/deploy/qwen3_omni_moe.yaml`, and `vllm_omni/deploy/qwen3_tts.yaml`) are bundled and loaded automatically when neither `--stage-configs-path` nor `--deploy-config` is provided — the model registry resolves the right pipeline + deploy YAML by `model_type`. The bundled defaults have been verified on 1xH100 for Qwen2.5-Omni and 2xH100 for Qwen3-Omni. Models that have not yet migrated to the new schema continue to use the legacy `vllm_omni/model_executor/stage_configs/.yaml` files via `--stage-configs-path`. - -## New deploy schema reference - -The new deploy schema lives under `vllm_omni/deploy/` and is paired with a frozen `PipelineConfig` registered by the model's `pipeline.py`. Each deploy YAML has these top-level fields: - -| Field | Type | Required | Default | Description | -|-------|------|----------|---------|-------------| -| `base_config` | str (path) | optional | — | Overlay parent (relative or absolute). `stages:` / `platforms:` deep-merged by stage_id; other scalars overlay-wins. Intended for user-authored overlays; prod yamls stay flat. | -| `async_chunk` | bool | optional | `true` | Enable chunked streaming between stages. Pin to `false` if the pipeline runs end-to-end. | -| `connectors` | dict | optional | `null` | Named connector specs (`{name, extra}`). Referenced by each stage's `input_connectors` / `output_connectors`. See [Connector schema](#connector-schema). | -| `edges` | list | optional | `null` | Explicit edge list for the KV transfer graph. Auto-derived from stage inputs if omitted. | -| `stages` | list | required | — | Per-stage engine args + wiring (see [Stage fields](#stage-fields)). 
| -| `platforms` | dict | optional | `null` | Keyed by `npu` / `rocm` / `xpu`, each contains a `stages:` list with per-platform overrides applied on top of the CUDA defaults. | -| `pipeline` | str | optional | `null` | Override the auto-detected pipeline registry key (used for structural variants like `qwen2_5_omni_thinker_only`). | -| `trust_remote_code` | bool | optional | `true` | **Pipeline-wide.** Trust HF remote code on model load; applies to every stage. | -| `distributed_executor_backend` | str \| null | optional | `null` | **Pipeline-wide.** Distributed executor backend forwarded to vLLM (`"mp"`, `"ray"`, `"external_launcher"`). If omitted, vLLM auto-selects backend from runtime topology. | -| `dtype` | str \| null | optional | `null` | **Pipeline-wide.** Model dtype for every stage. | -| `quantization` | str \| null | optional | `null` | **Pipeline-wide.** Quantization method for every stage. | -| `enable_prefix_caching` | bool | optional | `false` | **Pipeline-wide.** Prefix cache toggle applied to every stage. | -| `enable_chunked_prefill` | bool \| null | optional | `null` | **Pipeline-wide.** Chunked prefill toggle applied to every stage. | -| `data_parallel_size` | int | optional | `1` | **Pipeline-wide.** DP degree for every stage. | -| `pipeline_parallel_size` | int | optional | `1` | **Pipeline-wide.** PP degree for every stage. | - -Note: for diffusion path, `distributed_executor_backend` currently defaults to -`mp`, and `ray` / `external_launcher` are not fully supported yet. - -### Stage fields - -Each entry under `stages:` accepts any `StageDeployConfig` field directly (no nested `engine_args:`). Only fields whose value legitimately varies across stages live here; pipeline-wide settings (trust_remote_code, distributed_executor_backend, dtype, quantization, prefix/chunked prefill, DP/PP sizes) are declared at the top level and applied to every stage. Unknown keys fall through to `engine_extras:` and are forwarded to the engine. 
- -| Field | Type | Required | Default | Description | -|-------|------|----------|---------|-------------| -| `stage_id` | int | required | — | Stage identity; matched against `PipelineConfig.stages[*].stage_id`. | -| `max_num_seqs` | int | optional | `64` | Max concurrent sequences per stage. | -| `gpu_memory_utilization` | float | optional | `0.9` | Per-stage memory budget. | -| `tensor_parallel_size` | int | optional | `1` | TP degree for this stage. | -| `enforce_eager` | bool | optional | `false` | Disable CUDA graphs. | -| `max_num_batched_tokens` | int | optional | `32768` | Prefill budget. | -| `max_model_len` | int \| null | optional | `null` | Per-stage context length (auto-sets `VLLM_ALLOW_LONG_MAX_MODEL_LEN=1` when larger than HF default). | -| `async_scheduling` | bool \| null | optional | `null` | Per-stage async scheduling toggle. | -| `devices` | str | optional | `"0"` | `CUDA_VISIBLE_DEVICES`-style device list. | -| `output_connectors` | dict \| null | optional | `null` | Keyed by `to_stage_`; values are names registered under top-level `connectors:`. | -| `input_connectors` | dict \| null | optional | `null` | Keyed by `from_stage_`; values are names registered under top-level `connectors:`. | -| `default_sampling_params` | dict \| null | optional | `null` | Baseline sampling params. Deep-merged with pipeline `sampling_constraints` (pipeline wins). | -| `engine_extras` | dict | optional | `{}` | Catch-all for keys not listed above; deep-merged across overlays. Also carries per-stage overrides of pipeline-wide settings (e.g. stage-specific `dtype`). | - -### Connector schema - -Each entry under top-level `connectors:` follows this shape: - -```yaml -connectors: - : - name: # required — class registered in vllm_omni.distributed - extra: # optional — forwarded to the connector's __init__ - : - ... 
-``` - -| Connector class | Use case | `extra` keys | -|-----------------|----------|--------------| -| `SharedMemoryConnector` | Same-host KV transfer between stages (default for bundled YAMLs). | `shm_threshold_bytes` (int, default `65536`). | -| `MooncakeStoreConnector` | Cross-host KV transfer over TCP. Required for multi-node deployments. | `host`, `metadata_server`, `master`, `segment` (int bytes), `localbuf` (int bytes), `proto` (`"tcp"` / `"rdma"`). | - -A stage references a connector by name in its `input_connectors` / `output_connectors`: - -```yaml -connectors: - shm: - name: SharedMemoryConnector - -stages: - - stage_id: 0 - output_connectors: {to_stage_1: shm} - - stage_id: 1 - input_connectors: {from_stage_0: shm} -``` - -### CLI flags introduced in this refactor - -| Flag | Description | -|------|-------------| -| `--deploy-config PATH` | Load a new-schema deploy YAML. Takes precedence over `--stage-configs-path`. **Optional** — when omitted, the bundled `vllm_omni/deploy/.yaml` is auto-loaded by the model registry. | -| `--stage-overrides JSON` | Per-stage JSON overrides, e.g. `'{"0":{"gpu_memory_utilization":0.5}}'`. Per-stage values always win over global flags. | -| `--async-chunk` / `--no-async-chunk` | Flip the deploy YAML's `async_chunk:` bool. Unset (default) leaves the YAML value in force. | -| `--stage-configs-path` | **Deprecated.** Accepts legacy `stage_args` yamls and (auto-detected) new deploy yamls; emits a deprecation warning. Migrate to `--deploy-config`. To be removed in a follow-up PR. | - -### Stage-Based CLI Paradigm - -The stage-based CLI paradigm facilitates the execution of discrete pipeline stages within isolated processes: - -- **Stage 0** typically encapsulates the orchestrator and the primary API server. Invocation requires `--stage-id 0`, - `--omni-master-address`, `--omni-master-port`, and standard port declarations (e.g., `--port`). 
-- **Worker Stages** operate without a distinct API server (i.e., using `--headless`), are assigned sequential `--stage-id` identifiers, and must reference the corresponding - `--omni-master-address` and `--omni-master-port` parameters to successfully register with Stage 0. - -For migrated architectures, the system automatically resolves and loads the bundled deployment YAML. Consequently, the primary execution path -does **not** necessitate the explicit definition of `--deploy-config`: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --port 8091 \ - --stage-id 0 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 - -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 1 \ - --headless \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -When instantiating a custom deployment YAML conforming to the updated schema, append the `--deploy-config /path/to/override.yaml` directive -to all node invocations. For legacy architectures (e.g., BAGEL) configured via deprecated `stage_args:` schemas, continue to specify the relevant configuration via `--stage-configs-path /path/to/config.yaml`. 
- -In the context of standard initialization architectures, utilizing the `--stage-overrides` parameter operates as the optimal methodology -for delineating stage-specific tuning from the CLI interface: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --stage-overrides '{"1": {"gpu_memory_utilization": 0.5}}' -``` - -Conversely, in the context of the **stage-based CLI** paradigm, given that each execution process exclusively instantiates a single pipeline stage, configuration override attributes -can be defined uniformly via explicit CLI flags on the corresponding instantiation command, rendering composite `--stage-overrides` JSON strings unnecessary: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 1 \ - --headless \ - --gpu-memory-utilization 0.5 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -### Precedence - -From highest to lowest: - -1. Per-stage flags (`--stage-overrides` JSON, `--stage--` if registered) -2. Explicit global CLI flags (`--gpu-memory-utilization 0.85`, etc.) -3. Platform section (`platforms.npu.stages`, etc.) on top of the base `stages:` -4. Overlay YAML (via `base_config:`) on top of the base YAML -5. 
Parser defaults - -### Worked override example - -Starting from the bundled `vllm_omni/deploy/qwen3_omni_moe.yaml`: - -```yaml -# vllm_omni/deploy/qwen3_omni_moe.yaml (excerpt) -async_chunk: true -stages: - - stage_id: 0 - gpu_memory_utilization: 0.9 - max_num_seqs: 32 - - stage_id: 1 - gpu_memory_utilization: 0.7 - max_num_seqs: 16 -``` - -A user-authored overlay that inherits the base and overrides only stage 1: - -```yaml -# my_overrides.yaml -base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml -stages: - - stage_id: 1 - gpu_memory_utilization: 0.5 # smaller GPU -``` - -Launched with both an explicit global flag and a per-stage override: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --deploy-config my_overrides.yaml \ - --max-model-len 16384 \ - --stage-overrides '{"0": {"max_num_seqs": 8}}' -``` - -Within the stage-based CLI paradigm, equivalent configuration parameters can inherently be passed directly -as command-line arguments to the designated single-stage process instantiation: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 0 \ - --max-num-seqs 8 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -Effective config per stage after the merge: - -| Stage | Field | Final value | Source | -|-------|-------|-------------|--------| -| 0 | `gpu_memory_utilization` | `0.9` | base YAML (overlay didn't touch stage 0) | -| 0 | `max_num_seqs` | `8` | per-stage CLI (`--stage-overrides`) — wins over base `32` | -| 0 | `max_model_len` | `16384` | global CLI | -| 1 | `gpu_memory_utilization` | `0.5` | overlay YAML — wins over base `0.7` | -| 1 | `max_num_seqs` | `16` | base YAML (overlay didn't touch this field) | -| 1 | `max_model_len` | `16384` | global CLI | -| 2 | (all defaults) | — | base YAML (no overrides apply) | + Default stage config YAMLs (for example, `vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml` and 
`vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml`) are bundled and loaded automatically when `stage_configs_path` is not provided. They have been verified to work on 1xH100 for Qwen2.5-Omni and 2xH100 for Qwen3-Omni. Therefore, as a core part of vLLM-Omni, the stage configs for a model have several main functions: @@ -216,14 +13,9 @@ Therefore, as a core part of vLLM-Omni, the stage configs for a model have sever - Input and output dependencies for each stage. - Default input parameters. -To override specific parameters, explicitly inject the customized configuration schema -in both online and offline instantiation flows. Prioritize the `--deploy-config` flag -when loading the new-schema deploy YAML schemas, reserving the `--stage-configs-path` parameter -exclusively to maintain compatibility with legacy `stage_args` YAML constructs. - -Examples: +If users want to modify part of it, a custom stage_configs file can be passed as an input argument in both online and offline modes, as in the examples below: -For offline (Assume necessary dependencies have been imported): +For offline (Assume necessary dependencies have been imported): ```python model_name = "Qwen/Qwen2.5-Omni-7B" omni = Omni(model=model_name, stage_configs_path="/path/to/custom_stage_configs.yaml") @@ -231,13 +23,7 @@ omni = Omni(model=model_name, stage_configs_path="/path/to/custom_stage_configs. For online serving: ```bash -vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091 --deploy-config /path/to/deploy_config.yaml -``` - -Legacy online serving: - -```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091 --stage-configs-path /path/to/stage_configs_file ``` !!! important We are actively iterating on the definition of stage configs, and we welcome all feedbacks from both community users and developers to help us shape the development! 
@@ -249,7 +35,7 @@ stage_args: - stage_id: 0 # mark the unique id for each stage runtime: # The disaggregated configuration process: true # Run this stage in a separate process - devices: "0" # Logical device index for this stage (mapped through CUDA_VISIBLE_DEVICES / ASCEND_RT_VISIBLE_DEVICES if set) + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) engine_args: # Engine arguments for a certain engine model_stage: thinker max_num_seqs: 1 @@ -328,12 +114,16 @@ stage_args: # Top-level runtime config (concise): default windows and stage edges runtime: enabled: true - + defaults: + window_size: -1 # Simplified: trigger downstream only after full upstream completion + max_inflight: 1 # Simplified: process serially within each stage edges: - from: 0 # thinker → talker: trigger only after receiving full input (-1) to: 1 + window_size: -1 - from: 1 # talker → code2wav: trigger only after receiving full input (-1) to: 2 + window_size: -1 ``` @@ -365,9 +155,7 @@ Default: `true` #### `runtime.devices` -Logical device indices for this stage, specified as a string. Values are **logical indices** (`0`, `1`, `2`, ...) — not physical GPU IDs — and are mapped through the platform's visibility env var (`CUDA_VISIBLE_DEVICES` on CUDA, `ASCEND_RT_VISIBLE_DEVICES` on NPU) before being applied via `torch.cuda.set_device()` (or the equivalent). - -Example: if `CUDA_VISIBLE_DEVICES=0,2,4` is set in the environment, then `devices: "0"` selects physical GPU 0 (the first visible), `devices: "1"` selects physical GPU 2, and `devices: "0,1"` makes physical GPUs 0 and 2 available to the stage. If no visibility env var is set, logical and physical IDs coincide. +Visible devices for this stage, specified as a string. This controls which GPU devices are available to the stage process, similar to setting `CUDA_VISIBLE_DEVICES` or using `torch.cuda.set_device()`. For example, `"0"` uses GPU 0, `"1"` uses GPU 1, and `"0,1"` makes both GPUs 0 and 1 visible. 
Default: `"0"` diff --git a/docs/configuration/stage_configs/qwen2_5_omni.yaml b/docs/configuration/stage_configs/qwen2_5_omni.yaml new file mode 100644 index 00000000000..690577b84a8 --- /dev/null +++ b/docs/configuration/stage_configs/qwen2_5_omni.yaml @@ -0,0 +1,94 @@ +# stage config for running qwen2.5-omni with AsyncOmniEngine + Orchestrator runtime. +stage_args: + - stage_id: 0 + runtime: + process: true # Run this stage in a separate process + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + engine_args: + model_stage: thinker + max_num_seqs: 1 + model_arch: Qwen2_5OmniForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.8 + enforce_eager: true # Now we only support eager mode + trust_remote_code: true + engine_output_type: latent + enable_prefix_caching: false + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + - stage_id: 1 + runtime: + process: true + devices: "1" + engine_args: + model_stage: talker + max_num_seqs: 1 + model_arch: Qwen2_5OmniForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.8 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: latent + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker + default_sampling_params: + temperature: 0.9 + top_p: 0.8 + top_k: 40 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + stop_token_ids: [8294] + - stage_id: 2 + runtime: + process: true + devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU + engine_args: + model_stage: 
code2wav + max_num_seqs: 1 + model_arch: Qwen2_5OmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + gpu_memory_utilization: 0.15 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: audio + engine_input_source: [1] + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + +# Top-level runtime config (concise): default windows and stage edges +runtime: + enabled: true + defaults: + window_size: -1 # Simplified: trigger downstream only after full upstream completion + max_inflight: 1 # Simplified: process serially within each stage + edges: + - from: 0 # thinker → talker: trigger only after receiving full input (-1) + to: 1 + window_size: -1 + - from: 1 # talker → code2wav: trigger only after receiving full input (-1) + to: 2 + window_size: -1 diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 3baa7ff8828..967d0cc6d72 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -86,8 +86,7 @@ Through five levels (L1-L5) and common (Common) specifications, the system clari /tests/e2e/online_serving/test_{model_name}_expansion.py
/tests/e2e/offline_inference/test_{model_name}_expansion.py
Performance:
- /tests/dfx/perf/tests/test_qwen_omni.json (Omni), test_tts.json (TTS),
- and /tests/dfx/perf/tests/test_{diffusion_model}_vllm_omni.json (Diffusion)
+ /tests/dfx/perf/tests/test.json
Doc Test:
tests/example/online_serving/test_{model_name}.py
tests/example/offline_inference/test_{model_name}.py @@ -105,8 +104,7 @@ Through five levels (L1-L5) and common (Common) specifications, the system clari Depends on reality Stability:
- /tests/dfx/stability/tests/test_qwen3_omni.json
- /tests/dfx/stability/tests/test_wan22.json
+ /tests/dfx/stability/tests/test.json
Reliability:
tests/e2e/reliability/test_{model_name}.py @@ -232,7 +230,8 @@ vllm_omni/ tests/ │ ├── test_qwen3_omni_expansion.py │ ├── test_mimo_audio.py │ ├── test_image_gen_edit.py - │ └── test_images_generations_lora.py + │ ├── test_images_generations_lora.py + │ └── stage_configs/ └── offline_inference/ ✅ ├── test_qwen2_5_omni.py ├── test_qwen3_omni.py @@ -243,17 +242,16 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_expansion.py + ├── test_stable_audio_model.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py ├── test_sequence_parallel.py - └── stage_configs/ (legacy schema, still - ├── bagel_*.yaml present for unmigrated - └── npu/, rocm/, etc. models) - -# Migrated models (qwen3_omni_moe, qwen2_5_omni, qwen3_tts) live under -# vllm_omni/deploy/ instead — see docs/configuration/stage_configs.md. + └── stage_configs/ + ├── qwen2_5_omni_ci.yaml + ├── qwen3_omni_ci.yaml + ├── bagel_*.yaml + └── npu/, rocm/, etc. ``` @@ -273,7 +271,7 @@ Before entering specific testing levels, the project establishes two common spec L1 and L2 level testing form the foundation of the quality assurance system. L1 level testing focuses on verifying the internal logic correctness of code units (e.g., functions, classes), ensuring each independent component behaves as designed. -L2 level testing builds upon L1 by introducing GPU resources and verifying that the end-to-end (E2E) process of the model in basic deployment scenarios is smooth. For example, it uses dummy models to confirm that core interfaces like the inference pipeline, output format, and streaming response work properly. The common goal of these two levels is to provide developers with rapid feedback, discovering and fixing issues early in the development cycle. 
+L2 level testing builds upon L1 by introducing GPU resources and verifying that the end-to-end (E2E) process of the model in basic deployment scenarios is smooth. For example, it uses dummy models to confirm that core interfaces like the inference pipeline, output format, and streaming response work properly. The common goal of these two levels is to provide developers with rapid feedback, discovering and fixing issues early in the development cycle. @@ -419,13 +417,13 @@ L3 level testing executes after code is merged into the main branch. Its core pu **Explanation**: - @pytest.mark.advanced_model: Marks the test as L3 merge level, indicating deep validation with real models. @pytest.mark.full_model: Marks L4 nightly-only suites (e.g. `test_*_expansion.py`, doc examples). + @pytest.mark.advanced_model: Marks the test as L3 or L4 level, indicating that this test case performs deep validation, using real models for performance, integration, and accuracy testing. This forms a "basic-advanced" correspondence with the core_model mark at the L2 level. @pytest.mark.core_model: Marks the test as L1 or L2 level, indicating that this test case validates the basic functionality of the core model. It uses mock weights and only checks if the relevant interface functions correctly. @pytest.mark.parametrize: A parameterization decorator that allows abstracting test data into parameters, enabling reuse of the same test logic across different data configurations. indirect=True indicates that parameters will be passed to the fixture for processing. - **Notes**: If you believe the test case only needs to execute basic run logic at the PR-level CI, you can mark it only with @pytest.mark.core_model. If you believe it only needs to execute deep validation at merge (L3), use @pytest.mark.advanced_model. For L4 nightly-only expansion and doc-example tests, use @pytest.mark.full_model with `--run-level full_model`. 
If the test case needs both basic run and deep validation, mark with @pytest.mark.core_model and the appropriate L3/L4 marker (`advanced_model` and/or `full_model`). + **Notes**: If you believe the test case only needs to execute basic run logic at the PR-level CI, you can mark it only with @pytest.mark.core_model. If you believe it only needs to execute deep validation run logic at the merge or nightly level, you can mark it only with @pytest.mark.advanced_model. If you believe the test case needs to accommodate both basic run and deep validation test logic, you should mark it with both @pytest.mark.core_model and @pytest.mark.advanced_model. **2.4.2 Test Function Definition and Documentation** @@ -517,11 +515,9 @@ L3 level testing executes after code is merged into the main branch. Its core pu **Single Request**: The comment clearly states this is a single-request completion test. For concurrent testing, it can be extended to multiple requests using request_num = n. - **Implicit Validation**: The `send_omni_request` and `send_diffusion_request` methods internally includes validation logic dynamically selected based on the --run-level parameter: core_model performs basic validation, while advanced_model and full_model perform deep validation. - -- ***Run Command (L3 merge)***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}.py -m advanced_model --run-level=advanced_model` + **Implicit Validation**: The `send_omni_request` and `send_diffusion_request` methods internally include validation logic dynamically selected based on the --run-level parameter: core_model performs basic validation, while advanced_model performs deep validation. 
-- ***Run Command (L4 nightly expansion)***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}_expansion.py -m full_model --run-level=full_model` +- ***Run Command***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}.py -m advanced_model --run-level=advanced_model` ## Chapter 3: L4 Level Testing - Full Functionality, Performance, and Documentation Testing @@ -534,13 +530,13 @@ L4 level testing is a comprehensive quality audit before a version release. It e ### 3.2 Testing Content and Scope - ***Full Functionality Testing***: Executes all test cases defined in `test_{model_name}_expansion.py`, covering all implemented features, positive flows, boundary conditions, and exception handling. -- ***Performance Testing***: Uses `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, and diffusion configs in the form `tests/dfx/perf/tests/test_*_vllm_omni.json` (passed to `run_benchmark.py` via `--test-config-file`) to drive performance testing tools for stress, load, and endurance tests, collecting metrics like throughput, response time, and resource utilization. +- ***Performance Testing***: Uses the `tests/dfx/perf/tests/test.json` configuration file to drive performance testing tools for stress, load, and endurance tests, collecting metrics like throughput, response time, and resource utilization. - ***Documentation Testing***: Verifies whether the example code provided to users is runnable and its results match the description. ### 3.3 Test Directory and Execution Files - ***Functional Testing***: Same directories as L3. -- ***Performance Test Configuration***: `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, and diffusion configs `tests/dfx/perf/tests/test_*_vllm_omni.json` (e.g. 
`test_qwen_image_vllm_omni.json`) +- ***Performance Test Configuration***: `tests/dfx/perf/tests/test.json` - ***Documentation Example Tests***: - - `tests/example/online_serving/test_{model_name}.py` - `tests/example/offline_inference/test_{model_name}.py` @@ -575,12 +571,12 @@ L5 level testing focuses on the performance of model services under ***long-runn ### 4.2 Testing Content and Scope -- ***Long-term Stability (Stability) Testing***: Uses JSON under `tests/dfx/stability/tests/` (for example `test_qwen3_omni.json` and `test_wan22.json`) to run the service under moderate load for an extended period (e.g., over 12 hours), monitoring whether metrics like memory/VRAM usage, response time, and throughput degrade over time, and whether the service process remains stable. +- ***Long-term Stability (Stability) Testing***: Uses the `tests/dfx/stability/tests/test.json` configuration to run the service under moderate load for an extended period (e.g., over 12 hours), monitoring whether metrics like memory/VRAM usage, response time, and throughput degrade over time, and whether the service process remains stable. - ***Reliability Testing***: Uses `tests/e2e/reliability/test_{model_name}.py` to actively simulate various fault and abnormal scenarios, such as: dependent service interruption, abnormal input data, network flicker, hardware resource preemption, etc., to verify the system's fault tolerance, self-healing, and graceful degradation capabilities. ### 4.3 Test Directory and Execution Files -- ***Stability Test Configuration***: `tests/dfx/stability/tests/test_qwen3_omni.json`, `tests/dfx/stability/tests/test_wan22.json` (one JSON per model / runner family) +- ***Stability Test Configuration***: `tests/dfx/stability/tests/test.json` - ***Reliability Test Suite***: `tests/e2e/reliability/test_{model_name}.py` ### 4.4 Execution Method and Example @@ -591,7 +587,7 @@ L5 level testing focuses on the performance of model services under ***long-runn
Test Examples -When you want to add L5-level stability test cases, add or extend the appropriate JSON file under `tests/dfx/stability/tests/` (for example `test_qwen3_omni.json` for Omni bench traffic, or `test_wan22.json` for diffusion `/v1/videos` workloads). The following illustrates the Qwen3-Omni shape: +When you want to add L5-level stability test cases, you can refer to the following format for case addition in `tests/dfx/stability/tests/test.json`: ```json { @@ -662,7 +658,7 @@ All other optional parameters follow the same rules as the in Chapter 3.4.
-- - ***Stability***: `pytest -s -v tests/dfx/stability/scripts/test_stability_qwen3_omni.py` or `pytest -s -v tests/dfx/stability/scripts/test_stability_wan22.py` (or add `test_stability_.py` alongside a matching JSON config) +- - ***Stability***: `pytest -s -v tests/dfx/stability/scripts/test_{model_name}.py` - ***Reliability***: `pytest -s -v tests/e2e/reliability/test_{model_name}.py` ## Summary diff --git a/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md b/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md index e1309b1adeb..69d6ad82871 100644 --- a/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md +++ b/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md @@ -37,10 +37,10 @@ Currently all the features are available in online serving mode. Hence, only nee **Code Style** - Validation: test that the multimodal output files of your model have the correct shapes. `OpenAIClientHandler.send_diffusion_request` should have taken care of this. -- Test marks: always add `full_model` and `diffusion` for L4 nightly `test_*_expansion.py` cases. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/). +- Test marks: always add `advanced_model` and `diffusion`. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/). - To maximize code reuse, you may refer to - `tests/conftest.py` for `omni_server` (running server in subprocess) and `openai_client` fixtures (sending requests and validating output), `generate_synthetic_image` and `assert_XXX_valid` helper. - - `tests/helpers/mark.py` for `@hardware_test(...)` and `hardware_marks`. + - `tests/utils.py` for `@hardware_test(...)` and `hardware_marks`. - [Parametrizing tests (pytest doc)](https://docs.pytest.org/en/stable/example/parametrize.html) to reuse test function implementation for different cases. 
- Doc: add a concise docstring for each test function. - Reference L4 test implementation: [tests/e2e/online_serving/test_qwen_image_edit_expansion.py](https://github.com/vllm-project/vllm-omni/blob/main/tests/e2e/online_serving/test_qwen_image_edit_expansion.py). diff --git a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md index f1f3073dc52..8093e1459f5 100644 --- a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md +++ b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md @@ -1,4 +1,4 @@ -When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, or diffusion configs such as `tests/dfx/perf/tests/test_*_vllm_omni.json` (selected via `pytest ... run_benchmark.py --test-config-file `): +When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in tests/dfx/perf/tests/test.json: ```JSON { diff --git a/docs/contributing/ci/test_guide.md b/docs/contributing/ci/test_guide.md index 018c47b053f..425f24332c2 100644 --- a/docs/contributing/ci/test_guide.md +++ b/docs/contributing/ci/test_guide.md @@ -42,63 +42,32 @@ Our test scripts use the pytest framework. First, please use `git clone https:// ``` The latest test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-ready.yml). 
-=== "L3 level" +=== "L3 level & L4 level" ```bash + cd tests pytest -s -v -m "advanced_model" --run-level=advanced_model ``` - If you only want to run a specific test case, you can use: - ```bash - pytest -s -v test_xxxx.py --run-level=advanced_model - ``` - If you only want to run specific test cases on a particular platform, you can use: - ```bash - pytest -s -v -m "advanced_model and distributed_cuda and L4" --run-level=advanced_model - ``` - The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml). - - -=== "L4 level" - + If you only want to run L3 test case, you can use: ```bash - cd tests - pytest -s -v -m "full_model" --run-level=full_model + pytest -s -v e2e/ --ignore-glob='*expansion.py' -m "advanced_model" --run-level=advanced_model ``` If you only want to run a specific test case, you can use: ```bash - pytest -s -v test_xxxx.py --run-level=full_model + pytest -s -v test_xxxx.py --run-level=advanced_model ``` If you only want to run specific test cases on a particular platform, you can use: ```bash - pytest -s -v -m "full_model and distributed_cuda and L4" --run-level=full_model + pytest -s -v -m "core_model and distributed_cuda and L4" --run-level=core_model ``` - Note: To run performance tests (defaults to ``test_qwen_omni.json``; use ``--test-config-file tests/dfx/perf/tests/test_tts.json`` for TTS): + Note: To run performance tests, use: ```bash - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py + pytest -s -v perf/scripts/run_benchmark.py ``` - The latest L4 (nightly) test commands use the `full_model` marker and `--run-level full_model` (see [test-nightly.yml](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly.yml) and [test-nightly-diffusion.yml](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly-diffusion.yml)). 
Example: - - ```bash - cd tests - pytest -s -v -m "full_model and omni and H100" --run-level=full_model - ``` - -=== "L5 level" - - L5 includes stability and reliability testing. Typical commands: - ```bash - cd tests - - # Stability: Qwen3-Omni - pytest -s -v dfx/stability/scripts/test_stability_qwen3_omni.py - - # Stability: Wan2.2 (v1/videos diffusion benchmark loop) - pytest -s -v dfx/stability/scripts/test_stability_wan22.py - - ``` + The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml). - The latest L5 commands for CI can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-ready.yml). + The latest L4 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly.yml). You can find more information about markers in the documentation: [marker doc](./tests_markers.md) diff --git a/docs/contributing/ci/tests_markers.md b/docs/contributing/ci/tests_markers.md index 6130541a617..7c1ba1c73bd 100644 --- a/docs/contributing/ci/tests_markers.md +++ b/docs/contributing/ci/tests_markers.md @@ -8,8 +8,7 @@ Defined in `pyproject.toml`: | Marker | Description | | ------------------ | --------------------------------------------------------- | | `core_model` | L1&L2 tests (run in each PR) | -| `advanced_model` | L3 tests (run on each merge to main) | -| `full_model` | L4 tests (run nightly) | +| `advanced_model` | L3&L4 level tests (run in each merge or nightly) | | `diffusion` | Diffusion model tests | | `omni` | Omni model tests | | `cache` | Cache backend tests | @@ -39,7 +38,7 @@ Defined in `pyproject.toml`: ### Example usage for markers ```python -from tests.helpers.mark import hardware_test +from tests.utils import hardware_test @pytest.mark.core_model @pytest.mark.omni @@ -54,7 +53,7 @@ def test_video_to_audio() ### Decorator: 
`@hardware_test` -This decorator is intended to make hardware-aware, cross-platform test authoring easier and more robust for CI/CD environments. The `hardware_test` decorator in `vllm-omni/tests/helpers/mark.py` performs the following actions: +This decorator is intended to make hardware-aware, cross-platform test authoring easier and more robust for CI/CD environments. The `hardware_test` decorator in `vllm-omni/tests/utils.py` performs the following actions: 1. **Applies platform and resource markers** Adds the appropriate pytest markers for each specified hardware platform (e.g., `cuda`, `rocm`, `xpu`, `npu`) and resource type (e.g., `L4`, `H100`, `MI325`, `B60`, `A2`, `A3`). @@ -106,7 +105,7 @@ This decorator is intended to make hardware-aware, cross-platform test authoring `hardware_marks` returns a list of pytest mark objects with the same signature as `@hardware_test`. Use it when you need more flexibility, such as attaching hardware marks to individual `pytest.param` entries rather than an entire test function. ```python -from tests.helpers.mark import hardware_marks +from tests.utils import hardware_marks MULTI_CARD_MARKS = hardware_marks( res={"cuda": "H100", "rocm": "MI325", "npu": "A2"}, num_cards=2 @@ -134,9 +133,9 @@ If you want to add support for a new platform (e.g., "tpu" for a new accelerator "distributed_tpu: Tests that require multiple TPU devices", ] ``` -2. **Implement a marker construction function for your platform** in `vllm-omni/tests/helpers/mark.py`: +2. 
**Implement a marker construction function for your platform** in `vllm-omni/tests/utils.py`: ```python - # In vllm-omni/tests/helpers/mark.py + # In vllm-omni/tests/utils.py def tpu_marks(*, res: str, num_cards: int): test_platform = pytest.mark.tpu @@ -176,4 +175,4 @@ If you want to add support for a new platform (e.g., "tpu" for a new accelerator - Plug into `hardware_marks` - You're done: tests using `@hardware_test` or `hardware_marks` with your platform now automatically get the correct markers, distribution, and isolation! -See code in `vllm-omni/tests/helpers/mark.py` for existing examples (`cuda_marks`, `rocm_marks`, `npu_marks`). +See code in `vllm-omni/tests/utils.py` for existing examples (`cuda_marks`, `rocm_marks`, `npu_marks`). diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 3a8cb0f127c..8b10cf4cc1c 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -135,7 +135,8 @@ vllm_omni/ tests/ │ ├── test_qwen3_omni_expansion.py │ ├── test_mimo_audio.py │ ├── test_image_gen_edit.py - │ └── test_images_generations_lora.py + │ ├── test_images_generations_lora.py + │ └── stage_configs/ └── offline_inference/ ✅ ├── test_qwen2_5_omni.py ├── test_qwen3_omni.py @@ -146,18 +147,17 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_expansion.py + ├── test_stable_audio_model.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py ├── test_sequence_parallel.py ├── test_qwen_image_edit_expansion.py - └── stage_configs/ (legacy schema, still present - ├── bagel_*.yaml for unmigrated models) + └── stage_configs/ + ├── qwen2_5_omni_ci.yaml + ├── qwen3_omni_ci.yaml + ├── bagel_*.yaml └── npu/, rocm/, etc. - -# Migrated models (qwen3_omni_moe, qwen2_5_omni, qwen3_tts) live under -# vllm_omni/deploy/ instead — see docs/configuration/stage_configs.md. 
examples/ tests │ └── examples ├── online_serving/ → ├── online_serving/ @@ -221,13 +221,14 @@ from pathlib import Path import openai import pytest -from tests.helpers.media import ( - convert_audio_bytes_to_text, +from tests.conftest import ( + OmniServer, + convert_audio_to_text, cosine_similarity_text, + dummy_messages_from_mix_data, generate_synthetic_video, + merge_base64_and_convert_to_text, ) -from tests.helpers.runtime import OmniServer, dummy_messages_from_mix_data -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config from vllm_omni.platforms import current_omni_platform # Edit: model name and stage config path @@ -235,7 +236,7 @@ models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] #If you use the default configuration file, you can directly use the following address. def get_default_config(): - return get_deploy_config_path("ci/qwen3_omni_moe.yaml") + return str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml") #If you need to modify the configuration file, you can use modify_stage_config. 
def get_chunk_config(): @@ -404,7 +405,7 @@ def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> N # PURPOSE: Verify text and audio outputs convey the same information # CUSTOMIZATION: Adjust similarity threshold (0.9) based on accuracy requirements assert audio_data is not None, "No audio output is generated" - audio_content = convert_audio_bytes_to_text(audio_data) + audio_content = merge_base64_and_convert_to_text(audio_data) print(f"text content is: {text_content}") print(f"audio content is: {audio_content}") similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) @@ -427,7 +428,7 @@ from pathlib import Path import pytest from vllm.assets.video import VideoAsset -from tests.helpers.mark import hardware_test +from tests.utils import hardware_test from ..multi_stages.conftest import OmniRunner # Optional: set process start method for workers diff --git a/docs/contributing/model/adding_diffusion_model.md b/docs/contributing/model/adding_diffusion_model.md index 6d5782a6e3c..dfa550173cf 100644 --- a/docs/contributing/model/adding_diffusion_model.md +++ b/docs/contributing/model/adding_diffusion_model.md @@ -802,7 +802,7 @@ omni = Omni(model="your-model", enable_layerwise_offload=True) ```python class WanTransformer3DModel(nn.Module): - _layerwise_offload_blocks_attrs = ["blocks"] # Attribute name containing transformer blocks + _layerwise_offload_blocks_attr = "blocks" # Attribute name containing transformer blocks def __init__(self): self.blocks = nn.ModuleList([...]) # Transformer blocks @@ -813,16 +813,16 @@ class WanTransformer3DModel(nn.Module): --- -### Diffusion Pipeline Profiler (Performance Profiling) +### Diffusion Timing (Performance Profiling) When adapting a new diffusion model, it is often useful to analyze the latency of key components such as text encoding, diffusion denoising, and VAE decoding. 
vLLM-Omni provides a timing utility via `DiffusionPipelineProfilerMixin` to help developers quickly identify performance bottlenecks. !!! info - `DiffusionPipelineProfilerMixin` is different from using `torch.profiler` for diffusion models, as introduced in this [tutorial](https://github.com/vllm-project/vllm-omni/blob/main/docs/contributing/profiling.md). `DiffusionPipelineProfilerMixin` only prints the timing information of multiple functions (such as `vae.decode`), while `torch.profiler` saves detailed GPU/CPU computation time, call/execution steps. + `DiffusionPipelineProfilerMixin` is different from using `torch.profiler` for diffusion models, as introduced in this [tutorial](https://github.com/vllm-project/vllm-omni/blob/main/docs/contributing/profiling.md#3-profiling-diffusion-models). `DiffusionPipelineProfilerMixin` only prints the timing information of multiple functions (such as `vae.decode`), while `torch.profiler` saves detailed GPU/CPU computation time, call/execution steps. This tool automatically measures the execution time of selected pipeline modules and prints the results in the logs. -**Enabling Diffusion Pipeline Profiler** +**Enabling Diffusion Timing** Enable timing by setting: @@ -843,7 +843,7 @@ If not specified, the default targets are used: **Adding DiffusionPipelineProfilerMixin to a Pipeline** To enable timing support in your pipeline, inherit from DiffusionPipelineProfilerMixin. ```python -from vllm_omni.diffusion.profiler import DiffusionPipelineProfilerMixin +from vllm_omni.diffusion.utils.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin class YourModelPipeline(nn.Module, DiffusionPipelineProfilerMixin): # Optional: Specify custom timing targets @@ -862,9 +862,7 @@ class YourModelPipeline(nn.Module, DiffusionPipelineProfilerMixin): ... 
# initialize timing profiler - self.setup_diffusion_pipeline_profiler( - enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler - ) + self.setup_diffusion_pipeline_profiler(enable_diffusion_pipeline_profiler) ``` The mixin dynamically wraps selected methods and records their execution time during inference. @@ -908,9 +906,9 @@ tokenizer.forward When enabled, timing logs appear like this: ``` -[DiffusionPipelineProfiler] text_encoder.forward took 0.018s -[DiffusionPipelineProfiler] diffuse took 2.412s -[DiffusionPipelineProfiler] vae.decode took 0.063s +[DiffusionTiming] text_encoder.forward took 0.018s +[DiffusionTiming] diffuse took 2.412s +[DiffusionTiming] vae.decode took 0.063s ``` These measurements help identify bottlenecks during model adaptation and optimization diff --git a/docs/contributing/model/adding_omni_model.md b/docs/contributing/model/adding_omni_model.md index 1eaff10596c..a0619e33811 100644 --- a/docs/contributing/model/adding_omni_model.md +++ b/docs/contributing/model/adding_omni_model.md @@ -313,7 +313,7 @@ The registry uses lazy loading, so the model class is imported only when needed. ## Stage Configuration -Create a YAML configuration file in `vllm_omni/deploy/`. For a complete example, see the [Qwen3-Omni configuration file](gh-file:vllm_omni/deploy/qwen3_omni_moe.yaml). +Create a YAML configuration file in `vllm_omni/model_executor/stage_configs/`. For a complete example, see the [Qwen3-Omni configuration file](gh-file:vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml). 
### Key Configuration Fields @@ -408,17 +408,18 @@ Understanding the data structures is crucial for implementing stage transitions: **Input to your function:** - `stage_list[source_stage_id].engine_outputs`: List of `EngineCoreOutput` objects -- - Each contains `outputs`: List of `RequestOutput` objects - - Each `RequestOutput` has: -- - - `token_ids`: Generated token IDs - - `multimodal_output`: Dict with keys like `"code_predictor_codes"`, etc.These are the hidden states or intermediate outputs from the model's forward pass - - `prompt_token_ids`: Original prompt token IDs + - Each contains `outputs`: List of `RequestOutput` objects + - Each `RequestOutput` has: + - `token_ids`: Generated token IDs + - `multimodal_output`: Dict with keys like `"code_predictor_codes"`, etc. + - These are the hidden states or intermediate outputs from the model's forward pass + - `prompt_token_ids`: Original prompt token IDs **Output from your function:** - Must return `list[OmniTokensPrompt]` where each `OmniTokensPrompt` contains: -- - `prompt_token_ids`: List[int] - Token IDs for the next stage - - `additional_information`: Dict[str, Any] - Optional metadata (e.g., embeddings, hidden states) - - `multi_modal_data`: Optional multimodal data if needed + - `prompt_token_ids`: List[int] - Token IDs for the next stage + - `additional_information`: Dict[str, Any] - Optional metadata (e.g., embeddings, hidden states) + - `multi_modal_data`: Optional multimodal data if needed ### How Model Outputs Are Stored @@ -613,7 +614,7 @@ For a complete reference implementation, see: - **Thinker**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py` - **Talker**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py` - **Code2Wav**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_code2wav.py` -- **Stage config**: `vllm_omni/deploy/qwen3_omni_moe.yaml` +- **Stage config**: `vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml` - **Input processors**: 
`vllm_omni/model_executor/stage_input_processors/qwen3_omni.py` - **Registry**: `vllm_omni/model_executor/models/registry.py` - **Testing**: `vllm_omni/tests/e2e/offline_inference/test_qwen3_omni.py` diff --git a/docs/contributing/model/adding_tts_model.md b/docs/contributing/model/adding_tts_model.md index 34fd2dbb503..e48ae5049ff 100644 --- a/docs/contributing/model/adding_tts_model.md +++ b/docs/contributing/model/adding_tts_model.md @@ -1,93 +1,20 @@ # Adding a TTS Model -This guide walks through adding a new TTS model to vLLM-Omni. Two patterns are -supported: - -- **Two-stage pipeline** (e.g. Qwen3-TTS, Fish Speech): an AR code-predictor stage - feeds an audio decoder stage via the `async_chunk` framework. This is the standard - pattern for maximum streaming performance. -- **Single-stage AR model** (e.g. MOSS-TTS-Nano): the model runs entirely inside one - AR worker and streams audio chunks directly from its own `inference_stream()` generator. - -Qwen3-TTS is used as the reference for the two-stage pattern. For the single-stage -pattern, refer to MOSS-TTS-Nano. +This guide walks through adding a new TTS model to vLLM-Omni, using **Qwen3-TTS** +as a reference. Qwen3-TTS demonstrates the standard two-stage TTS pipeline and the +key optimizations all TTS models in this repo should follow. ## Table of Contents 1. [Overview](#overview) -2. [Cross-Cutting Invariants](#cross-cutting-invariants) -3. [Directory Structure](#directory-structure) -4. [Step-by-Step Implementation](#step-by-step-implementation) -5. [Key Components](#key-components) -6. [Model Registration](#model-registration) -7. [Stage Configuration](#stage-configuration) -8. [Stage Input Processors](#stage-input-processors) -9. [Online Serving Integration](#online-serving-integration) -10. [Single-Stage Models](#single-stage-models) -11. [Testing](#testing) -12. [Pre-commit and DCO](#pre-commit-and-dco) -13. 
[Summary](#summary) - -## Cross-Cutting Invariants - -These rules apply to every TTS model regardless of architecture (AR vs AR+diffusion, -single-stage vs two-stage, codec-based vs VAE-based). Each has surfaced as a silent -bug in a shipped PR — check them at the end of every phase, not just at the start. - -**I1. Streaming output contract.** Pick one per-step semantics for `forward()` and -document it in the docstring: - -- *Delta*: yield only new audio samples produced this step. Preferred — linear cost. -- *Cumulative*: re-decode from step 0 every call. O(N²); only acceptable when the - codec exposes no streaming decode. - -If you choose delta, audit the full chain: `forward()` returns the new chunk → -`_consolidate_multimodal_tensors()` in `vllm_omni/engine/output_processor.py` -concatenates the audio key into a single tensor at finish → streaming consumers -receive per-step chunks, offline consumers receive the concatenated tensor. A -mismatch (consolidator skips the key with `continue`, or consumers expect a list -but receive a tensor) is invisible in offline RTF benchmarks — users hear replays -or truncation only under live playback. - -**I2. Multimodal output consumer hygiene.** `outputs[0].outputs[0].multimodal_output[key]` -can be `Tensor`, `list[Tensor]` (pre-consolidation snapshot), `np.ndarray`, or -scalar. In every test, example, and benchmark: - -- Never write `dict.get("a") or dict.get("b")` on tensor values — Python evaluates - the tensor's truthiness and raises `Boolean value of Tensor with more than one - value is ambiguous`. Use explicit `if x is None` chains. -- Defensively handle the list form: - `if isinstance(x, list): x = torch.cat([t.reshape(-1) for t in x], dim=0)`. -- Assert `shape` / `dtype` / `duration` explicitly — do not rely on truthiness for - presence checks. - -**I3. 
Hot-loop GPU discipline.** Inside any per-step model loop (AR decode, -diffusion solver, CFM Euler step, per-frame vocoder): - -- No `tensor.item()`, `.cpu()`, or `.tolist()` — each triggers a GPU→CPU sync; a - 10-step × 60-frame × 4-op loop creates 2400 syncs per request. -- Prefer `dst.copy_(src)` over `dst.fill_(src.item())` for scalar-into-buffer writes. -- Whole-model `torch.compile(Model.forward, fullgraph=False)` usually outperforms - per-submodule compile — fewer dispatch boundaries, larger fusion regions. Measure - before choosing granularity. -- No Python control flow that depends on tensor values; use `torch.where` or masking. - -Profile before optimizing. - -**I4. Validation pyramid.** Offline RTF alone is necessary but not sufficient. A -new TTS model must pass all three levels: - -| Layer | Catches | Tool | -|-------|---------|------| -| Offline RTF / duration | Throughput regressions, missing audio, wrong sample rate | `end2end.py`, pytest e2e | -| Browser streaming playback | Delta-vs-cumulative bugs, chunk boundary glitches, TTFP regressions | Gradio demo over `/v1/audio/speech?stream=true` | -| Concurrent requests | Per-request state leaks, codec window round-robin gaps | `max_num_seqs>1` smoke with 4+ parallel prompts | - -**I5. Per-request state belongs to the request.** If the model caches anything -across `forward()` calls (streaming generators, codec buffers, sliding-window pads, -CUDA graph state), key it by `info.get("_omni_req_id")` and free the entry on -request finish. A shared buffer silently corrupts audio across concurrent requests — -the symptom is crosstalk or truncation under load, nothing in single-request tests. +2. [Directory Structure](#directory-structure) +3. [Step-by-Step Implementation](#step-by-step-implementation) +4. [Key Components](#key-components) +5. [Model Registration](#model-registration) +6. [Stage Configuration](#stage-configuration) +7. [Stage Input Processors](#stage-input-processors) +8. 
[Testing](#testing) +9. [Summary](#summary) ## Overview @@ -101,7 +28,7 @@ and can be placed on different devices. Qwen3-TTS has two stages: Each stage is a separate model class configured independently via YAML. The two stages are connected by the `async_chunk` framework, which enables inter-stage streaming for -low first-packet latency (see [Async Chunk Design](../../design/feature/async_chunk.md)). +low first-packet latency (see [Async Chunk Design](../../design/feature/async_chunk_design.md)). ### Without async_chunk (batch mode) @@ -193,18 +120,8 @@ vllm_omni/model_executor/stage_configs/ | `models/qwen3_tts/qwen3_tts.py` | Unified model class | | `models/qwen3_tts/qwen3_tts_code_predictor_vllm.py` | Stage 0 - optimized AR | | `models/qwen3_tts/qwen3_tts_code2wav.py` | Stage 1 - decoder | -| `deploy/qwen3_tts.yaml` (new schema) | Deploy config (async_chunk enabled) — paired with `models/qwen3_tts/pipeline.py` for the frozen topology | - -> **Chunked vs end-to-end modes**: `qwen3_tts` registers a single -> pipeline whose stage 1 declares alternate processor functions — an -> `async_chunk_process_next_stage_input_func` (per-chunk streaming, used -> when `deploy.async_chunk=True`) and a `sync_process_input_func` -> (batch-end, used when `deploy.async_chunk=False`). The loader selects -> one at merge time based on the bool, so `--no-async-chunk` alone -> switches modes — no variant yaml or variant pipeline registration is -> needed. Pipelines that only make sense in one mode (e.g. -> `qwen3_omni_moe` is always chunked) can keep using the unconditional -> `custom_process_*` fields. 
+| `stage_configs/qwen3_tts.yaml` | Stage config (async_chunk enabled) | +| `stage_configs/qwen3_tts_batch.yaml` | Batch mode config | | `stage_input_processors/qwen3_tts.py` | Stage transition processors | ## Step-by-Step Implementation @@ -629,302 +546,6 @@ Recommended test cases for a new TTS model: Reference test: `tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py` -### E2E Online Serving Tests (`tests/e2e/online_serving/test_.py`) - -The `omni_server` fixture in `tests/conftest.py` is **module-scoped**. Each distinct -`OmniServerParams` id in the same test file forces the fixture to tear the server -down and spawn a new one mid-module. A few rules that save real CI debugging time: - -- **Prefer a single `OmniServerParams` set per file.** If you need to exercise two - deploy variants (e.g. `model.yaml` and `model_async_chunk.yaml`), either use one - variant and exercise streaming via request args, or split into two test files so - each file does exactly one server lifecycle. Mid-module teardown/restart is the - fragile path and surfaces startup races first. -- **Never depend on server-side fetching of external URLs** for reference audio or - other fixture data. CI runners (and China-hosted dev boxes) routinely fail on - SSL/DNS for public URLs. Inline the payload as a `data:audio/wav;base64,...` - ref_audio value — the serving layer accepts both forms. -- **Don't roll your own readiness probe.** The harness already waits for HTTP 200 - on `/health` before releasing the server to the test. If your model needs extra - warmup signals, expose them through `/health` rather than adding `time.sleep(...)` - inside the test. (Bare TCP `connect_ex` probes were insufficient; see - `tests/conftest.py` `OmniServer.wait_for_ready`.) -- **Use `core_model` marker + H100 hardware_test** to match the `test-ready.yml` - pipeline so your test is picked up by the `ready` label, not only nightly. 
- -## Online Serving Integration - -To expose your model through the `/v1/audio/speech` OpenAI-compatible endpoint, add -**all five** of the following integration points to -`vllm_omni/entrypoints/openai/serving_speech.py` in a **single commit**. Adding them -piecemeal causes partial-integration failures that are hard to debug. - -### 1. Stage constant - -Near the top of the file, alongside the other `_*_TTS_MODEL_STAGES` constants: - -```python -_YOUR_MODEL_TTS_MODEL_STAGES = {"your_model_stage_key"} -``` - -### 2. Union into `_TTS_MODEL_STAGES` - -Add to the `_TTS_MODEL_STAGES` set union: - -```python -_TTS_MODEL_STAGES: set[str] = ( - ... - | _YOUR_MODEL_TTS_MODEL_STAGES -) -``` - -### 3. Model type detection - -In `_detect_tts_model_type()`, add before the final `return None`: - -```python -if model_stage in _YOUR_MODEL_TTS_MODEL_STAGES: - return "your_model" -``` - -### 4. Request validation dispatch - -In `_validate_tts_request()`, add before the fallback `return`: - -```python -if self._tts_model_type == "your_model": - return self._validate_your_model_request(request) -``` - -### 5. Validation and parameter-builder methods - -Add two new methods: - -```python -def _validate_your_model_request( - self, request: OpenAICreateSpeechRequest -) -> str | None: - """Validate YourModel request. Returns an error string or None.""" - if not request.input or not request.input.strip(): - return "Input text cannot be empty" - return None - -def _build_your_model_params( - self, request: OpenAICreateSpeechRequest -) -> dict[str, Any]: - """Build additional_information dict for YourModel.""" - params: dict[str, Any] = {"text": [request.input]} - if request.voice is not None: - params["voice"] = [request.voice] - # Add any other model-specific fields here - return params -``` - -Then wire `_build_your_model_params` into the request-dispatch block in -`_create_tts_request()` (search for the equivalent `_build_*_params` call for an -existing model to find the right location). 
If the model supports voice cloning -(`ref_audio` → `prompt_audio_path`, `ref_text` → `prompt_text`), add those mappings -here too — follow any existing `_build__params` in `serving_speech.py` (e.g. -`_build_moss_tts_params` for the voice-cloning variant) for the pattern. - -> **Two dispatch patterns coexist:** Fish Speech uses a `self._is_fish_speech` boolean -> checked *before* `elif self._is_tts`. All newer models use the `_tts_model_type` -> string pattern shown above. For new models, always use the string pattern — do not -> add new `_is_*` boolean flags. - -> **Note on unused variables:** Only extract parameters in `_build_your_model_params` -> that you actually pass to the model's generate / `inference_stream` call. Extracting -> a variable without forwarding it will trigger a `ruff F841` pre-commit failure. - -### Merge conflicts - -`serving_speech.py` is modified by every new model PR and is the most common source of -rebase conflicts. When rebasing onto `main` and a conflict appears here, the resolution -is always to **keep both** the upstream model's additions and your own — never discard -either side. After resolving: - -```bash -git add vllm_omni/entrypoints/openai/serving_speech.py -git rebase --continue -``` - -## Single-Stage Models - -Some TTS models (e.g. MOSS-TTS-Nano) do not use a two-stage pipeline. Instead the -entire AR LM and audio decoder run inside a single AR worker, streaming audio chunks -directly from the model's own generator. - -### Directory structure - -``` -vllm_omni/model_executor/models/your_model_name/ - __init__.py - modeling_your_model_name.py # unified class: load_weights + forward + streaming - -vllm_omni/model_executor/stage_configs/your_model_name.yaml -``` - -No stage input processor is needed. - -### Stage config - -Use a single stage with `worker_type: ar`. The `is_comprehension: true` field and the -top-level `async_chunk: false` are required — omitting them causes silent -misclassification in the serving layer. 
Set `max_num_seqs` to at least 4 for -concurrent production use. - -```yaml -# stage_configs/your_model_name.yaml -async_chunk: false - -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true # required for serving_speech.py dispatch - runtime: - devices: "0" - engine_args: - model_stage: your_model_stage_key - model_arch: YourModelForCausalLM - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - engine_output_type: audio - max_num_seqs: 4 # min 4 for concurrent requests; default 1 causes gaps - final_output: true - final_output_type: audio -``` - -### Generator-based streaming pattern - -This is the MOSS-TTS-Nano pattern, distinct from VoxCPM2's vLLM-native AR pattern -(see `plan/voxcpm2_native_ar_design.md` for that variant). Load model weights in -`load_weights()` (not `__init__`) so vLLM finishes distributed initialisation before -any CUDA allocations. Stream via a per-request generator stored in an instance dict: - -```python -class YourModelForCausalLM(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self._lm: nn.Module | None = None # populated in load_weights() - self._stream_gens: dict[str, Any] = {} # request_key → generator - - def load_weights(self, weights): - # Load self._lm here, after vLLM distributed init - ... - - def forward( - self, - input_ids, - positions, - intermediate_tensors=None, - inputs_embeds=None, - runtime_additional_information: list[dict] | None = None, # one dict per request - **kwargs, - ) -> OmniOutput: - infos = runtime_additional_information or [{}] - # Return empty output during dummy/profiling calls - if not runtime_additional_information or all(i.get("_is_dummy") for i in infos): - self._ar_emit_stop_token = True - return OmniOutput(...) 
- - outputs, last_flags = [], [] - for info in infos: - request_key = str(info.get("_omni_req_id", "0")) # set by vLLM, not user code - if request_key not in self._stream_gens: - self._stream_gens[request_key] = self._create_stream_gen(info) - try: - chunk, is_last = next(self._stream_gens[request_key]) - except StopIteration: - chunk, is_last = torch.zeros(0), True - if is_last: - del self._stream_gens[request_key] - outputs.append(chunk) - last_flags.append(is_last) - - self._ar_emit_stop_token = all(last_flags) - return OmniOutput(multimodal_outputs={"model_outputs": outputs, "is_last": last_flags}) - - def _create_stream_gen(self, info: dict): - """Yield (waveform_tensor, is_last) from the model's inference_stream(). - - Handle both incremental ("audio" events) and batch ("result" event) models: - some upstream implementations emit one "result" event with the full waveform - instead of incremental "audio" events. Both paths must be covered. - """ - for event in self._lm.inference_stream(...): - if event["type"] == "audio": - yield event["waveform"], False - elif event["type"] == "result": - # Fallback for models that don't emit incremental audio events - yield event.get("waveform", torch.zeros(0)), True - return - yield torch.zeros(0), True - - def compute_logits(self, hidden_states, sampling_metadata): - # Emit EOS only when the last chunk has been yielded so the AR - # scheduler ends the request at the right time. - ... -``` - -For an in-tree reference, look for any single-stage AR model under -`vllm_omni/model_executor/models/` (for example -`moss_tts_nano/modeling_moss_tts_nano.py` once its integration has landed). - -## Pre-commit and DCO - -All contributions must pass the pre-commit checks and the Developer Certificate of -Origin (DCO) sign-off before merging. - -### Running pre-commit - -Install the hooks once with `pre-commit install`. 
Then run before committing: - -```bash -pre-commit run --files \ - vllm_omni/model_executor/models/your_model_name/*.py \ - vllm_omni/entrypoints/openai/serving_speech.py \ - vllm_omni/model_executor/models/registry.py \ - tests/e2e/offline_inference/test_your_model_name.py \ - tests/e2e/online_serving/test_your_model_name.py -``` - -When pre-commit **modifies files**, it exits with a non-zero code but the reformatting -is correct. Stage the modified files and commit again — do not revert the changes. - -Common failures and fixes: - -| Check | Cause | Fix | -|-------|-------|-----| -| `ruff F841` | Local variable assigned but never used | Remove the extraction or forward it to the model call | -| `ruff E402` | Module-level import not at top of file | Move import to the top-level import block | -| `ruff format` | Line length, spacing, or quote style | Accept the auto-fix, stage, and re-commit | - -### DCO sign-off - -Every commit must carry a `Signed-off-by` trailer. Use the `-s` flag when committing: - -```bash -git commit -s -m "feat(your-model): add YourModel TTS support" -``` - -Or configure git to add it automatically: - -```bash -git config format.signOff true -``` - -To fix a missing sign-off on the most recent commit: - -```bash -git commit --amend -s --no-edit -git push origin your-branch --force-with-lease -``` - -> The DCO check verifies that the commit author email matches the `Signed-off-by` email. -> Make sure `git config user.email` is set to the address associated with your GitHub -> account before committing. - ## Adding a Model Recipe After implementing and testing your model, add a model recipe to the @@ -936,19 +557,15 @@ for the expected format. Adding a TTS model to vLLM-Omni involves: -1. **Create model directory** with AR stage, decoder stage, and unified class (two-stage) - or a single unified class with generator-based streaming (single-stage) +1. **Create model directory** with AR stage, decoder stage, and unified class 2. 
**AR stage** - use vLLM's native decoder layers with fused QKV; do not wrap HF directly 3. **Decoder stage** - thin wrapper around your audio decoder; implement `chunked_decode_streaming()` 4. **Unified class** - dispatches on `model_stage`; same structure as `Qwen3TTSModelForGeneration` 5. **Register** all stage classes in `registry.py` -6. **YAML configs** - provide both batch and `async_chunk` variants (two-stage), or a single-stage AR config -7. **Stage input processor** - buffer Stage 0 outputs and forward in chunks of 25 (two-stage only) -8. **Online serving** - add all 5 integration points to `serving_speech.py` in one commit -9. **Tests** - cover single request, batching, and streaming -10. **Pre-commit + DCO** - run `pre-commit` before pushing; sign every commit with `git commit -s` -11. **Model recipe** - add to [vllm-project/recipes](https://github.com/vllm-project/recipes) -12. **Invariants** - re-check I1–I5 (streaming contract, consumer hygiene, hot-loop discipline, validation pyramid, per-request state) at the end of every phase +6. **YAML configs** - provide both batch and `async_chunk` variants +7. **Stage input processor** - buffer Stage 0 outputs and forward in chunks of 25 +8. **Tests** - cover single request, batching, and async_chunk streaming +9. 
**Model recipe** - add to [vllm-project/recipes](https://github.com/vllm-project/recipes) ### Qwen3-TTS Reference Files @@ -957,12 +574,11 @@ Adding a TTS model to vLLM-Omni involves: | `models/qwen3_tts/qwen3_tts.py` | Unified model class | | `models/qwen3_tts/qwen3_tts_code_predictor_vllm.py` | AR stage with vLLM fused ops | | `models/qwen3_tts/qwen3_tts_code2wav.py` | Decoder stage with `chunked_decode_streaming()` | -| `models/qwen3_tts/pipeline.py` | Frozen pipeline topology (registered at import time) | -| `deploy/qwen3_tts.yaml` | Deploy config (user-editable, async_chunk + SharedMemoryConnector) | +| `stage_configs/qwen3_tts.yaml` | Stage configuration | | `stage_input_processors/qwen3_tts.py` | Stage transition processors | For more information, see: - [Architecture Overview](../../design/architecture_overview.md) -- [Async Chunk Design](../../design/feature/async_chunk.md) +- [Async Chunk Design](../../design/feature/async_chunk_design.md) - [Stage Configuration Guide](../../configuration/stage_configs.md) diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index e1dbc8234b0..7a2e64f1312 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -1,286 +1,216 @@ -# Profiling Diffusion Models +# Profiling vLLM-Omni -> **Warning:** Profiling is for development and debugging only. It adds significant overhead and should not be enabled in production. +> **Warning:** Profiling incurs significant overhead. Use only for development and debugging, never in production. -Diffusion profiling supports two backends through `profiler_config`: +vLLM-Omni uses the PyTorch Profiler to analyze performance across both **multi-stage omni-modality models** and **diffusion models**. -- `torch`: detailed CPU/CUDA traces, operator tables, and optional memory snapshots -- `cuda`: low-overhead CUDA range control for NVIDIA Nsight Systems (`nsys`) +### 1. Configure Profiling in the Stage YAML -## 1. 
Configure `profiler_config` - -Use `profiler_config` to enable profiling for a diffusion model. For diffusion usage, pass it directly to `Omni(...)` or `vllm serve`. - -Minimal torch-profiler config: +Enable profiling by adding `profiler_config` under `engine_args` for the stage(s) you want to profile in your stage config YAML: ```yaml -profiler_config: - profiler: torch - torch_profiler_dir: ./perf +stage_args: + - stage_id: 0 + stage_type: llm + engine_args: + # ... other engine args ... + profiler_config: + profiler: torch + torch_profiler_dir: ./perf ``` -Supported fields: - | Field | Description | |---|---| -| `profiler` | Profiler backend. Supported values: `torch`, `cuda`. Use `torch` for `trace.json`, Excel operator tables, and optional memory snapshots. Use `cuda` for Nsight Systems only. | -| `torch_profiler_dir` | Output directory for torch-profiler artifacts. Required when `profiler: torch`. | -| `torch_profiler_use_gzip` | Compress `trace_rank*.json` into `trace_rank*.json.gz`. | -| `torch_profiler_record_shapes` | Record input shapes and add a `by_shape` sheet to `ops_rank*.xlsx`. | -| `torch_profiler_with_stack` | Record call stacks, add a `by_stack` sheet to `ops_rank*.xlsx`, and export `stacks_cpu_rank*.txt` and `stacks_cuda_rank*.txt`. | -| `torch_profiler_with_memory` | Enable memory profiling and attempt to dump `memory_snapshot_rank*.pickle`. The pickle is only generated when the current backend supports memory history and snapshot APIs. | -| `torch_profiler_with_flops` | Enable FLOPs collection in `torch.profiler`. This does not add a separate output file. | -| `torch_profiler_dump_cuda_time_total` | Export an additional text summary `profiler_out_.txt` sorted by `self_cuda_time_total`. | -| `delay_iterations` | Number of worker iterations to skip before profiling starts. | -| `max_iterations` | Maximum number of worker iterations to capture before auto-stop. | -| `wait_iterations` | Torch-profiler wait iterations before warmup. 
| -| `warmup_iterations` | Torch-profiler warmup iterations. | -| `active_iterations` | Torch-profiler active iterations. | - -### Minimal configurations by output - -Only collect trace output: +| `profiler` | Profiler backend to use. Currently supports `torch`. | +| `torch_profiler_dir` | Directory where trace files are saved. Created automatically if it doesn't exist. | -```python -profiler_config = { - "profiler": "torch", - "torch_profiler_dir": "./perf", -} -``` +> **Tip:** Only enable `profiler_config` on stages you actually need to profile. Stages without it will not start a profiler, keeping overhead minimal. -Outputs: +### 2. Profiling Omni-Modality Models -- `trace_rank*.json` -- `ops_rank*.xlsx` with a `summary` sheet +**Selective Stage Profiling** -Collect compressed trace output: +It is highly recommended to profile specific stages to prevent producing overly large trace files: ```python -profiler_config = { - "profiler": "torch", - "torch_profiler_dir": "./perf", - "torch_profiler_use_gzip": True, -} -``` - -Outputs: +# Profile all stages +omni_llm.start_profile() -- `trace_rank*.json.gz` -- `ops_rank*.xlsx` with a `summary` sheet +# Only profile Stage 1 +omni_llm.start_profile(stages=[1]) -Collect trace and full operator tables: - -```python -profiler_config = { - "profiler": "torch", - "torch_profiler_dir": "./perf", - "torch_profiler_record_shapes": True, - "torch_profiler_with_stack": True, -} +# Stage 0 (Thinker) and Stage 2 (Audio Decoder) for qwen omni +omni_llm.start_profile(stages=[0, 2]) ``` -Outputs: +> **Important:** Always pass the same `stages` list to both `start_profile()` and `stop_profile()`. If you omit `stages` from `stop_profile()`, it defaults to stopping all stages — including ones that were never started — which will produce errors. 
-- `trace_rank*.json` -- `ops_rank*.xlsx` with `summary`, `by_shape`, and `by_stack` -- `stacks_cpu_rank*.txt` -- `stacks_cuda_rank*.txt` - -Collect trace, operator tables, and memory snapshots: +**Python Usage**: Wrap your generation logic with `start_profile()` and `stop_profile()`. ```python -profiler_config = { - "profiler": "torch", - "torch_profiler_dir": "./perf", - "torch_profiler_record_shapes": True, - "torch_profiler_with_stack": True, - "torch_profiler_with_memory": True, -} -``` +profiler_stages = [0] # Only profile the stages you need -Outputs: +# 1. Start profiling +omni.start_profile(stages=profiler_stages) -- `trace_rank*.json` -- `ops_rank*.xlsx` with `summary`, `by_shape`, and `by_stack` -- `stacks_cpu_rank*.txt` -- `stacks_cuda_rank*.txt` -- `memory_snapshot_rank*.pickle` when supported by the current backend +# Initialize generator +omni_generator = omni.generate(prompts, sampling_params_list, py_generator=args.py_generator) -### Full torch-profiler configuration +total_requests = len(prompts) +processed_count = 0 -If you want to enable the commonly used torch-profiler options together: +# Main Processing Loop +for stage_outputs in omni_generator: -```python -profiler_config = { - "profiler": "torch", - "torch_profiler_dir": "./perf", - "torch_profiler_use_gzip": False, - "torch_profiler_record_shapes": True, - "torch_profiler_with_stack": True, - "torch_profiler_with_memory": True, - "torch_profiler_with_flops": False, - "torch_profiler_dump_cuda_time_total": False, - "delay_iterations": 0, - "max_iterations": 0, - "wait_iterations": 0, - "warmup_iterations": 0, - "active_iterations": 0, -} -``` + # ... [Output processing logic for text/audio would go here] ... -## 2. Profiling Diffusion with PyTorch Profiler + # Update count to track when to stop profiling + processed_count += len(stage_outputs.request_output) -Single-stage diffusion models use `start_profile()` / `stop_profile()` controls. 
The profiler only writes artifacts after profiling has been started and then stopped. + # 2. Check if all requests are done to stop the profiler safely + if profiler_enabled and processed_count >= total_requests: + print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") -```python -from vllm_omni import Omni - -omni = Omni( - model="Wan-AI/Wan2.2-I2V-A14B-Diffusers", - profiler_config={ - "profiler": "torch", - "torch_profiler_dir": "./perf", - }, -) - -omni.start_profile() -... -omni.stop_profile() -``` + # Stop the profiler while workers are still active + # Pass the same stages list used in start_profile() + omni.stop_profile(stages=profiler_stages) -For diffusion offline example scripts under `examples/offline_inference/`, pass `--profiler-config` as a JSON object. The script enables profiling when this argument is set and wraps generation with `start_profile()` / `stop_profile()`. + # Wait for traces to flush to disk + print("[Info] Waiting 30s for workers to write trace files to disk...") + time.sleep(30) + print("[Info] Trace export wait time finished.") + +omni.close() +``` -Example: +**CLI Usage** (using `end2end.py`): ```bash -python examples/offline_inference/image_to_video/image_to_video.py \ - --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \ - --image input.jpg \ - --prompt "A cat playing with yarn" \ - --profiler-config '{ - "profiler": "torch", - "torch_profiler_dir": "./perf", - "torch_profiler_record_shapes": true, - "torch_profiler_with_stack": true - }' -``` +# Profile only Stage 0 (Thinker) +python end2end.py --output-wav output_audio \ + --query-type text --enable-profiler --profiler-stages 0 -Examples: +# Profile Stage 0 and Stage 2 +python end2end.py --output-wav output_audio \ + --query-type text --enable-profiler --profiler-stages 0 2 -1. [Image edit example](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) -2. 
[Image to video example](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video) +# Profile all stages (omit --profiler-stages) +python end2end.py --output-wav output_audio \ + --query-type text --enable-profiler +``` -## 3. Profiling Diffusion with Nsight Systems (`nsys`) +**Examples**: -For Nsight Systems, use `profiler: cuda` and wrap the process with `nsys profile`. +1. **Qwen2.5-Omni**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py) -```bash -nsys profile \ - --trace-fork-before-exec=true \ - --cuda-graph-trace=node \ - --capture-range=cudaProfilerApi \ - --capture-range-end=repeat \ - -o diffusion_trace \ - python image_to_video.py ... -``` +2. **Qwen3-Omni**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py) -The Python process being profiled must create the diffusion engine with: +### 3. Profiling diffusion models -```python -profiler_config = {"profiler": "cuda"} +Diffusion profiling is End-to-End, capturing encoding, denoising loops, and decoding. Standalone diffusion scripts use `--profiler-dir` to enable profiling. + +**CLI Usage:** +```bash +python image_to_video.py \ + --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \ + --image qwen-bear.png \ + --prompt "A cat playing with yarn, smooth motion" \ + --profiler-dir \ + \ + # Minimize Spatial Dimensions (Optional but helpful): + # Drastically reduces memory usage so the profiler doesn't + # crash due to overhead, though for accurate performance + # tuning you often want target resolutions. + --height 48 \ + --width 64 \ + \ + # Minimize Temporal Dimension (Frames): + # Video models process 3D tensors (Time, Height, Width). 
+ # Reducing frames to the absolute minimum (2) keeps the + # tensor size small, ensuring the trace file doesn't become + # multi-gigabytes in size. + --num-frames 2 \ + \ + # Minimize Iteration Loop (Steps): + # This is the most critical setting for profiling. + # Diffusion models run the same loop X times. + # Profiling 2 steps gives you the exact same performance + # data as 50 steps, but saves minutes of runtime and + # prevents the trace viewer from freezing. + --num-inference-steps 2 \ + \ + --guidance-scale 5.0 \ + --guidance-scale-high 6.0 \ + --boundary-ratio 0.875 \ + --flow-shift 12.0 \ + --fps 16 \ + --output i2v_output.mp4 ``` -Then call `start_profile()` before the requests you want to capture and `stop_profile()` after them. The diffusion worker processes open and close the CUDA capture range themselves, so `nsys` sees the actual GPU work instead of only the parent process. +> **Note:** For diffusion stages within a multi-stage omni pipeline, use `profiler_config` in the stage YAML instead (see Section 1). -## 4. Profiling Online Serving +**Examples**: -When `profiler_config.profiler` is set for a diffusion model, the server exposes: +1. **Qwen image edit**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) -- `POST /start_profile` -- `POST /stop_profile` +2. **Wan-AI/Wan2.2-I2V-A14B-Diffusers**: [https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video) -### Start the server +### 4. Profiling Online Serving -Single-stage diffusion serving with torch profiler: +When `profiler_config` is set in the stage YAML, the server automatically exposes `/start_profile` and `/stop_profile` HTTP endpoints. +**1. 
Start the server** with a stage YAML that has `profiler_config` enabled: ```bash -vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \ - --omni \ - --port 8091 \ - --profiler-config '{ - "profiler": "torch", - "torch_profiler_dir": "/tmp/vllm_profile_wan22_i2v", - "torch_profiler_with_stack": true, - "torch_profiler_with_flops": false, - "torch_profiler_use_gzip": true, - "torch_profiler_dump_cuda_time_total": false, - "torch_profiler_record_shapes": true, - "torch_profiler_with_memory": true, - "delay_iterations": 0, - "max_iterations": 0, - "wait_iterations": 0, - "warmup_iterations": 0, - "active_iterations": 0 - }' +vllm serve Qwen/Qwen2.5-Omni-7B \ + --omni \ + --stage-configs-path qwen2_5_omni.yaml \ + --port 8091 ``` -Single-stage diffusion serving with Nsight Systems: +Or, for single-stage diffusion models, pass `--profiler-config` on the command line: ```bash -nsys profile \ - --trace-fork-before-exec=true \ - --cuda-graph-trace=node \ - --capture-range=cudaProfilerApi \ - --capture-range-end=repeat \ - -o serving_trace \ - vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \ - --omni \ - --port 8091 \ - --profiler-config '{"profiler": "cuda"}' +vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers --omni --port 8091 --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}' ``` -### Control capture - -Example profiling flow for an online Qwen-Image request: - +**2. Start profiling** by sending a POST request: ```bash -# Start profiling. +# Profile all stages that have profiler_config set curl -X POST http://localhost:8091/start_profile -# Send a Qwen-Image generation request while profiling is active. -curl http://localhost:8091/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Qwen/Qwen-Image", - "prompt": "A red vintage bicycle parked beside a quiet canal at sunset" - }' - -# Stop profiling and flush profiler artifacts. 
-curl -X POST http://localhost:8091/stop_profile +# Profile specific stages only +curl -X POST http://localhost:8091/start_profile \ + -H "Content-Type: application/json" \ + -d '{"stages": [0]}' ``` -## 5. Diffusion Pipeline Profiler +**3. Send your inference requests** as normal while the profiler is running. -For lightweight per-stage pipeline timing such as `vae.decode` or `diffuse`, see [Diffusion Pipeline Profiler](model/adding_diffusion_model.md#diffusion-pipeline-profiler-performance-profiling). That utility logs stage durations only and does not generate torch-profiler artifacts such as `trace.json`, Excel tables, or memory snapshots. +**4. Stop profiling** and collect traces: +```bash +# Stop all stages +curl -X POST http://localhost:8091/stop_profile + +# Stop specific stages (must match the stages you started) +curl -X POST http://localhost:8091/stop_profile \ + -H "Content-Type: application/json" \ + -d '{"stages": [0]}' +``` -## 6. Analyze Results +Trace files are written to the `torch_profiler_dir` specified in your stage YAML. -Torch-profiler output: +> **Important:** Always stop the same stages you started. Stopping a stage that was never started will produce errors. -- Chrome/Perfetto trace: `trace_rank*.json` or `trace_rank*.json.gz` -- Excel workbook: `ops_rank*.xlsx` with `summary`, and optional `by_shape` / `by_stack` sheets -- Stack exports: `stacks_cpu_rank*.txt` and `stacks_cuda_rank*.txt` when stack capture is enabled -- Memory snapshot: `memory_snapshot_rank*.pickle` when memory capture is enabled and supported by the backend -- Optional CUDA-time text summary: `profiler_out_.txt` when `torch_profiler_dump_cuda_time_total` is enabled +### 5. Analyzing Traces -CUDA profiler / Nsight Systems output: +Output files are saved to the `torch_profiler_dir` specified in your stage YAML config. -- `.nsys-rep` report files written by `nsys -o ...` +**Output** +**Chrome Trace** (`.json.gz`): Visual timeline of kernels and stages. Open in Perfetto UI. 
-Recommended viewers: +**Viewing Tools:** -- [Perfetto](https://ui.perfetto.dev/) for torch traces -- `nsys stats .nsys-rep` for CLI summaries -- Nsight Systems GUI for CUDA kernel timelines +- [Perfetto](https://ui.perfetto.dev/) (recommended) +- `chrome://tracing` (Chrome only) -For upstream background on the underlying vLLM profiling infrastructure, see the [vLLM profiling guide](https://docs.vllm.ai/en/stable/contributing/profiling/). +**Note**: vLLM-Omni reuses the PyTorch Profiler infrastructure from vLLM. See the official vLLM profiler documentation: [vLLM Profiling Guide](https://docs.vllm.ai/en/stable/contributing/profiling/) diff --git a/docs/design/feature/async_chunk.md b/docs/design/feature/async_chunk_design.md similarity index 80% rename from docs/design/feature/async_chunk.md rename to docs/design/feature/async_chunk_design.md index 57b4209b8df..202ef0e18e8 100644 --- a/docs/design/feature/async_chunk.md +++ b/docs/design/feature/async_chunk_design.md @@ -1,4 +1,4 @@ -# Async Chunk +# Async Chunk Design ## Table of Contents @@ -19,7 +19,7 @@ The `async_chunk` feature enables asynchronous, chunked processing of data acros For qwen3-omni: - **Thinker → Talker**: Per decode step (typically chunk_size=1) -- **Talker → Code2Wav**: Accumulated to `codec_chunk_frames` (default=25) before sending. During the initial phase, a dynamic initial chunk size (IC) is automatically selected based on server load to reduce TTFP. Use the per-request `initial_codec_chunk_frames` API field to override. +- **Talker → Code2Wav**: Accumulated to `codec_chunk_frames` (default=25) before sending. During the initial phase, a dynamic initial chunk size (IC) is automatically selected based on server load to reduce TTFA. Use the per-request `initial_codec_chunk_frames` API field to override. - **Code2Wav**: Streaming decode with code2wav chunk_size With `async_chunk`: @@ -75,85 +75,26 @@ Enabling **async_chunk** (False→True) sharply reduces time-to-first-audio (TTF

## Architecture +### Data Flow -### Async Chunk Pipeline Overview - -The following diagram illustrates the **Async Chunk Architecture** for multi-stage models (e.g., Qwen3-Omni with Thinker → Talker → Code2Wav), showing how data flows through the 4-stage pipeline with parallel processing and dual-stream output: - +#### Sequential Flow

- - Async Chunk Pipeline Architecture + + Data Flow between stages

-**Diagram Legend:** - -| Step | Stage Type | Description | -|------|-----------|------------| -| `prefill` | Initialization | Context processing, KV cache initialization | -| `decode` | Autoregressive | Token-by-token generation in AR stages | -| `codes` | Audio Encoding | RVQ codec codes from Talker stage | -| `output` | Final Output | Text chunks or audio waveforms | - -### Data Flow - -#### Stage 0: Thinker (Multimodal Understanding + Text Generation) -- **Prefill**: Processes multimodal input (text/image/audio/video), initializes KV cache -- **Decode Loop**: Generates text tokens autoregressively -- **Chunk Triggers**: Each decode step (typically `chunk_size=1`) can trigger downstream processing -- **Dual Output**: - - **Text Stream**: `text_0`, `text_1`, `text_2`... `text_n` streamed to output - - **Hidden States**: Passed to Talker stage for audio synthesis - -#### Stage 1: Talker (Text → RVQ Audio Codes) -- **Prefill**: Receives hidden states from Thinker as semantic condition -- **Decode Loop**: Generates RVQ codec codes autoregressively -- **Accumulation**: Codes accumulate to `codec_chunk_frames` (default=25) before forwarding -- **Dynamic IC**: Initial chunk size auto-selected based on server load to optimize TTFP -- **Output**: `codes` blocks (chunk 0, 1, ... n) sent to Code2Wav - -#### Stage 2: Code2Wav (Vocoder Decoder) -- **Non-Autoregressive**: Processes RVQ codes in parallel batches -- **Streaming Decode**: Converts codes to audio waveforms chunk-by-chunk -- **Batching**: Supports batched inference for multiple concurrent requests -- **Output**: Audio segments `audio_0`, `audio_1`, ... `audio_n` - -#### Stage 3: Output (Dual Stream) -- **Text Streaming**: `text_0` → `text_1` → `text_2` → ... (user sees response in real-time) -- **Audio Streaming**: `audio_0` → `audio_1` → ... 
(user hears audio progressively) - -### Execution Timeline +#### Async Chunk Flow -``` -Timeline: Parallel vs Sequential - -Sequential (async_chunk=false): -[Thinker: ████████████████████] (2.0s) - [Talker: ████████████████████] (3.0s) - [Code2Wav: ████] (1.0s) -Total: 6.0s, TTFP: 6.0s - -Async Chunk (async_chunk=true): -[Thinker: ████░░░░████░░░░████] (2.0s, streaming) - [Talker: ░░████░░░░████░░] (3.0s, parallel) - [Code2Wav: ░░░░████░░] (1.0s, batched) -Total: ~3.5s, TTFP: ~0.5s - -█ = Active computation ░ = Waiting/idle -``` - -#### Sequential Flow (for comparison)

- - Sequential Data Flow + + Data Flow between stages

-In sequential mode, each stage must wait for the previous stage to complete entirely before starting. - -### Async Chunk System Architecture +### Async Chunk Architecture

diff --git a/docs/design/feature/cfg_parallel.md b/docs/design/feature/cfg_parallel.md index c73a87749f5..64decbe9560 100644 --- a/docs/design/feature/cfg_parallel.md +++ b/docs/design/feature/cfg_parallel.md @@ -25,9 +25,7 @@ In standard Classifier-Free Guidance, each diffusion step requires two forward p 1. **Positive/Conditional**: Guided by the text prompt 2. **Negative/Unconditional**: Typically using empty or negative prompt -Some models require 3 or more CFG branches (see [N-Branch CFG](#n-branch-cfg-3-branches)). - -CFG-Parallel eliminates this bottleneck by distributing the forward passes across different GPU ranks, allowing them to execute simultaneously rather than sequentially. +CFG-Parallel eliminates this bottleneck by distributing the two forward passes across different GPU ranks, allowing them to execute simultaneously rather than sequentially. ### Architecture @@ -35,11 +33,9 @@ vLLM-omni provides `CFGParallelMixin` that encapsulates all CFG parallel logic. | Method | Purpose | Automatic Behavior | |--------|---------|-------------------| -| [`predict_noise_maybe_with_cfg()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Predict noise with 2-branch CFG | Detects parallel mode, distributes computation, gathers results | -| [`predict_noise_with_multi_branch_cfg()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Predict noise with N-branch CFG | Round-robin dispatches N branches across M GPUs | +| [`predict_noise_maybe_with_cfg()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Predict noise with CFG | Detects parallel mode, distributes computation, gathers results | | [`scheduler_step_maybe_with_cfg()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Step scheduler | All ranks step locally (no broadcast needed) | -| 
[`combine_cfg_noise()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Combine 2-branch predictions | Applies CFG formula with optional normalization | -| [`combine_multi_branch_cfg_noise()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Combine N-branch predictions | Override for custom multi-branch combine logic | +| [`combine_cfg_noise()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Combine positive/negative | Applies CFG formula with optional normalization | | [`predict_noise()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Forward pass wrapper | Override for custom transformer calls | | [`cfg_normalize_function()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Normalize CFG output | Override for custom normalization | @@ -61,22 +57,6 @@ vLLM-omni provides `CFGParallelMixin` that encapsulates all CFG parallel logic. - All ranks compute the scheduler step locally — no broadcast needed because `predict_noise_maybe_with_cfg` already ensures all ranks have identical noise predictions after `all_gather` + local combine. -### N-Branch CFG (3+ branches) - -Some models require more than 2 CFG branches. For example, Bagel and OmniGen2 use 3 branches, DreamID Omni uses 4 branches. - -`predict_noise_with_multi_branch_cfg()` handles these by automatically dispatching N branches across M GPUs using round-robin (rule: branch `i` → rank `i % M`): - -| Branches (N) | GPUs (M) | Dispatch | -|:---:|:---:|:---| -| 3 | 2 | `[[0, 2], [1]]` | -| 3 | 3 | `[[0], [1], [2]]` | -| 4 | 2 | `[[0, 2], [1, 3]]` | -| 4 | 3 | `[[0, 3], [1], [2]]` | -| 4 | 4 | `[[0], [1], [2], [3]]` | - -When a rank handles multiple branches, it runs them sequentially. 
After `all_gather`, all ranks execute `combine_multi_branch_cfg_noise()` locally, producing identical results. - --- ## Step-by-Step Implementation @@ -118,7 +98,6 @@ class YourModelPipeline(nn.Module, CFGParallelMixin): - `positive_kwargs`: transformer arguments for conditional (text-guided) prediction - `negative_kwargs`: transformer arguments for unconditional prediction (set to `None` if CFG disabled) - For image editing pipelines, add `output_slice=image_seq_len` to extract the generative image portion -- For models with 3+ CFG branches, see [Multi-Branch CFG](#multi-branch-cfg-3-branches) in the Customization section ### Step 2: Call `diffuse` @@ -192,42 +171,20 @@ class LongCatImagePipeline(nn.Module, CFGParallelMixin): ``` -### Multi-Branch CFG (3+ branches) - -For models with 3 or more CFG branches, use `predict_noise_with_multi_branch_cfg()` instead of `predict_noise_maybe_with_cfg()`, and override `combine_multi_branch_cfg_noise()` for custom combine logic. This interface also works for standard 2-branch CFG — just pass 2 branches in `branches_kwargs`. +### Override `combine_cfg_noise()` for Multi-Output Models -**Example (3-branch with dual guidance scale):** +When `predict_noise()` returns a tuple (e.g., video + audio), the default `combine_cfg_noise()` applies CFG to every element. 
Override it to apply different logic per element — for example, CFG on video but positive-only on audio: ```python -class YourMultiBranchPipeline(nn.Module, CFGParallelMixin): - def combine_multi_branch_cfg_noise(self, predictions, true_cfg_scale, cfg_normalize=False): - text_scale = true_cfg_scale["text"] - image_scale = true_cfg_scale["image"] - pos, ref, uncond = predictions - return uncond + image_scale * (ref - uncond) + text_scale * (pos - ref) - - def diffuse(self, ...): - for i, t in enumerate(timesteps): - positive_kwargs = {...} # conditional prompt - ref_neg_kwargs = {...} # negative prompt + reference - uncond_kwargs = {...} # unconditional - - noise_pred = self.predict_noise_with_multi_branch_cfg( - do_true_cfg=do_true_cfg, - true_cfg_scale={"text": text_guidance_scale, "image": image_guidance_scale}, - branches_kwargs=[positive_kwargs, ref_neg_kwargs, uncond_kwargs], - ) - latents = self.scheduler_step_maybe_with_cfg(noise_pred, t, latents, do_true_cfg) - - return latents +class MyVideoAudioPipeline(nn.Module, CFGParallelMixin): + def combine_cfg_noise(self, positive_noise_pred, negative_noise_pred, scale, normalize): + (video_pos, audio_pos) = positive_noise_pred + (video_neg, audio_neg) = negative_noise_pred + video_combined = super().combine_cfg_noise(video_pos, video_neg, scale, normalize) + return (video_combined, audio_pos) # audio: positive only, no CFG ``` -### Override Combine Functions - -There are two combine functions for different scenarios: - -- **`combine_cfg_noise()`** — Used by `predict_noise_maybe_with_cfg()`. Override when `predict_noise()` returns a tuple (e.g., video + audio) and you need per-element CFG logic. -- **`combine_multi_branch_cfg_noise()`** — Used by `predict_noise_with_multi_branch_cfg()`. Override to implement custom multi-branch combine formulas (see [Multi-Branch CFG](#multi-branch-cfg-3-branches) above). 
+This also requires `predict_noise()` to return a tuple (see [Override predict_noise](#override-predict_noise-for-custom-transformer-calls) above). ### Implement a Composite Scheduler for Multi-Output Models @@ -346,5 +303,4 @@ Adding CFG-Parallel support: 1. ✅ **Create mixin** - Inherit from `CFGParallelMixin` and implement `diffuse()` method 2. ✅ **(Optional) Customize** - Override `predict_noise()` or `cfg_normalize_function()` for custom behavior -3. ✅ **(Optional) Multi-branch** - For 3+ branch models, use `predict_noise_with_multi_branch_cfg()` and override `combine_multi_branch_cfg_noise()` -4. ✅ **Test** - Verify with `--cfg-parallel-size 2` (or 3/4 for multi-branch) and compare performance +3. ✅ **Test** - Verify with `--cfg-parallel-size 2` and compare performance diff --git a/docs/design/feature/expert_parallel.md b/docs/design/feature/expert_parallel.md deleted file mode 100644 index e05eec33613..00000000000 --- a/docs/design/feature/expert_parallel.md +++ /dev/null @@ -1,221 +0,0 @@ -# Expert Parallel - -This section describes how to add Expert Parallel (EP) to a diffusion transformer that uses Mixture-of-Experts (MoE) layers. -We use **HunyuanImage3.0** as the reference implementation. - ---- - -## Table of Contents - -- [Overview](#overview) -- [Step-by-Step Implementation](#step-by-step-implementation) -- [Testing](#testing) -- [Reference Implementations](#reference-implementations) -- [Summary](#summary) - ---- - -## Overview - -### What is Expert Parallel? - -**Expert Parallel** is a parallelism strategy in Mixture-of-Experts (MoE) models that distributes different expert networks across distinct computational devices. Each device holds and computes only a subset of experts (local experts), with tokens dispatched to and gathered from remote devices via collective communication operations (e.g., All-to-All, All-Gather). 
- -| Backend | Description | -|---------|-------------| -| `allgather_reducescatter` | Default backend based on allgather/reducescatter primitives, suitable for general EP+DP deployments.| - -## Configuration - -Enable EP by setting the `--enable-expert-parallel` flag. The EP size is automatically calculated as: - -```text -EP_SIZE = TP_SIZE × SP_SIZE × CFG_SIZE × DP_SIZE -``` - - -Where: - -- `TP_SIZE`: Tensor parallel size -- `SP_SIZE`: Sequence parallel size -- `CFG_SIZE`: Classifier-free guidance parallel size -- `DP_SIZE`: Data parallel size -- `EP_SIZE`: Expert parallel size (computed automatically) - -Note: -- Expert parallelism is only applicable to Mixture-of-Experts (MoE) models. -- The EP group is created **per pipeline stage**, meaning it includes all ranks that participate in model parallelism except pipeline parallelism. -- The underlying communication pattern for expert parallelism is **All-to-All** among the ranks in the EP group. - -For example, consider a configuration with `TP=2`, `SP=1`, `CFG=2`, and `DP=4` (total 2×1×2×4 = 16 GPUs). - -- Expert layers are handled by an EP group of size 16. - -- Attention layers use tensor parallelism of size 2 within each of the 8 DP groups (because `DP×CFG×SP = 4×2×1 = 8` groups, each containing the 2 TP ranks). Inside each such group, the attention weights are sharded across the 2 GPUs. - - -## Step-by-Step Implementation - -### Step 1: Configure Expert Parallelism Settings - -Calculate local experts per rank: - -``` -ep_size = 8 # Expert Parallel size (typically equals TP size) -num_experts = 64 -num_local_experts = num_experts // ep_size # 8 experts per card - -# Check divisibility -assert num_experts % ep_size == 0, "Experts must be divisible by EP size" -``` - -### Step 2: Use Sparse MoE Block to enable EP routing. 
- -Example: -``` -from vllm.model_executor.layers.linear import ReplicatedLinear -class HunYuanSparseMoeBlock(nn.Module): - def __init__( - self, - config: PretrainedConfig, - layer_id: int = -1, - prefix: str = "", - ): - super().__init__() - self.tp_size = get_tensor_model_parallel_world_size() - self.n_routed_experts = config.num_experts # 64 - - # Calculate local experts per rank (key for EP) - if self.tp_size > self.n_routed_experts: - raise ValueError(f"TP size {self.tp_size} > experts {self.n_routed_experts}") - - # Routing gate (replicated on all ranks, computes scores for all tokens to all experts) - self.gate = ReplicatedLinear( - config.hidden_size, - config.num_experts, - bias=False, - quant_config=None, - prefix=f"{prefix}.gate", - ) - - # EP expert layer (factory loads platform-specific implementation) - self.experts = HunyuanFusedMoE(...) -``` -**Key Points:** -- gate is **ReplicatedLinear** (replicated on all ranks) -- experts is created via **HunyuanFusedMoE factory**, which automatically handles EP dispatch - -### Step 3: Initialize EP Runtime - -Initialize the EP communication context before model loading. -``` -from vllm.utils.import_utils import resolve_obj_by_qualname -# Call during __init__ or model loading -op_name = "hunyuan_fused_moe" - -# Prepare EP runtime: establish communication groups, assign local expert indices, init _expert_map -current_omni_platform.prepare_diffusion_op_runtime(op_name) - -# Factory automatically resolves platform implementation (GPU: FusedMoE / NPU: AscendFusedMoE) -impl = resolve_obj_by_qualname( - current_omni_platform.get_diffusion_model_impl_qualname(op_name) -) -``` - -### Step 4: Expert Weight Mapping & Loading - -Each rank loads only the expert weights assigned to its local allocation. 
-``` -# Get expert parameter mapping (different per rank) -expert_mapping = HunyuanFusedMoE.make_expert_params_mapping( - model=self, - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=64, - num_redundant_experts=0, -) -# Returns: [(param_name, weight_name, expert_id, shard_id), ...] -# Note: Each rank only contains mappings for its local expert_ids - -# Filter non-local experts during loading -for name, loaded_weight in weights: - if "mlp.experts" in name: - # Parse expert_id from weight name (implementation needed) - expert_id = parse_expert_id_from_name(name) - local_expert_start = (ep_rank) * num_local_experts - local_expert_end = (ep_rank + 1) * num_local_experts - - if not (local_expert_start <= expert_id < local_expert_end): - continue # Skip non-local expert weights -``` -### Step 5: Forward Pass with EP - -Example (MoE Forward): -``` -def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - orig_shape = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - - # 1. Global routing computation (all tokens, all expert scores) - # hidden_states: [num_tokens, hidden_dim] (full tensor) - router_logits, _ = self.gate(hidden_states) # [num_tokens, num_experts] - - # 2. EP dispatch and compute (HunyuanFusedMoE handles all_to_all internally) - # - Dispatch: Send tokens to target ranks based on router_logits - # - Local Compute: Each rank processes only its num_local_experts - # - Combine: Results returned to original token positions - final_hidden_states = self.experts( - hidden_states=hidden_states, - router_logits=router_logits, - ) - - # 3. Add shared expert output (not EP, computed on all ranks) - if self.shared_mlp is not None: - shared_out = self.shared_mlp(hidden_states) - final_hidden_states = final_hidden_states + shared_out - - # 4. 
Tensor Parallel All-Reduce (synchronize across TP group) - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) - - return final_hidden_states.view(orig_shape) -``` - -## Testing -After adding Expert Parallel support, test via command line: -```bash -cd examples/offline_inference/text_to_image -python text_to_image.py \ - --model Your-org/your-model \ - --prompt "a cup of coffee on the table" \ - --output "ep_enabled.png" \ - --num-inference-steps 50 \ - --guidance-scale 5.0 \ - --tensor-parallel-size 8 \ - --seed 1234 \ - --enable-expert-parallel -``` - -vLLM‑Omni currently focuses on core diffusion model inference acceleration, so the Expert Parallel implementation includes only the basic multi‑GPU expert sharding functionality (enabled via --enable-expert-parallel). Advanced features such as communication backend selection (--all2all-backend), load balancing (--enable-eplb and its configuration), and multi‑node deployment belong to the extended capabilities of the main vLLM project and have not yet been integrated into Omni. - -## Reference Implementations - -Complete examples in the codebase: - -| Model | Path | Pattern | Notes | -|-------|------|---------|-------| -| **HunyuanImage3.0** | `vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py` | Standard EP | Full implementation with validation | -| **EP Tests** | `vllm-omni/tests/e2e/offline_inference/test_expert_parallel.py` | E2E testing | EP correctness and performance | -| **Constraint Tests** | `vllm-omni/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py` | Unit testing | Validation logic | - ---- -## Summary - -Adding Expert Parallel support to diffusion model: - -1. **Identify MoE layers** - Locate the router and expert networks in each transformer block. -2. **Validate EP constraints** – Ensure num_experts is divisible by expert_parallel_size. -3. 
**Test** - Run with enable-expert-parallel, check memory reduction, speedup, and output quality against single‑GPU baseline. diff --git a/docs/design/feature/omni_connectors/mooncake_transfer_engine_connector.md b/docs/design/feature/omni_connectors/mooncake_transfer_engine_connector.md index 306a0620b4b..798644b96ff 100644 --- a/docs/design/feature/omni_connectors/mooncake_transfer_engine_connector.md +++ b/docs/design/feature/omni_connectors/mooncake_transfer_engine_connector.md @@ -33,8 +33,8 @@ runtime: zmq_port: 50051 # ZMQ base port (see "Port Offset Scheme" below) protocol: "rdma" # "rdma" or "tcp" device_name: "" # RDMA device (e.g., "mlx5_0"), empty for auto-detect - memory_pool_size: 4294967296 # 4 GB (CPU); use 2147483648 (2 GB) for GPU - memory_pool_device: "cpu" # "cpu" for pinned memory (recommended), "cuda" for GPUDirect RDMA + memory_pool_size: 2147483648 # 2GB memory pool + memory_pool_device: "cpu" # "cpu" for pinned memory, "cuda" for GPUDirect RDMA ``` Wire stages to the connector: @@ -64,8 +64,8 @@ stage_args: | Parameter | Default | Description | |---|---|---| -| `memory_pool_size` | 4 GB (CPU) / 2 GB (GPU) | Total size of the RDMA-registered memory pool in bytes. Recommended 4 GB for CPU pinned memory; 2 GB for GPU VRAM to conserve device memory. | -| `memory_pool_device` | `"cpu"` | `"cpu"`: pinned host memory (recommended, works on all topologies). `"cuda"`: GPU VRAM for GPUDirect RDMA (requires NIC-GPU direct PCIe connectivity, PIX topology). | +| `memory_pool_size` | 1 GB | Total size of the RDMA-registered memory pool in bytes. | +| `memory_pool_device` | `"cpu"` | `"cpu"`: pinned host memory (recommended). `"cuda"`: GPU VRAM for GPUDirect RDMA (requires NIC-GPU direct PCIe connectivity). 
| ### Networking @@ -107,10 +107,10 @@ receiver_connect = remote_side_channel_port + tp_rank ## Memory Pool Modes -| Mode | Config | Recommended Pool Size | Data Flow | Best For | -|---|---|---|---|---| -| CPU Pinned | `memory_pool_device: "cpu"` | 4 GB | GPU → CPU pool → RDMA → CPU pool → GPU | Most hardware topologies (recommended) | -| GPUDirect | `memory_pool_device: "cuda"` | 2 GB | GPU → GPU pool → RDMA (NIC reads GPU BAR1) → GPU pool | NIC-GPU direct PCIe (PIX topology) | +| Mode | Config | Data Flow | Best For | +|---|---|---|---| +| CPU Pinned | `memory_pool_device: "cpu"` | GPU → CPU pool → RDMA → CPU pool → GPU | Most hardware topologies (recommended) | +| GPUDirect | `memory_pool_device: "cuda"` | GPU → GPU pool → RDMA (NIC reads GPU BAR1) → GPU pool | NIC-GPU direct PCIe (PIX topology) | > **Note**: GPUDirect RDMA requires the NIC and GPU to share a direct PCIe > switch (PIX topology). On systems where they are connected via PXB or NODE, diff --git a/docs/design/feature/prefix_caching.md b/docs/design/feature/prefix_caching.md deleted file mode 100644 index ebad8b69106..00000000000 --- a/docs/design/feature/prefix_caching.md +++ /dev/null @@ -1,164 +0,0 @@ -# Automatic Prefix Caching in Omni Models - - ---- - -## Table of Contents - -- [Overview](#overview) -- [High-Level Approach](#high-level-approach) -- [Example](#example) -- [What About Multimodal Inputs?](#what-about-multimodal-inputs) - ---- - -### Overview - -Prefix caching in the context of kv-cache management is a useful optimization for avoiding redundant computations. The main idea is that we store portions of the kv-cache from processed requests, so that we can reuse them if incoming requests have the same prefix as previous requests. - -vLLM manages the kv-cache as blocks, which represent a span of tokens of a fixed length. 
Blocks are hashable by the content that they contain, which typically means the tokens within the span, but also could be influenced by other factors, e.g., LoRA and multimodal data. - -vLLM implements automatic prefix caching for managing its kv-cache, which is best understood by reading the design document [here](https://docs.vllm.ai/en/latest/design/prefix_caching/). vLLM-Omni builds on top of the prefix caching mechanism in a noninvasive way to allow caching between stages in Omni pipelines. This typically means for a given stage we aim to support caching for the following: - -- The last hidden states produced by the stage -- Model / stage specific multimodal data - -!!! note "Note 1" - This document describes vLLM-Omni's mechanism for caching tensor outputs that are meant to be passed between stages, when requests have common prefixes, similar to the way in which vLLM has prefix caching for the kv-cache. This works in conjunction with vLLM's multimodal encoder caching, but is distinct. See the final section for a concrete example for how they tie together in practice. - -### High-Level Approach -!!! note "Note 2" - Prior to reading this section, it's recommended to take a look at the design documents in vLLM for [Automatic Prefix Caching](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching/), which will make some of the concepts more clear. - -The main focus of vLLM-Omni's approach to prefix caching stage outputs is to build on vLLM's prefix caching in the least invasive way possible while minimizing impact for cache misses, and consuming a minimal amount of GPU memory. To understand the implementation, there are a few important things to note: - -- Between stages, device tensors are generally moved to CPU; this is important since we're just caching the outputs of stages, so it is okay to keep the entire cache on the CPU. 
- -- For a tensor to be considered cacheable, the first dimension (currently) needs to be the same as the token count, as it allows us to reuse block/slot mappings for our externally maintained tensor caches. This allows us to dynamically discover the tensors to be marked as cacheable outputs in each Omni model without having to explicitly specify cacheable output field names in every model. - -With this in mind, consider the set of blocks in a 2D layout, where the row represents the index of blocks being considered, and the columns represent the slots corresponding to tokens within each block. Since we know the `num_blocks` and `block_size` from our kv cache config, if we want to cache a tensor with feature size `D`, we can preallocate a CPU tensor of size `(num_blocks, block_size, D)`, and use the same block index and slot mapping to retrieve the corresponding feature vector. - - -### Example -!!! note "Note 3" - Prefix caching in vLLM-Omni currently is only supported on AutoRegressive stages with one kv-cache group. It can be enabled/disabled per-stage via the `enable_prefix_caching` parameter in the model's stage config. - -The way in which vLLM-Omni ties into vLLM's prefix caching is best understood by example. Say that we have the following: - -- `num_blocks=8` -- `block_size=4` -- `hidden_size=2` -- A stage specific multimodal output tensor named `mm_feature` with feature dimension `16` - -The prefix cache flow is then outlined below. - -1. When the model is initialized, we can determine the `hidden_size` from the `ModelConfig`, and allocate a cache of size `(num_blocks, block_size, hidden_size)`. - -2. Say we process the request `The quick brown fox was tired and slept beneath the shady tree`, which is 12 tokens and evenly divides into 3 blocks as shown below. 
- -``` - [ The quick brown fox ] [ was tired and slept ] [beneath the shady tree ] -Block 1: |<--- block tokens ---->| -Block 2: |<------- prefix ------>| |<--- block tokens --->| -Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| -``` - -When the request processes, we inspect the multimodal outputs and identify the `mm_feature` tensor, which will be of shape `(seq_len, feature_dim)`, i.e., `(12, 16)` in this example. We note that the first axis is dependent on the `seq_len` and add a new cache_tensor of shape `(num_blocks, block_size, feature_dim)` to our multimodal cache for tensors. - - -3. If we lay out the cache as a 2D tensor of shape (`num_blocks`, `block_size`), we'll have something like the following: - -``` -0: [ The quick brown fox ] -1: [ was tired and slept ] -2: [beneath the shady tree ] -3: [EMPTY] -... -7: [EMPTY] -``` - -Or, if we flatten it down to 1D, -``` -0: The -1: quick -2: brown -3: fox -... -11: tree -12: [EMPTY] -... -``` - -which we can think of as row indices into the hidden states tensor if we view it as the 2D shape `(num_blocks x block_size, feature_dim)`. That is, the analogous flattened (from 3D -> 2D) mapping of the cache for hidden states becomes the following. -``` -0: -1: -2: -3: -... -11: -12: [EMPTY] -... -``` - -Similarly, for the multimodal outputs cache, the flattened coordinates are the same, but the `mm_feature` maps to vectors of length `16` instead of the hidden size of `2`. Note that in practice, we may have multiple multimodal output tensors per forward pass, which may have different names and different feature dimensions. - - -4. Now, say that we receive a new request `The quick brown fox jumped over the dog`. 
- -``` - [ The quick brown fox ] [ jumped over the dog ] -Block 1: |<--- block tokens ---->| -Block 2: |<------- prefix ------>| |<--- block tokens --->| -``` - -Here, we will have a cache hit for `Block 1` which will be detected by vLLM based on the hash of the first block when it's handling the prefix caching on the kv-cache. As a result, when we get the output from the scheduler, we will see that `num_computed_tokens=4` (corresponding to the cached first block), and we only need to process the remaining 4 new tokens in the new prefill. - -Since we have the block indices / slot mappings from the kv cache manager, we can simply mirror the mappings and leverage the same indices for the cached hidden states and multimodal outputs. This allows us to look up the correct tensors from our externally maintained 3D caches. - -``` -0: [ The quick brown fox ] < already in the cache -1: [ was tired and slept ] -2: [beneath the shady tree ] -3: [ jumped over the dog ] < added on the second request -4: [EMPTY] -... -7: [EMPTY] -... -``` - -Finally, to pass the full hidden states and multimodal outputs to the next stage, we simply concatenate the cached contents with the corresponding new tensors computed from the current forward call. - - -### What About Multimodal Inputs? -It's also useful to consider the case about how Omni prefix caching is handled when we have multimodal inputs that don't cleanly end on block boundaries, as well as how this works with multimodal encoder caching in vLLM. For example: - -``` - [ Im0 Im1 Im2 Im3 ] [ Im4 Im5 foo ] -Block 1: |<--- block tokens ---->| -Block 2: |<------- prefix ------>| |<--- block tokens --->| -``` - -In this case, only `Block 1` will have outputs stored in the prefix tensor cache, because vLLM does not store partial blocks. This may appear to be a problem at first glance, because the multimodal input is fragmented across a new block that wasn't cached. 
- -In reality, this isn't a big problem for correctness, because vLLM also maintains an encoder cache for multimodal inputs. In other words, after the first pass, we'll have the following: - -- The Block 1 hash, which is used for prefix caching -- The hash describing the image data starting at position 0 and with length 6 -- In vLLM's encoder cache, a mapping from the image hash above to the encoder output - - -To understand what happens, say we get the following input as a second request: -``` - [ Im0 Im1 Im2 Im3 ] [ Im4 Im5 bar baz ] -Block 1: |<--- block tokens ---->| -Block 2: |<------- prefix ------>| |<--- block tokens --->| -``` - -First, the scheduler will check for a prefix cache hit, which we will see on `Block 1`. As a result, we will have 4 tokens marked as precomputed, and only see the remaining 4 tokens in the following prefill. - -Because we have multimodal data in a scheduled span that isn't fully precomputed, we still need to call the visual encoder. However, since we have the image hash and encoder cache, we will retrieve the encoder outputs for `Im4` and `Im5` as we create the multimodal embeddings. - -When we pass our multimodal tensors to the language model component in the same stage, we'll then expect the same outputs, because the prefix caching behaviors in vLLM-Omni / vLLM match, so the LLM will use vLLM's KV cache manager's prefix caching to correctly handle the attention information for `Block 1` while calculating the outputs for `Block 2`, giving us the correct results for processing `Block 2` with the context of `Block 1`. - -Finally, we look up the output hidden states/multimodal tensors corresponding to the prefix cache hit `Block 1` and concatenate it with the forward pass result to get the final result, which is expected to be identical to the full hidden states when prefix caching is disabled. 
diff --git a/docs/design/feature/teacache.md b/docs/design/feature/teacache.md index 8577cff1f05..9fa315cee77 100644 --- a/docs/design/feature/teacache.md +++ b/docs/design/feature/teacache.md @@ -326,41 +326,9 @@ for prompt in tqdm(prompts, desc="Collecting data"): # Estimate coefficients coeffs = estimator.estimate(poly_order=4) -print(f"Estimated coefficients: {coeffs}") +print(f"Estimated coefficients: {coeffs.tolist()}") ``` -Note: some models may require the vLLM context and config to be initialized to initialize vLLM modules. To this end, you may need a workaround like the following to be able to run coefficient estimation. -```python -from vllm_omni.diffusion.forward_context import set_forward_context -from vllm_omni.diffusion.distributed.parallel_state import ( - init_distributed_environment, - initialize_model_parallel, -) -from vllm.config import VllmConfig -... - -if __name__ == "__main__": - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "8192" - os.environ["LOCAL_RANK"] = "0" - os.environ["RANK"] = "0" - os.environ["WORLD_SIZE"] = "1" - - vllm_config = VllmConfig() - init_distributed_environment() - initialize_model_parallel() - - # NOTE: you may have to pass an initialized OmniDiffusionConfig as a kwarg - # here to make current sp checks happy; if this is the case, just create one - # .from_kwargs() with the model name to get around this check for now, - # since your estimator subclass should handle the actual model configuration. - # - # This will be cleaned up in the future - with set_forward_context(vllm_config): - -``` - - **Data Statistics Guide:** | Metric | Good Range | Warning Signs | diff --git a/docs/design/feature/vae_parallel.md b/docs/design/feature/vae_parallel.md index e330b41a68f..9009ece72a5 100644 --- a/docs/design/feature/vae_parallel.md +++ b/docs/design/feature/vae_parallel.md @@ -1,15 +1,14 @@ # VAE Patch Parallelism This document describes how to add **VAE Patch Parallelism** support to a diffusion model. 
-We use **Qwen-Image** as the reference implementation for decode parallel, and **Wan2.2** for encode parallel. +We use **Qwen-Image** as the reference implementation. --- ## Table of Contents - [Overview](#overview) -- [Step-by-Step Implementation (Decode)](#step-by-step-implementation-decode) -- [Encode Parallel Implementation](#encode-parallel-implementation) +- [Step-by-Step Implementation](#step-by-step-implementation) - [Testing](#testing) - [Reference Implementations](#reference-implementations) - [Summary](#summary) @@ -20,13 +19,13 @@ We use **Qwen-Image** as the reference implementation for decode parallel, and * ### What is Vae Patch parallel? -**VAE Patch Parallelism** is an acceleration technique for both **encoding** and **decoding**. Instead of processing the entire tensor at once, the tensor is: +**VAE Patch Parallelism** is a decoding acceleration technique. Instead of decoding the entire latent tensor at once, the latent tensor is: + Split into multiple spatial tiles + Distributed across multiple ranks -+ Encoded/Decoded in parallel ++ Decoded in parallel + Merged to reconstruct the final output @@ -36,17 +35,10 @@ This approach: + Reduces peak memory usage per device -+ Accelerates encoding/decoding latency - -### When to Use Encode vs Decode Parallel - -| Operation | Use Case | Example | -|-----------|----------|---------| -| **Decode Parallel** | Text-to-Image, Text-to-Video | Latent → Image/Video | -| **Encode Parallel** | Image-to-Video (I2V) | Image → Latent (for conditioning) | ++ Reduces decoding latency ### Architecture -We introduce **DistributedVaeExecutor** as the core component responsible for distributed VAE encoding/decoding. +We introduce **DistributedVaeExecutor** as the core component responsible for distributed VAE decoding.
The executor is model-agnostic and accepts three function parameters: @@ -92,7 +84,7 @@ Therefore: + Merge must perform blending to avoid seams -## Step-by-Step Implementation (Decode) +## Step-by-Step Implementation ### Step 1: Implement DistributedAutoencoderKLQwenImage `QwenImagePipeline` use `AutoencoderKLQwenImage` for vae, so implement a distributed version: @@ -213,14 +205,14 @@ def tile_merge(self, coord_tensor_map: dict[tuple[int, ...], torch.Tensor], grid We need to override tiled_decode, the main logic is: + check distributed is enabled + select split/exec/merge -+ Invoke self.distributed_executor.execute to decode ++ Invoke self.distributed_decoder.execute to decode ``` def tiled_decode(self, z: torch.Tensor, return_dict: bool = True): if not self.is_distributed_enabled(): return super().tiled_decode(z, return_dict=return_dict) logger.info("Decode run with distributed executor") - result = self.distributed_executor.execute( + result = self.distributed_decoder.execute( z, DistributedOperator(split=self.tile_split, exec=self.tile_exec, merge=self.tile_merge), broadcast_result=True, @@ -251,166 +243,6 @@ class YourModelPipeline(nn.Module): + ).to(self.device) ``` -## Encode Parallel Implementation - -For models that require VAE encoding (e.g., Image-to-Video), you can also parallelize the encode operation. We use **Wan2.2** as the reference implementation. - -### Step 1: Implement encode_tile_split - -Similar to decode, split the input tensor into tiles. 
Key considerations: - -+ **Patchify handling**: If the model uses `patch_size`, scale tile parameters accordingly -+ **Temporal chunking**: Video VAEs may have temporal compression (e.g., 4x) - -```python -def encode_tile_split(self, x: torch.Tensor) -> tuple[list[TileTask], GridSpec]: - _, _, num_frames, height, width = x.shape - encode_spatial_compression_ratio = self.spatial_compression_ratio - - # Scale tile parameters for patchified coordinate system - tile_sample_min_height = self.tile_sample_min_height - tile_sample_min_width = self.tile_sample_min_width - tile_sample_stride_height = self.tile_sample_stride_height - tile_sample_stride_width = self.tile_sample_stride_width - - if self.config.patch_size is not None: - # When input is patchified, scale tile parameters accordingly - encode_spatial_compression_ratio = self.spatial_compression_ratio // self.config.patch_size - tile_sample_min_height = tile_sample_min_height // self.config.patch_size - tile_sample_min_width = tile_sample_min_width // self.config.patch_size - tile_sample_stride_height = tile_sample_stride_height // self.config.patch_size - tile_sample_stride_width = tile_sample_stride_width // self.config.patch_size - - latent_height = height // encode_spatial_compression_ratio - latent_width = width // encode_spatial_compression_ratio - - tile_latent_min_height = tile_sample_min_height // encode_spatial_compression_ratio - tile_latent_min_width = tile_sample_min_width // encode_spatial_compression_ratio - tile_latent_stride_height = tile_sample_stride_height // encode_spatial_compression_ratio - tile_latent_stride_width = tile_sample_stride_width // encode_spatial_compression_ratio - - blend_height = tile_latent_min_height - tile_latent_stride_height - blend_width = tile_latent_min_width - tile_latent_stride_width - - tiletask_list = [] - # Use temporal compression ratio from config instead of hardcoding - temporal_compression = self.config.scale_factor_temporal - - for i in range(0, height, 
tile_sample_stride_height): - for j in range(0, width, tile_sample_stride_width): - time_list = [] - frame_range = 1 + (num_frames - 1) // temporal_compression - for k in range(frame_range): - if k == 0: - tile = x[:, :, :1, i : i + tile_sample_min_height, j : j + tile_sample_min_width] - else: - tile = x[ - :, :, - 1 + temporal_compression * (k - 1) : 1 + temporal_compression * k, - i : i + tile_sample_min_height, - j : j + tile_sample_min_width, - ] - time_list.append(tile) - tiletask_list.append( - TileTask(len(tiletask_list), (i // tile_sample_stride_height, j // tile_sample_stride_width), - time_list, workload=time_list[0].shape[3] * time_list[0].shape[4]) - ) - - grid_spec = GridSpec( - split_dims=(3, 4), - grid_shape=(tiletask_list[-1].grid_coord[0] + 1, tiletask_list[-1].grid_coord[1] + 1), - tile_spec={ - "latent_height": latent_height, "latent_width": latent_width, - "blend_height": blend_height, "blend_width": blend_width, - "tile_latent_stride_height": tile_latent_stride_height, - "tile_latent_stride_width": tile_latent_stride_width, - }, - output_dtype=self.dtype, - ) - return tiletask_list, grid_spec -``` - -### Step 2: Implement encode_tile_exec - -```python -def encode_tile_exec(self, task: TileTask) -> torch.Tensor: - """Encode a single sample tile into latent space.""" - self.clear_cache() - time = [] - for k, tile in enumerate(task.tensor): - self._enc_conv_idx = [0] - encoded = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx) - encoded = self.quant_conv(encoded) - time.append(encoded) - result = torch.cat(time, dim=2) - self.clear_cache() - return result -``` - -### Step 3: Implement encode_tile_merge - -```python -def encode_tile_merge( - self, coord_tensor_map: dict[tuple[int, ...], torch.Tensor], grid_spec: GridSpec -) -> torch.Tensor: - """Merge encoded tiles into a full latent tensor.""" - grid_h, grid_w = grid_spec.grid_shape - result_rows = [] - for i in range(grid_h): - result_row = [] - for j in 
range(grid_w): - tile = coord_tensor_map[(i, j)] - if i > 0: - tile = self.blend_v(coord_tensor_map[(i - 1, j)], tile, grid_spec.tile_spec["blend_height"]) - if j > 0: - tile = self.blend_h(coord_tensor_map[(i, j - 1)], tile, grid_spec.tile_spec["blend_width"]) - result_row.append(tile[:, :, :, - : grid_spec.tile_spec["tile_latent_stride_height"], - : grid_spec.tile_spec["tile_latent_stride_width"]]) - result_rows.append(torch.cat(result_row, dim=-1)) - - enc = torch.cat(result_rows, dim=3)[ - :, :, :, : grid_spec.tile_spec["latent_height"], : grid_spec.tile_spec["latent_width"] - ] - return enc -``` - -### Step 4: Override tiled_encode method - -Override `tiled_encode` instead of `encode`. The parent's `_encode()` handles patchify before calling `tiled_encode()`, so input `x` is already patchified. - -```python -def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: - """ - Encode using distributed VAE executor. - - Note: x is already patchified by parent's _encode() before calling this method. 
- """ - if not self.is_distributed_enabled(): - return super().tiled_encode(x) - - self.clear_cache() - result = self.distributed_executor.execute( - x, - DistributedOperator( - split=self.encode_tile_split, - exec=self.encode_tile_exec, - merge=self.encode_tile_merge, - ), - broadcast_result=True, # Latents needed by all ranks for diffusion - ) - self.clear_cache() - return result -``` - -**Key differences from decode parallel:** - -| Aspect | Decode Parallel | Encode Parallel | -|--------|-----------------|-----------------| -| `broadcast_result` | Often `False` (only rank 0 needs output) | `True` (all ranks need latents for diffusion) | -| Patchify | Applied in merge (unpatchify) | Handled by parent `_encode()` before `tiled_encode()` | -| Temporal chunking | Frame-by-frame | Chunk-based (e.g., 1 + 4n frames) | - ## Testing Verify numerical consistency between: + vae_patch_parallel_size = 1 @@ -440,20 +272,18 @@ When vae_patch_parallel_size is larger than the DiT world size, it will automati Complete examples in the codebase: -| Model | Path | Decode Parallel | Encode Parallel | -|-------|------|-----------------|-----------------| -| **Z-Image** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py` | ✅ | ❌ | -| **Wan2.2** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py` | ✅ | ✅ | -| **Qwen-Image** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_qwenimage.py` | ✅ | ❌ | +| Model | Path | Notes | +|-------|------|-------| +| **Z-Image** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py` | Distributed AutoencoderKL | +| **Wan2.2** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py` | Distributed AutoencoderKLWan | +| **Qwen-Image** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_qwenimage.py` | Distributed AutoencoderKLQwenImage | --- ## Summary -Adding VAE Patch Parallel support to diffusion model: +Adding Vae Patch Parallel support to diffusion model: -1. 
**Implement Distributed VAE** - Inherit from base VAE class and `DistributedVaeMixin` -2. **Decode Parallel** - Refactor `tiled_decode` into `tile_split`/`tile_exec`/`tile_merge` -3. **Encode Parallel** (optional) - Implement `encode_tile_split`/`encode_tile_exec`/`encode_tile_merge` for I2V models -4. **Change VAE model in pipeline** - Use the distributed version -5. **Test** - Verify numerical consistency with `vae_patch_parallel_size=1` vs `N` +1. **Implement Distributed Vae** - mainly copy `tiled_decode` from `diffusers` and refactor it into split/exec/merge +2. **Change vae model in pipeline to Distributed Vae** +3. **Test** - Verify output quality with `vae_patch_parallel_size=N` against `vae_patch_parallel_size=1` diff --git a/docs/design/figures/omni/E2EL_s_vllm_omni_vs_transformers.png b/docs/design/figures/omni/E2EL_s_vllm_omni_vs_transformers.png deleted file mode 100644 index 15112d5862a..00000000000 Binary files a/docs/design/figures/omni/E2EL_s_vllm_omni_vs_transformers.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_AUDIO_RTF_Baseline_vs_Batch.png b/docs/design/figures/omni/Mean_AUDIO_RTF_Baseline_vs_Batch.png deleted file mode 100644 index 2f0615f77bb..00000000000 Binary files a/docs/design/figures/omni/Mean_AUDIO_RTF_Baseline_vs_Batch.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_CUDA_Graph_vs_Async_Chunk.png b/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_CUDA_Graph_vs_Async_Chunk.png deleted file mode 100644 index 62d8bc79b6b..00000000000 Binary files a/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_CUDA_Graph_vs_Async_Chunk.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_vs_Batch_CUDA_Graph.png b/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_vs_Batch_CUDA_Graph.png deleted file mode 100644 index 5838b45319e..00000000000 Binary files a/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_vs_Batch_CUDA_Graph.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Baseline_vs_Batch.png
b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Baseline_vs_Batch.png deleted file mode 100644 index 24be814b7e9..00000000000 Binary files a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Baseline_vs_Batch.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_CUDA_Graph_vs_Async_Chunk.png b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_CUDA_Graph_vs_Async_Chunk.png deleted file mode 100644 index c8df58ebcdf..00000000000 Binary files a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_CUDA_Graph_vs_Async_Chunk.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_vs_Batch_CUDA_Graph.png b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_vs_Batch_CUDA_Graph.png deleted file mode 100644 index 2d1a04e9c2c..00000000000 Binary files a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_vs_Batch_CUDA_Graph.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_E2EL_ms_Baseline_vs_Batch.png b/docs/design/figures/omni/Mean_E2EL_ms_Baseline_vs_Batch.png deleted file mode 100644 index e598b543431..00000000000 Binary files a/docs/design/figures/omni/Mean_E2EL_ms_Baseline_vs_Batch.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_E2EL_ms_Batch_CUDA_Graph_vs_Async_Chunk.png b/docs/design/figures/omni/Mean_E2EL_ms_Batch_CUDA_Graph_vs_Async_Chunk.png deleted file mode 100644 index 54452013eb4..00000000000 Binary files a/docs/design/figures/omni/Mean_E2EL_ms_Batch_CUDA_Graph_vs_Async_Chunk.png and /dev/null differ diff --git a/docs/design/figures/omni/Mean_E2EL_ms_Batch_vs_Batch_CUDA_Graph.png b/docs/design/figures/omni/Mean_E2EL_ms_Batch_vs_Batch_CUDA_Graph.png deleted file mode 100644 index 04c5ad7396a..00000000000 Binary files a/docs/design/figures/omni/Mean_E2EL_ms_Batch_vs_Batch_CUDA_Graph.png and /dev/null differ diff --git a/docs/design/figures/omni/RTF_vllm_omni_vs_transformers.png b/docs/design/figures/omni/RTF_vllm_omni_vs_transformers.png deleted file mode 100644 index 
d93ba0b2af5..00000000000 Binary files a/docs/design/figures/omni/RTF_vllm_omni_vs_transformers.png and /dev/null differ diff --git a/docs/design/figures/omni/Summary_E2EL_ms_vs_features.png b/docs/design/figures/omni/Summary_E2EL_ms_vs_features.png deleted file mode 100644 index 04087b5910f..00000000000 Binary files a/docs/design/figures/omni/Summary_E2EL_ms_vs_features.png and /dev/null differ diff --git a/docs/design/figures/omni/Summary_RTF_vs_features.png b/docs/design/figures/omni/Summary_RTF_vs_features.png deleted file mode 100644 index c2c8ad40834..00000000000 Binary files a/docs/design/figures/omni/Summary_RTF_vs_features.png and /dev/null differ diff --git a/docs/design/figures/omni/Summary_TTFP_ms_vs_features.png b/docs/design/figures/omni/Summary_TTFP_ms_vs_features.png deleted file mode 100644 index 3dcc1c55379..00000000000 Binary files a/docs/design/figures/omni/Summary_TTFP_ms_vs_features.png and /dev/null differ diff --git a/docs/design/figures/omni/TTFP_s_vllm_omni_vs_transformers.png b/docs/design/figures/omni/TTFP_s_vllm_omni_vs_transformers.png deleted file mode 100644 index 9a5b6c9bdaf..00000000000 Binary files a/docs/design/figures/omni/TTFP_s_vllm_omni_vs_transformers.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_AUDIO_RTF_vllm_omni_vs_transformers.png b/docs/design/figures/tts/Mean_AUDIO_RTF_vllm_omni_vs_transformers.png deleted file mode 100644 index 68f0ef17e88..00000000000 Binary files a/docs/design/figures/tts/Mean_AUDIO_RTF_vllm_omni_vs_transformers.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_AUDIO_TTFP_(ms)_vllm_omni_vs_transformers.png b/docs/design/figures/tts/Mean_AUDIO_TTFP_(ms)_vllm_omni_vs_transformers.png deleted file mode 100644 index 44be96e96da..00000000000 Binary files a/docs/design/figures/tts/Mean_AUDIO_TTFP_(ms)_vllm_omni_vs_transformers.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_E2EL_(ms)_vllm_omni_vs_transformers.png 
b/docs/design/figures/tts/Mean_E2EL_(ms)_vllm_omni_vs_transformers.png deleted file mode 100644 index 2e5d1482bd7..00000000000 Binary files a/docs/design/figures/tts/Mean_E2EL_(ms)_vllm_omni_vs_transformers.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_e2e_ms_baseline_vs_batch.png b/docs/design/figures/tts/Mean_mean_e2e_ms_baseline_vs_batch.png deleted file mode 100644 index 04d8f0bac53..00000000000 Binary files a/docs/design/figures/tts/Mean_mean_e2e_ms_baseline_vs_batch.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_e2e_ms_batch_vs_cuda_graph.png b/docs/design/figures/tts/Mean_mean_e2e_ms_batch_vs_cuda_graph.png deleted file mode 100644 index eb85ec0dd4f..00000000000 Binary files a/docs/design/figures/tts/Mean_mean_e2e_ms_batch_vs_cuda_graph.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_e2e_ms_cuda_graph_vs_async_chunk.png b/docs/design/figures/tts/Mean_mean_e2e_ms_cuda_graph_vs_async_chunk.png deleted file mode 100644 index 6f0e0e2529d..00000000000 Binary files a/docs/design/figures/tts/Mean_mean_e2e_ms_cuda_graph_vs_async_chunk.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_rtf_baseline_vs_batch.png b/docs/design/figures/tts/Mean_mean_rtf_baseline_vs_batch.png deleted file mode 100644 index 89ea30a8643..00000000000 Binary files a/docs/design/figures/tts/Mean_mean_rtf_baseline_vs_batch.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_rtf_batch_vs_cuda_graph.png b/docs/design/figures/tts/Mean_mean_rtf_batch_vs_cuda_graph.png deleted file mode 100644 index 2b207b88987..00000000000 Binary files a/docs/design/figures/tts/Mean_mean_rtf_batch_vs_cuda_graph.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_rtf_cuda_graph_vs_async_chunk.png b/docs/design/figures/tts/Mean_mean_rtf_cuda_graph_vs_async_chunk.png deleted file mode 100644 index f5f7ad72c8f..00000000000 Binary files 
a/docs/design/figures/tts/Mean_mean_rtf_cuda_graph_vs_async_chunk.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_ttfp_ms_baseline_vs_batch.png b/docs/design/figures/tts/Mean_mean_ttfp_ms_baseline_vs_batch.png deleted file mode 100644 index 6f8c1da4a5b..00000000000 Binary files a/docs/design/figures/tts/Mean_mean_ttfp_ms_baseline_vs_batch.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_ttfp_ms_batch_vs_cuda_graph.png b/docs/design/figures/tts/Mean_mean_ttfp_ms_batch_vs_cuda_graph.png deleted file mode 100644 index b0fe1d02a9d..00000000000 Binary files a/docs/design/figures/tts/Mean_mean_ttfp_ms_batch_vs_cuda_graph.png and /dev/null differ diff --git a/docs/design/figures/tts/Mean_mean_ttfp_ms_cuda_graph_vs_async_chunk.png b/docs/design/figures/tts/Mean_mean_ttfp_ms_cuda_graph_vs_async_chunk.png deleted file mode 100644 index 008ba9bf78f..00000000000 Binary files a/docs/design/figures/tts/Mean_mean_ttfp_ms_cuda_graph_vs_async_chunk.png and /dev/null differ diff --git a/docs/design/figures/tts/Summary_mean_e2e_ms_vs_features.png b/docs/design/figures/tts/Summary_mean_e2e_ms_vs_features.png deleted file mode 100644 index 7c65aa11770..00000000000 Binary files a/docs/design/figures/tts/Summary_mean_e2e_ms_vs_features.png and /dev/null differ diff --git a/docs/design/figures/tts/Summary_mean_rtf_vs_features.png b/docs/design/figures/tts/Summary_mean_rtf_vs_features.png deleted file mode 100644 index 71bb2c54680..00000000000 Binary files a/docs/design/figures/tts/Summary_mean_rtf_vs_features.png and /dev/null differ diff --git a/docs/design/figures/tts/Summary_mean_ttfp_ms_vs_features.png b/docs/design/figures/tts/Summary_mean_ttfp_ms_vs_features.png deleted file mode 100644 index cef2546d6fe..00000000000 Binary files a/docs/design/figures/tts/Summary_mean_ttfp_ms_vs_features.png and /dev/null differ diff --git a/docs/design/qwen3_omni_tts_performance_optimization.md b/docs/design/qwen3_omni_tts_performance_optimization.md 
deleted file mode 100644 index 2f18a1b1bc0..00000000000 --- a/docs/design/qwen3_omni_tts_performance_optimization.md +++ /dev/null @@ -1,539 +0,0 @@ -# Speech Generation on vLLM-Omni: Performance Optimizations for Qwen3-Omni and Qwen3-TTS - -## Summary - -vLLM-Omni supports end-to-end serving for speech-generating models, including both **Qwen3-Omni** (multimodal understanding + speech) and **Qwen3-TTS** (text-to-speech). Despite their different architectures, both models share the same multi-stage pipeline design and benefit from the same set of stacked optimizations: - -1. **Batching** improves GPU utilization stage by stage and increases overall throughput. -2. **CUDA Graph** reduces CPU launch overhead and decode-time jitter on stable shapes. -3. **Async Chunk and Streaming Output** overlap compute and communication across stages and emit audio incrementally, improving both TTFP and E2E. - -### Model architectures - -**Qwen3-Omni** is a native multimodal model that understands text, audio, image, and video inputs, and generates both text and speech outputs. Its pipeline has three stages: - -- **Thinker**: multimodal understanding and text generation -- **Talker (+ Talker-MTP / code predictor path)**: converts semantic/text representations into codec tokens -- **Code2Wav**: decodes codec tokens into waveform audio - -**Qwen3-TTS** is a lightweight, high-quality text-to-speech model. Its pipeline has two stages: - -- **Talker (AR decoder)**: auto-regressively generates codec tokens from text input -- **Code2Wav (vocoder)**: decodes codec tokens into waveform audio - -The optimizations described in this post apply to both models. We present results for each side by side. - -### vLLM-Omni vs HF Transformers - -Compared with **HF Transformers** (offline, single request), vLLM-Omni with the full optimization stack delivers dramatically lower latency and higher efficiency for both models. - -**Qwen3-Omni** (A100): - - - - - -
Qwen3-Omni E2EL: vLLM vs HFQwen3-Omni TTFP: vLLM vs HFQwen3-Omni RTF: vLLM vs HF
- -| Metric | vLLM-Omni | HF Transformers | Improvement | -| --- | --- | --- | --- | -| E2E latency (s) | 23.78 | 336.10 | ~93% reduction | -| TTFP (s) | 0.934 | 336.10 | ~99.7% reduction | -| RTF | 0.32 | 3.776 | ~91% reduction (~12× faster) | - -- **E2E latency**: 23.78 s vs 336.10 s - **~93%** reduction -- **TTFP**: 0.934 s vs 336.10 s - **~99.7%** reduction -- **RTF**: 0.32 vs 3.776 - **~91%** reduction (~12x faster) - -**Qwen3-TTS** (H200, concurrency 1): - - - - - -
Qwen3-TTS E2EL: vLLM vs HFQwen3-TTS TTFP: vLLM vs HFQwen3-TTS RTF: vLLM vs HF
- -| Metric | vLLM-Omni | HF Transformers | Improvement | -| --- | --- | --- | --- | -| E2E latency (ms) | 941 | 15,513 | ~94% reduction | -| TTFP (ms) | 64 | 15,513 | ~99.6% reduction (242× faster) | -| RTF | 0.16 | 2.64 | ~94% reduction (~16.5× faster) | - -- **E2E latency**: 941 ms vs 15,513 ms - **~94%** reduction -- **TTFP**: 64 ms vs 15,513 ms - **~99.6%** reduction (242x faster) -- **RTF**: 0.16 vs 2.64 - **~94%** reduction (~16.5x faster) - -### Stacked optimization summary - -Each optimization stacks on the previous one. The summary plots below show the cumulative effect at each step, with one line per concurrency level (1, 4, 10). - -**Qwen3-Omni** (A100): - - - - - -
Qwen3-Omni E2EL: stacked optimizationQwen3-Omni TTFP: stacked optimizationQwen3-Omni RTF: stacked optimization
- -- **E2EL reduction**: ~74% at concurrency 10 (410,054 ms -> 104,901 ms); ~90% at concurrency 1 (426,529 ms -> 41,216 ms) -- **TTFP reduction**: ~96% at concurrency 10 (409,705 ms -> 16,482 ms); ~99.7% at concurrency 1 (426,078 ms -> 1,164 ms) -- **RTF reduction**: ~74% at concurrency 10 (2.83 -> 0.74); ~90% at concurrency 1 (2.08 -> 0.21) - -**Qwen3-TTS** (H200): - - - - - -
Qwen3-TTS E2EL: stacked optimizationQwen3-TTS TTFP: stacked optimizationQwen3-TTS RTF: stacked optimization
- -- **E2EL reduction**: ~85% at concurrency 10 (12,141 ms -> 1,767 ms); ~29% at concurrency 1 (1,323 ms -> 941 ms) -- **TTFP reduction**: ~96.5% at concurrency 10 (12,141 ms -> 425 ms); ~95% at concurrency 1 (1,323 ms -> 64 ms) -- **RTF reduction**: ~86% at concurrency 10 (2.19 -> 0.31); ~30% at concurrency 1 (0.23 -> 0.16) - -**Benchmark environment:** - -| | Qwen3-Omni | Qwen3-TTS | -| --- |-----------------------------| --- | -| **GPU** | A100 | H200 | -| **Model** | Qwen3-Omni-30B-A3B-Instruct | Qwen3-TTS-12Hz-1.7B-CustomVoice | -| **vLLM** | v0.17.0 | v0.18.0 | -| **vllm-omni** | commit 199f7832 | v0.18.0rc2 | -| **CUDA** | 12.9 | 12.8 | - -This post walks through each optimization in the same order they are typically enabled in practice, then ends with deployment playbooks for both models. - ---- - -## Pipeline Batching - -### How stage-wise batching works - -For both Qwen3-Omni and Qwen3-TTS, batching is a pipeline-level optimization: - -- Requests are grouped per stage using `runtime.max_batch_size` -- Each stage executes batch inference with its own scheduler/worker -- Stage outputs are routed to downstream stages with per-request mapping preserved - -**Batching strategy by stage:** The understanding and decode stages (Thinker for Omni, Talker for both) use **continuous batching**: requests can join and leave the batch over time. Code2Wav uses **static batching**: once a batch is formed, the stage runs the whole batch before starting the next. This matches the decode pattern of Code2Wav and keeps implementation simple while still improving throughput. - -### Batching results (Baseline vs. Batch) - -Batching alone greatly reduces E2EL and RTF across all concurrencies. The biggest gains appear at high concurrency where requests share GPU resources. - -**Qwen3-Omni** (A100): - - - - - -
Qwen3-Omni E2EL: Baseline vs BatchQwen3-Omni TTFP: Baseline vs BatchQwen3-Omni RTF: Baseline vs Batch
- -| Metric | Concurrency | Baseline | + Batch | Improvement | -| --- | --- | --- | --- | --- | -| E2EL (ms) | 1 | 426,529 | 307,719 | 1.4× | -| E2EL (ms) | 4 | 407,213 | 376,934 | 1.1× | -| E2EL (ms) | 10 | 410,054 | 234,844 | 1.7× | -| TTFP (ms) | 1 | 426,078 | 307,262 | 1.4× | -| TTFP (ms) | 4 | 406,843 | 376,466 | 1.1× | -| TTFP (ms) | 10 | 409,705 | 234,557 | 1.7× | -| RTF | 1 | 2.08 | 1.51 | 1.4× | -| RTF | 4 | 2.55 | 1.83 | 1.4× | -| RTF | 10 | 2.83 | 2.28 | 1.2× | - -At concurrency 10, E2EL drops from ~410 s to ~235 s; at concurrency 1, from ~427 s to ~308 s. - -**Qwen3-TTS** (H200): - - - - - -
Qwen3-TTS E2EL: Baseline vs BatchQwen3-TTS TTFP: Baseline vs BatchQwen3-TTS RTF: Baseline vs Batch
- -| Metric | Concurrency | Baseline | + Batch | Improvement | -| --- | --- | --- | --- | --- | -| E2EL (ms) | 1 | 1,323 | 1,339 | 1.0× | -| E2EL (ms) | 4 | 5,171 | 1,471 | 3.5× | -| E2EL (ms) | 10 | 12,141 | 1,705 | 7.1× | -| RTF | 1 | 0.230 | 0.234 | 1.0× | -| RTF | 4 | 0.908 | 0.255 | 3.6× | -| RTF | 10 | 2.186 | 0.292 | 7.5× | -| Throughput (audio-s/wall-s) | 10 | 3.99 | 33.53 | 8.4× | - -At concurrency 10, batching alone brings Qwen3-TTS RTF from 2.19 (slower than realtime) down to 0.29 (faster than realtime), and throughput from 4.0 to 33.5 audio-sec/wall-sec. - ---- - -## CUDA Graph on the Critical Decode Path - -### Why CUDA Graph helps here - -In decode-heavy serving, repeatedly launching many small kernels from CPU can become a visible overhead. CUDA Graph reduces this overhead by capturing and replaying stable execution graphs. - -In stage configs, this is represented by `enforce_eager: false` for stages where graph capture is desired (Thinker/Talker), while Code2Wav keeps eager mode depending on stage behavior. - -### CUDA Graph results on top of batching - -**Qwen3-Omni** (A100): - - - - - -
Qwen3-Omni E2EL: Batch vs CUDA GraphQwen3-Omni TTFP: Batch vs CUDA GraphQwen3-Omni RTF: Batch vs CUDA Graph
- -| Metric | Concurrency | Batch | + CUDA Graph | Improvement | -| --- | --- | --- | --- | --- | -| E2EL (ms) | 1 | 307,719 | 61,613 | 5.0× | -| E2EL (ms) | 4 | 376,934 | 79,019 | 4.8× | -| E2EL (ms) | 10 | 234,844 | 126,867 | 1.9× | -| TTFP (ms) | 1 | 307,262 | 61,257 | 5.0× | -| TTFP (ms) | 4 | 376,466 | 78,634 | 4.8× | -| TTFP (ms) | 10 | 234,557 | 126,534 | 1.9× | -| RTF | 1 | 1.51 | 0.32 | 4.7× | -| RTF | 4 | 1.83 | 0.43 | 4.3× | -| RTF | 10 | 2.28 | 0.90 | 2.5× | - -For the larger Qwen3-Omni model (30B-A3B), CUDA Graph provides a significant improvement. At concurrency 1, E2EL drops from ~308 s to ~62 s; at concurrency 10, from ~235 s to ~127 s. - -**Qwen3-TTS** (H200): - - - - - -
TTS E2EL: Batch vs +CGTTS TTFP: Batch vs +CGTTS RTF: Batch vs +CG
- -| Metric | Concurrency | Batch | + CUDA Graph | Improvement | -| --- | --- | --- | --- | --- | -| E2EL (ms) | 1 | 1,339 | 733 | 1.8× | -| E2EL (ms) | 4 | 1,471 | 987 | 1.5× | -| E2EL (ms) | 10 | 1,705 | 1,197 | 1.4× | -| RTF | 1 | 0.234 | 0.124 | 1.9× | -| RTF | 10 | 0.292 | 0.203 | 1.4× | -| Throughput (audio-s/wall-s) | 10 | 33.53 | 47.15 | 1.4× | - -At concurrency 1, CUDA Graph reduces E2EL from 1,339 ms to 733 ms and RTF from 0.234 to 0.124 - nearly a 2x improvement. The benefit is consistent across all concurrency levels. - ---- - -## Async Chunk and Streaming Output: Earlier Audio and Cross-Stage Overlap - -### Why this step matters for first-packet latency - -Two mechanisms work together to improve user-visible latency: - -- **Streaming output**: audio streaming emits audio chunks as soon as they are decoded (lower **TTFP**). Without streaming, the client waits for larger buffers or end-of-sequence. -- **Async chunk** is the main enabler for *earlier* audio: instead of handing off whole-request results between stages, each stage forwards **chunks** so the next stage can start as soon as the first chunk is ready. For Omni: Thinker -> Talker forwards hidden-state chunks; for both: Talker -> Code2Wav forwards codec chunks; Code2Wav decodes and emits packets incrementally. This **overlaps compute and communication** across stages and directly reduces time-to-first-audio-packet (TTFP) and end-to-end latency (E2EL). - -So in practice: streaming output defines *how* bytes are sent to the client; async chunk defines *when* the pipeline can produce the first bytes. - -**Dependency between the two:** Async chunk and audio streaming output are mutually dependent. Without async chunk, **audio streaming output cannot truly take effect**. Without audio streaming output, async chunk's **TTFP advantage is not fully realized**: the client would still wait for larger buffers or end-of-sequence instead of hearing the first packet as soon as it is ready. 
We therefore recommend enabling **both** on top of batching + CUDA Graph; the benchmarks in this post use both. - -### Results: Batch + CUDA Graph vs. Batch + CUDA Graph + Async Chunk + Streaming Output - -**Qwen3-Omni** (A100): - - - - - -
Qwen3-Omni E2EL: CG vs Async ChunkQwen3-Omni TTFP: CG vs Async ChunkQwen3-Omni RTF: CG vs Async Chunk
- -| Metric | Concurrency | Batch + CG | + Async Chunk | Improvement | -| --- | --- | --- | --- | --- | -| E2EL (ms) | 1 | 61,613 | 41,216 | 1.5× | -| E2EL (ms) | 4 | 79,019 | 67,584 | 1.2× | -| E2EL (ms) | 10 | 126,867 | 104,901 | 1.2× | -| TTFP (ms) | 1 | 61,257 | 1,164 | 53× | -| TTFP (ms) | 4 | 78,634 | 3,152 | 24.9× | -| TTFP (ms) | 10 | 126,534 | 16,482 | 7.7× | -| RTF | 1 | 0.32 | 0.21 | 1.5× | -| RTF | 4 | 0.43 | 0.34 | 1.3× | -| RTF | 10 | 0.90 | 0.74 | 1.2× | - -Enabling both brings TTFP down sharply (concurrency 1: 61,257 ms -> 1,164 ms, **~98% reduction**; concurrency 4: 78,634 ms -> 3,152 ms, **~96% reduction**). E2EL and RTF also improve at every concurrency. - -**Qwen3-TTS** (H200): - - - - - -
Qwen3-TTS E2EL: CG vs Async ChunkQwen3-TTS TTFP: CG vs Async ChunkQwen3-TTS RTF: CG vs Async Chunk
- -| Metric | Concurrency | Batch + CG | + Async Chunk | Improvement | -| --- | --- | --- | --- | --- | -| TTFP (ms) | 1 | 733 | **64** | **11.5×** | -| TTFP (ms) | 4 | 987 | **119** | **8.3×** | -| TTFP (ms) | 10 | 1,197 | **425** | **2.8×** | -| E2EL (ms) | 1 | 733 | 941 | 0.8× | -| E2EL (ms) | 10 | 1,197 | 1,767 | 0.7× | -| RTF | 1 | 0.124 | 0.160 | 0.8× | -| RTF | 10 | 0.203 | 0.314 | 0.6× | - -The TTFP improvement is the headline result for both models. For Qwen3-TTS at concurrency 1, users hear the first audio in **64 ms** instead of 733 ms - an **11.5x reduction**. For Qwen3-Omni at concurrency 1, TTFP drops from 61 s to 1.2 s - a **53x reduction**. - -### Why E2EL and RTF are higher with async chunk (TTS) - -The table above shows that enabling async chunk + streaming *increases* E2EL and RTF for TTS compared to CUDA Graph alone. This is expected - the two configurations optimize for fundamentally different metrics: - -- **CUDA Graph (no async chunk)** generates the entire audio end-to-end before returning. No chunking overhead, so total compute is minimized. -- **Async Chunk + Streaming** splits the pipeline into incremental chunks, adding overhead from chunked transport, context overlap in Code2Wav (`codec_left_context_frames=25`), and smaller effective batch sizes per chunk. - -**The tradeoff is intentional.** Async chunk trades ~30% higher total compute for **11x faster time-to-first-audio**. For interactive applications (voice assistants, chatbots), TTFP determines perceived responsiveness. For offline batch processing, CUDA Graph without async chunk is the better choice. - ---- - -## TTS-Specific: Code Predictor Re-prefill + `torch.compile` - -Qwen3-TTS has a **code predictor** - a small 5-layer transformer that generates residual codebook tokens (groups 1 through Q-1) autoregressively. Each AR step operates on very short sequences (2 to ~16 tokens). - -The naive approach uses a KV cache for this small transformer, similar to the main Talker. 
But the KV cache machinery (block tables, slot mappings, paged attention) introduces significant overhead relative to the tiny model. Two optimizations replace that: - -### Re-prefill (stateless forward, no KV cache) - -Instead of maintaining a KV cache across steps, the code predictor **re-feeds the full growing sequence** at each AR step using `F.scaled_dot_product_attention`. With sequences of at most ~16 tokens through 5 layers, the O(T^2) attention cost is negligible - and removing the KV cache machinery (block table management, `set_forward_context`, slot mapping) saves far more time than it costs. - -### `torch.compile` on the code predictor forward - -The 5-layer transformer forward pass launches ~60 small CUDA kernels per step. `torch.compile(mode="default", dynamic=True)` fuses these into fewer kernels via Inductor: - -```python -self._compiled_model_fwd = torch.compile( - self.model.forward, - mode="default", # no Inductor CUDA graphs, avoids conflict with vLLM's CUDAGraphWrapper - dynamic=True, # sequence length grows each step (2, 3, ..., num_groups+1) -) -``` - -`mode="default"` is used instead of `mode="reduce-overhead"` to avoid conflicts with vLLM's own CUDA graph capture on the main Talker model. `dynamic=True` handles the growing sequence length without recompilation. - -These optimizations are always-on in the current codebase - all Qwen3-TTS benchmark results in this post include them. - ---- - -## TTS-Specific: Dynamic Initial Chunk for Faster First Audio - -In the async chunk pipeline, the standard `codec_chunk_frames` is 25 (each chunk = ~2 seconds of audio at 12 Hz). Waiting for 25 frames before forwarding the first chunk to Code2Wav adds unnecessary TTFP. The **initial codec chunk** optimization sends a smaller first chunk so Code2Wav can start decoding earlier. - -**Dynamic initial chunk sizing (default behavior):** - -Rather than using a fixed initial chunk size, vLLM-Omni dynamically selects it based on current server load. 
The initial chunk size is chosen from power-of-2 steps [2, 4, 8, 16] based on load factor (`active_requests / max_batch_size`): - -| Server load | Initial chunk frames | Rationale | -| --- | --- | --- | -| Low (e.g. 1/10 active) | **2** (~167 ms of audio) | Minimize TTFP when there's headroom | -| Medium (e.g. 5/10 active) | **4-8** | Balance TTFP vs decode efficiency | -| High (e.g. 10/10 active) | **16** | Larger first chunk to amortize decode cost | - -After the initial chunk, all subsequent chunks use the standard `codec_chunk_frames` (25) size. - -**How it works in the pipeline:** - -1. Talker generates codec tokens auto-regressively -2. The stage input processor checks current load and picks an initial chunk size (e.g. **2 frames** at low load) -3. After that many frames, the first chunk is forwarded to Code2Wav -4. Code2Wav decodes this small chunk and emits the first audio packet -5. Subsequent chunks use the standard 25-frame size for efficient batch decoding - -**Per-request override:** Clients can also set a fixed initial chunk size via the API: - -```json -{"initial_codec_chunk_frames": 2} -``` - -This overrides the dynamic calculation for that request. - -**Config (server-side):** - -```yaml -runtime: - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - codec_streaming: true - codec_chunk_frames: 25 # standard chunk size (~2s of audio) - codec_left_context_frames: 25 - # initial chunk is computed dynamically by default - # set initial_codec_chunk_frames: 2 to force a fixed value -``` - -The 64 ms TTFP result reported above for Qwen3-TTS at concurrency 1 uses the dynamic initial chunk, which picks `initial_codec_chunk_frames=2` at low load. At higher concurrency the dynamic sizing increases the initial chunk to maintain decode efficiency. 
- ---- - -## Live Demo: Streaming TTS over WebSocket - -vLLM-Omni supports real-time streaming audio output for Qwen3-TTS over WebSocket ([PR #1719](https://github.com/vllm-project/vllm-omni/pull/1719)). With `stream_audio: true`, the server sends chunked PCM audio frames as they are generated, so clients can start playback before full sentence synthesis completes. - -The WebSocket protocol uses `audio.start` / binary PCM chunks / `audio.done` framing per sentence: - -```json -// Client sends: -{"type":"session.config","voice":"Vivian","response_format":"pcm","stream_audio":true} -{"type":"input.text","text":"Hello world. This is a streaming demo."} -{"type":"input.done"} - -// Server streams back per sentence: -{"type":"audio.start","sentence_index":0,"sentence_text":"Hello world.","format":"pcm","sample_rate":24000} - - -... -{"type":"audio.done","sentence_index":0,"total_bytes":96000,"error":false} -{"type":"audio.start","sentence_index":1,"sentence_text":"This is a streaming demo.","format":"pcm","sample_rate":24000} - -... -{"type":"audio.done","sentence_index":1,"total_bytes":72000,"error":false} -{"type":"session.done","total_sentences":2} -``` - - - ---- - -## Deployment Playbook - -### Qwen3-Omni - -#### 1) Serve with the default 3-stage config - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct \ - --omni \ - --port 8091 -``` - -Notes: - -- `runtime.max_batch_size` controls stage-level batching. -- Thinker/Talker commonly use `enforce_eager: false` for CUDA Graph paths. -- Code2Wav often remains eager (`enforce_eager: true`) depending on runtime behavior. 
- -#### 2) Enable async chunk - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct \ - --omni \ - --port 8091 \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml -``` - -#### 3) Key config knobs - -```yaml -async_chunk: true -stage_args: - - stage_id: 0 # thinker - runtime: - max_batch_size: 64 - engine_args: - enforce_eager: false - max_num_batched_tokens: 32768 - custom_process_next_stage_input_func: >- - vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk - - - stage_id: 1 # talker - runtime: - max_batch_size: 64 - engine_args: - enforce_eager: false - max_num_batched_tokens: 32768 - custom_process_next_stage_input_func: >- - vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk - - - stage_id: 2 # code2wav - runtime: - max_batch_size: 64 - engine_args: - enforce_eager: true - max_num_batched_tokens: 51200 -``` - -#### Reproduce Qwen3-Omni benchmarks - -```bash -vllm bench serve \ - --dataset-name random \ - --port ${PORT} \ - --model ${MODEL_PATH} \ - --endpoint /v1/chat/completions \ - --backend openai-chat-omni \ - --max-concurrency ${MAX_CONCURRENCY} \ - --num-prompts ${NUM_PROMPTS} \ - --random-input-len 2500 \ - --ignore-eos \ - --percentile-metrics ttft,tpot,itl,e2el,audio_ttfp,audio_rtf \ - --random-output-len 900 \ - --extra_body '{"modalities": ["text","audio"]}' -``` - -### Qwen3-TTS - -#### 1) Serve with async chunk (recommended) - -```bash -vllm-omni serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --omni \ - --port 8000 -``` - -The default config (`qwen3_tts.yaml`) enables the full optimization stack: - -- Batching with `max_batch_size: 10` on the Talker stage -- CUDA Graph on the Talker (`enforce_eager: false`) -- Async chunk with streaming transport - -#### 2) Serve without async chunk (for comparison) - -```bash -vllm-omni serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --omni \ - --port 8000 \ - --stage-configs-path 
vllm_omni/model_executor/stage_configs/qwen3_tts_no_async_chunk.yaml -``` - -#### 3) Key config knobs - -```yaml -async_chunk: true -stage_args: - - stage_id: 0 # Talker (AR decoder) - runtime: - max_batch_size: 10 - engine_args: - enforce_eager: false - max_num_batched_tokens: 512 - custom_process_next_stage_input_func: >- - vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - - - stage_id: 1 # Code2Wav (vocoder) - runtime: - max_batch_size: 1 - engine_args: - enforce_eager: true - max_num_batched_tokens: 8192 - -runtime: - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - codec_streaming: true - codec_chunk_frames: 25 - codec_left_context_frames: 25 -``` - -#### Reproduce Qwen3-TTS benchmarks - -```bash -GPU_DEVICE=0 \ -MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ -NUM_PROMPTS=50 \ -CONCURRENCY="1 4 10" \ -bash benchmarks/qwen3-tts/vllm_omni/run_stacked_benchmark.sh -``` - -This cycles through four configs (Baseline -> + Batch -> + CUDA Graph -> + Async Chunk + Streaming), benchmarks each at the specified concurrency levels, and generates all comparison figures automatically. diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index b4eec162d31..41aa48c1735 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -1,243 +1,39 @@ -# Sleep Mode & ACK Protocol +# Sleep Mode -vLLM-Omni’s **Sleep Mode** allows you to temporarily release most GPU memory used by a model—such as model weights and key-value (KV) caches—**without stopping the server or unloading the Docker container**. +vLLM-Omni’s **Sleep Mode** allows you to temporarily release most GPU memory used by a model—such as model weights and key-value (KV) caches (for autoregressive models)—**without stopping the server or unloading the Docker container**. 
-This feature is inherited from [vLLM’s Sleep Mode](https://blog.vllm.ai/2025/10/26/sleep-mode.html) and extended with the **Omni ACK Protocol** to support multi-stage pipelines and heterogeneous hardware backends (NVIDIA, AMD, Intel, Huawei). It is especially useful in **RLHF**, **dynamic model switching**, or **cost-saving scenarios**. +This feature is inherited from [vLLM’s Sleep Mode](https://blog.vllm.ai/2025/10/26/sleep-mode.html), which provides zero-reload model switching for multi-model serving. + +It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**, where GPU resources must be freed between inference workloads. --- -## 1. Feature Documentation +## Omni Model -### Overview -Omni Sleep Mode provides a mechanism to "sleep" specific model stages. When a stage enters sleep, its physical VRAM is reclaimed by the system, while the process state is preserved for rapid "wake-up" without full re-initialization. +Omni models inherit this feature from vLLM’s Sleep Mode. -### Sleep Levels -We support two levels of hibernation to balance recovery speed and memory efficiency: +This means: -| Level | Name | Mechanism | Recovery Speed | Memory Freed | -| :--- | :--- | :--- | :--- | :--- | -| **Level 1** | **Weight Offloading** | Offloads weights to Host CPU RAM. | **Fast** (DMA) | Substantial | -| **Level 2** | **Full De-mapping** | Physically releases memory pages via VRAM scavenging. | **Moderate** | **Maximum** (up to 95%+) | +- Both Level 1 and Level 2 sleep are supported, allowing both model weights and the KV cache to be released and reset. -### Supported Platforms +## Diffusion Model Extension -Omni Sleep Mode is optimized for high-performance computing backends: +We added Sleep Mode support for **diffusion models**, which previously lacked this functionality. +In diffusion pipelines, this currently only offloads **model weight memory**, as these models typically do not use KV caches. -* **NVIDIA**: Supported via Virtual Memory Management (VMM). 
-* **AMD (ROCm)**: Fully supported with physical page de-mapping. -* **Intel XPU**: Supported via Level Zero memory management. -* **Huawei NPU**: Supported via Ascend memory scavenging. +This means: -### Hardware Requirements -* **Memory Considerations**: System RAM must be sufficient to hold offloaded weights during sleep. -* **TP Support**: Tensor Parallel groups synchronize sleep/wake transitions across all workers. +- Diffusion models can now enter Level 1 sleep. +- Pipeline states (e.g., noise schedulers, buffers) remain intact after waking. +- Useful for releasing VRAM between image generation or training cycles. --- +## Enable sleep mode +To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to `True` -## 2. Usage Examples - -### Python API Example -You can programmatically control the lifecycle of stages using the `AsyncOmni` engine. +Example: ```python - -import asyncio -from vllm_omni.entrypoints.async_omni import AsyncOmni - -async def run_sleep_demo(): - # 1. initialization - engine = AsyncOmni( - model="ByteDance-Seed/BAGEL-7B-MoT", - enable_sleep_mode=True - ) - - # 2. sleep mode level2 - acks = await engine.sleep(stage_ids=[0], level=2) - print(f"Freed {acks[0].freed_bytes / 1024**3:.2f} GiB on Stage 0") - - # 3. wake up - await engine.wake_up(stage_ids=[0]) - -if __name__ == "__main__": - asyncio.run(run_sleep_demo()) - -``` - -### server command Example -Start the server with sleep mode enabled: - -The first method - -``` - -vllm serve ByteDance-Seed/BAGEL-7B-MoT \ ---omni \ ---enable-sleep-mode \ ---trust-remote-code \ ---gpu-memory-utilization 0.7 - -``` - -The second method - -``` - -python3 -m vllm_omni.entrypoints.openai.api_server \ - --model ByteDance-Seed/BAGEL-7B-MoT \ - --omni \ - --enable-sleep-mode \ - --trust-remote-code \ ---gpu-memory-utilization 0.7 - -``` - - - - -### Test Scenarios & Commands - -#### Scenario 1: LLM Engine Sleep - -Objective: Verify VRAM reclamation for Stage 0 (Thinker). 
- -Trigger sleep (Level 1 or Level 2) via client: - +omni = Omni(model=...,enable_sleep_mode=True) ``` - -curl -X POST http://localhost:8000/v1/omni/sleep \ - -H "Content-Type: application/json" \ - -d '{"stage_ids": [0], "level": 2}' - -``` - -Tip: Open a new terminal and run rocm-smi or nvidia-smi or to observe the immediate drop in VRAM usage. - - - -#### Scenario 2: Diffusion Sleep -Objective: Verify VRAM reclamation for Stage 1 (Diffusion). - -Trigger sleep (Level 1 or Level 2) via client: - -``` - -curl -X POST http://localhost:8000/v1/omni/sleep \ - -H "Content-Type: application/json" \ - -d '{"stage_ids": [1], "level": 2}' - -``` - - - -#### Scenario 3: Multi-Stage Coordinated Stress Test -Objective: Test concurrent sleep and rapid wake-up across multiple stages. - -Concurrent Sleep (Stage 0 & 1): - -``` - -curl -X POST http://localhost:8000/v1/omni/sleep \ - -H "Content-Type: application/json" \ - -d '{"stage_ids": [0, 1], "level": 2}' - -``` - - -Rapid Wake-up: - -``` - -curl -X POST http://localhost:8000/v1/omni/wakeup \ - -H "Content-Type: application/json" \ - -d '{"stage_ids": [0, 1]}' - -``` - - -#### Scenario 4: Full Lifecycle Memory Audit & Functional Integrity -Objective: Audit the complete flow from Sleep to Wake-up followed by an Inference validation. - -Check Initial State: Observe baseline VRAM usage. - -Trigger Deep Sleep (Level 2): - -``` - -curl -X POST http://localhost:8000/v1/omni/sleep \ - -H "Content-Type: application/json" \ - -d '{"stage_ids": [0], "level": 2}' - -``` - -Wake-up Model: - -``` - -curl -X POST http://localhost:8000/v1/omni/wakeup \ - -H "Content-Type: application/json" \ - -d '{"stage_ids": [0]}' - -``` - -Verify Functional Integrity (Inference): -Ensure the model still generates valid output after reloading weights. 
- -``` - -curl -X POST http://localhost:8000/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "A huge swimming pool, with many people swimming.", - "model": "ByteDance-Seed/BAGEL-7B-MoT", - "response_format": "b64_json", - "extra_body": {"sampling_params": {"num_inference_steps": 4, "seed": 42}} - }' > post.json - -``` - - - - -## 3. API Reference - - -### Methods - -| Method | Arguments | Return Type | Description | -| :--- | :--- | :--- | :--- | -| **sleep** | `stage_ids: List[int], level: int` | `List[OmniACK]` | Triggers hibernation for specified stages. | -| **wake_up** | `stage_ids: List[int]` | `List[OmniACK]` | Reloads weights and re-maps memory. | - - - -### OmniACK Dataclass Fields - -| Field | Type | Description | -| :--- | :--- | :--- | -| **task_id** | `str` | Unique identifier for the operation. | -| **status** | `str` | `SUCCESS` or `ERROR`. | -| **stage_id** | `int` | The ID of the stage that responded. | -| **rank** | `int` | The rank ID within the Tensor Parallel group. | -| **freed_bytes** | `int` | Actual amount of physical VRAM reclaimed. | -| **metadata** | `dict` | Additional platform-specific metrics. | - -Metadata Field Analysis -The metadata field is a dynamic dictionary containing hardware-specific telemetry and audit data, primarily used for verifying memory reclamation on various backends (e.g., AMD ROCm, NVIDIA CUDA). - -``` -"metadata": { - "source": "Platform_AMD_Instinct_MI300X", - "total_freed_gib": "78.57", - "rank_residual_gib": "2.07" -} -``` - -#### Core Utility: -**VRAM Reclamation Audit (total_freed_gib)**: Converts raw freed_bytes into human-readable GiB. It serves as the primary metric to verify that Level 2 sleep has successfully purged model weights from VRAM. - -**Residual & Fragmentation Monitoring (rank_residual_gib)**: Reports the remaining VRAM footprint after memory de-mapping. 
A low residual value (e.g., 2.07 GiB) confirms a successful "clean" state, ensuring the device is ready for high-memory co-located tasks like training or diffusion pipelines. - -**Backend Traceability (source)**: Identifies the underlying hardware driver or audit source. This is critical for debugging synchronization issues in multi-stage, distributed environments. - -**Performance Analytics (Roadmap)**: Future updates will include latency_ms (context-switch overhead) and cuda_graph_recalled (graph engine status) to optimize performance in high-frequency sleep/wake scenarios. diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 89562c53c51..353fbe1c073 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -6,5 +6,4 @@ vLLM-Omni supports the following hardware platforms: - [NVIDIA CUDA](gpu.md) - [AMD ROCm](gpu.md) - [Intel XPU](gpu.md) - - [MThreads MUSA](gpu.md) - [NPU](npu.md) diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index d08f134b5d6..297c3666169 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -22,10 +22,6 @@ vLLM-Omni is a Python library that supports the following GPU variants. 
The libr --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements" -=== "MThreads MUSA" - - --8<-- "docs/getting_started/installation/gpu/musa.inc.md:requirements" - ## Set up using Python ### Create a new Python environment @@ -48,10 +44,6 @@ Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels" -=== "MThreads MUSA" - - --8<-- "docs/getting_started/installation/gpu/musa.inc.md:pre-built-wheels" - [](){ #build-from-source } ### Build wheel from source @@ -68,10 +60,6 @@ Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source" -=== "MThreads MUSA" - - --8<-- "docs/getting_started/installation/gpu/musa.inc.md:build-wheel-from-source" - ## Set up using Docker ### Pre-built images @@ -88,10 +76,6 @@ Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images" -=== "MThreads MUSA" - - --8<-- "docs/getting_started/installation/gpu/musa.inc.md:pre-built-images" - ### Build your own docker image === "AMD ROCm" @@ -101,7 +85,3 @@ Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc === "Intel XPU" --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-docker" - -=== "MThreads MUSA" - - --8<-- "docs/getting_started/installation/gpu/musa.inc.md:build-docker" diff --git a/docs/getting_started/installation/gpu/musa.inc.md b/docs/getting_started/installation/gpu/musa.inc.md deleted file mode 100644 index a7cbc848f58..00000000000 --- a/docs/getting_started/installation/gpu/musa.inc.md +++ /dev/null @@ -1,65 +0,0 @@ -# --8<-- [start:requirements] - -- GPU: Moore Threads GPU with MUSA SDK installed (validated on MTT S5000) - -# --8<-- [end:requirements] -# --8<-- [start:set-up-using-python] - -vLLM-Omni for MUSA requires building from 
source. Pre-built wheels are not currently available. - -!!! note - MUSA platform requires vLLM-MUSA to be installed first. - -# --8<-- [start:pre-built-wheels] - -# --8<-- [end:pre-built-wheels] - -# --8<-- [start:build-wheel-from-source] - -#### Prerequisites - -- **MUSA SDK**: Download from [MUSA SDK Download](https://developer.mthreads.com/sdk/download/musa) -- **torchada**: CUDA→MUSA compatibility layer for PyTorch (`pip install torchada`) -- **mthreads-ml-py**: MTML Python bindings (`pip install mthreads-ml-py`) -- **MATE**: MUSA AI Tensor Engine ([GitHub](https://github.com/MooreThreads/mate)) - -#### Installation of vLLM-MUSA - -```bash -git clone https://github.com/MooreThreads/vllm-musa.git -cd vllm-musa -git checkout v0.18.0-dev -pip install . --no-build-isolation -v -``` - -#### Installation of vLLM-Omni - -```bash -git clone https://github.com/vllm-project/vllm-omni.git -cd vllm-omni -VLLM_OMNI_TARGET_DEVICE=musa pip install -e . --no-build-isolation -``` - -For Gradio demos: - -```bash -pip install -e '.[demo]' --no-build-isolation -``` - -#### Environment Variables - -```bash -export MUSA_VISIBLE_DEVICES=0,1 -export VLLM_WORKER_MULTIPROC_METHOD=spawn -export VLLM_MUSA_CUSTOM_OP_USE_NATIVE=false -``` - -# --8<-- [end:build-wheel-from-source] - -# --8<-- [start:build-docker] - -# --8<-- [end:build-docker] - -# --8<-- [start:pre-built-images] - -# --8<-- [end:pre-built-images] diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 5dfea8d2ffe..1a683d174f7 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -26,7 +26,7 @@ uv pip install vllm-omni # Optional if want to run Qwen3 TTS uv pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm -uv pip install onnxruntime-rocm +uv pip install onnxruntime-rocm sox ``` # --8<-- [end:pre-built-wheels] diff --git 
a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index e3cfb1b6a86..0aed44a0c65 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -121,7 +121,6 @@ def add_parser(self, name, **kwargs): "FlexibleArgumentParser": _FlexibleArgumentParser, "make_arg_parser": lambda parser: parser, # no-op for doc "_ensure_vllm_platform": lambda: None, # no-op for doc - "nullify_stage_engine_defaults": lambda parser: None, # no-op for doc "VLLM_SUBCMD_PARSER_EPILOG": "", "logger": logger, "DummySubparsers": DummySubparsers, diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index feea969e51f..d611c0311c5 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -36,10 +36,7 @@ th { | `LTX2ImageToVideoPipeline` | LTX-2-I2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | | | `LTX2TwoStagesPipeline` | LTX-2-T2V | `rootonchair/LTX-2-19b-distilled` | ✅︎ | ✅︎ | | | | `LTX2ImageToVideoTwoStagesPipeline` | LTX-2-I2V | `rootonchair/LTX-2-19b-distilled` | ✅︎ | ✅︎ | | | -| `LTX23Pipeline` | LTX-2.3-T2V | `dg845/LTX-2.3-Diffusers` | ✅︎ | ✅︎ | | | -| `LTX23ImageToVideoPipeline` | LTX-2.3-I2V | `dg845/LTX-2.3-Diffusers` | ✅︎ | ✅︎ | | | | `HeliosPipeline`, `HeliosPyramidPipeline` | Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | ✅︎ | ✅︎ | ✅︎ | | -| `MagiHumanPipeline` | MagiHuman | `SII-GAIR/daVinci-MagiHuman-Base-1080p` | ✅︎ | ✅︎ | | | | `OvisImagePipeline` | Ovis-Image | `OvisAI/Ovis-Image` | ✅︎ | ✅︎ | | ✅︎ | | `LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `LongCatImageEditPipeline` | LongCat-Image-Edit | `meituan-longcat/LongCat-Image-Edit` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | @@ -49,21 +46,18 @@ th { | `Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `FluxKontextPipeline` | FLUX.1-Kontext-dev | 
`black-forest-labs/FLUX.1-Kontext-dev` | ✅︎ | ✅︎ | | | | `FluxPipeline` | FLUX.1-dev | `black-forest-labs/FLUX.1-dev` | ✅︎ | ✅︎ | | ✅︎ | -| `FluxPipeline` | FLUX.1-schnell | `black-forest-labs/FLUX.1-schnell` | ✅︎ | ✅︎ | | ✅︎ | | `OmniGen2Pipeline` | OmniGen2 | `OmniGen2/OmniGen2` | ✅︎ | ✅︎ | | ✅︎ | | `StableAudioPipeline` | Stable-Audio-Open | `stabilityai/stable-audio-open-1.0` | ✅︎ | ✅︎ | | ✅︎ | | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-Base | `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `NextStep11Pipeline` | NextStep-1.1 | `stepfun-ai/NextStep-1.1` | ✅︎ | ✅︎ | | ✅︎ | -| `MiMoAudioModel` | MiMo-Audio-7B-Instruct | `XiaomiMiMo/MiMo-Audio-7B-Instruct` | ✅︎ | ✅︎ | | | -| `MiMoV2ASRForCausalLM` | MiMo-V2.5-ASR | `XiaomiMiMo/MiMo-V2.5-ASR` | ✅︎ | ✅︎ | | | +| `MiMoAudioForConditionalGeneration` | MiMo-Audio-7B-Instruct | `XiaomiMiMo/MiMo-Audio-7B-Instruct` | ✅︎ | ✅︎ | | | | `Flux2Pipeline` | FLUX.2-dev | `black-forest-labs/FLUX.2-dev` | ✅︎ | ✅︎ | | | | `FishSpeechSlowARForConditionalGeneration` | Fish Speech S2 Pro | `fishaudio/s2-pro` | ✅︎ | ✅︎ | | | | `DreamIDOmniPipeline` | DreamID-Omni | `XuGuo699/DreamID-Omni` | ✅︎ | ✅︎ | | | | `HunyuanVideo15Pipeline` | HunyuanVideo-1.5-T2V | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v` | ✅︎ | ✅︎ | | | | `HunyuanVideo15ImageToVideoPipeline` | HunyuanVideo-1.5-I2V | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_i2v`, `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_i2v` | ✅︎ | ✅︎ | | | | `VoxtralTTSForConditionalGeneration` | Voxtral TTS | `mistralai/Voxtral-4B-TTS-2603` | ✅︎ | ✅︎ | | | -|`DyninOmniForConditionalGeneration` | Dynin-Omni | 
`snu-aidas/Dynin-Omni` | ✅︎ | | | | ✅︎ indicates the model is supported on that backend. Empty cells mean not listed as supported on that backend. diff --git a/docs/pr_reviewer.md b/docs/pr_reviewer.md new file mode 100644 index 00000000000..ad32355328b --- /dev/null +++ b/docs/pr_reviewer.md @@ -0,0 +1,255 @@ +# vLLM-Omni PR Reviewer + +## Overview + +The vLLM-Omni PR Reviewer is an automated code review bot powered by the GLM-4.7 AI model. It helps maintain code quality by providing intelligent feedback on pull requests. + +## Features + +- **Intelligent Code Analysis**: Leverages GLM-4.7 for understanding code context and providing meaningful feedback +- **Comprehensive Reviews**: Covers code quality, architecture, security, testing, and documentation +- **Structured Output**: Provides well-formatted reviews with clear sections and actionable suggestions +- **Rate Limiting**: Built-in cooldown mechanism to prevent excessive API usage +- **Retry Logic**: Automatic retries with exponential backoff for transient API failures +- **Defensive Parsing**: Robust validation of API responses to handle malformed data +- **Cost Control**: Only repository members/collaborators/owners can trigger reviews + +## How to Use + +### Triggering a Review + +To trigger an automated PR review, mention the bot in a PR comment: + +``` +@vllm-omni-reviewer please review +``` + +Or include the mention in your PR description: + +``` +@vllm-omni-reviewer +``` + +The bot will automatically review your changes and post a detailed comment.
+ +## What Gets Reviewed + +- **vLLM Architecture Compatibility**: Ensures changes align with vLLM's design patterns +- **Multi-modal Integration**: Reviews audio, vision, and text processing implementations +- **Performance Implications**: Analyzes impact on inference latency and throughput +- **Code Quality**: Checks Python best practices, type hints, and documentation +- **Security Considerations**: Identifies potential security vulnerabilities +- **Testing Coverage**: Recommends additional test cases when needed + +## Review Output + +The bot posts a structured review comment with: + +- **Overview**: Brief summary of the PR's purpose +- **Critical Issues (Must Fix)**: Blocking issues that need to be addressed +- **Important Issues (Should Fix)**: Significant concerns that should be resolved +- **Minor Issues & Suggestions**: Small improvements and optional suggestions +- **Positive Aspects**: Highlights well-implemented features +- **Performance Considerations**: Analysis of performance impact +- **Testing Recommendations**: Suggestions for additional tests +- **Overall Assessment**: Final recommendation (Approve/Request Changes/Needs Major Work) + +## Rate Limiting and Cooldown + +The bot includes a cooldown mechanism to prevent excessive API usage: + +- **Default cooldown**: 5 minutes between reviews per PR +- **Configurable**: Can be adjusted via `PR_REVIEWER_COOLDOWN_MINUTES` environment variable +- **Smart detection**: Checks for previous bot comments before starting a review + +If you trigger a review within the cooldown period, the bot will log a message and skip the review. 
+ +## Architecture + +``` +┌─────────────────┐ +│ PR Comment │ +│ @vllm-omni- │ +│ reviewer │ +└────────┬────────┘ + │ + ▼ +┌─────────────────────────────────┐ +│ GitHub Actions Workflow │ +│ (.github/workflows/ │ +│ pr-reviewer.yml) │ +│ │ +│ - Python 3.11 │ +│ - requests==2.31.0 │ +│ - pyyaml==6.0.1 │ +└────────┬────────────────────────┘ + │ + ▼ +┌─────────────────────────────────┐ +│ PR Reviewer Script │ +│ (.github/scripts/ │ +│ pr_reviewer.py) │ +│ │ +│ 1. Check cooldown │ +│ 2. Fetch PR details & diff │ +│ 3. Build review prompt │ +│ 4. Call GLM-4.7 API │ +│ (with retry logic) │ +│ 5. Validate response │ +│ 6. Post review comment │ +└────────┬────────────────────────┘ + │ + ▼ +┌─────────────────────────────────┐ +│ GLM-4.7 API │ +│ (open.bigmodel.cn) │ +└─────────────────────────────────┘ +``` + +1. **GitHub Actions Workflow** (`.github/workflows/pr-reviewer.yml`): Triggers on @mention +2. **Python Script** (`.github/scripts/pr_reviewer.py`): Fetches PR data and calls GLM-4.7 API +3. **GLM-4.7 API**: Provides intelligent code analysis + +## Testing + +### Testing the PR Reviewer Bot + +To test the PR reviewer bot before deploying to production: + +1. **Create a test PR** - Make a small, safe change (e.g., documentation update) +2. **Open the PR** - Create a pull request with a descriptive title +3. **Trigger the review** - Comment `@vllm-omni-reviewer` on the PR +4. 
**Monitor results** - Check the Actions tab for workflow execution logs + +### Running Unit Tests + +The bot includes comprehensive unit tests that can be run locally: + +```bash +# Run all tests +pytest .github/tests/test_pr_reviewer.py -v + +# Run specific test +pytest .github/tests/test_pr_reviewer.py::TestCheckTrigger -v + +# Run with coverage +pytest .github/tests/test_pr_reviewer.py --cov=.github/scripts/pr_reviewer.py --cov-report=term-missing +``` + +### What to Look For + +When testing, verify that: +- [ ] The workflow triggers on the `@vllm-omni-reviewer` comment +- [ ] The cooldown mechanism works correctly +- [ ] The GLM API call completes without errors (with retry if needed) +- [ ] A review comment is posted to the PR +- [ ] The review content is meaningful and well-structured +- [ ] The cost is within the expected range (0.50-5 CNY) + +### Safe Test Changes + +For testing, consider making these types of safe changes: +- Documentation updates (like adding this Testing section) +- Comment improvements +- README enhancements +- Non-functional file additions + +### Example Test PR + +A good test PR might: +- Update a documentation file +- Add explanatory comments +- Improve code formatting +- Fix a minor typo + +These changes are safe to merge if the test is successful and won't affect functionality. + +## Troubleshooting + +### Bot Doesn't Respond + +1. **Check permissions** - Verify you have Owner/Member/Collaborator access +2. **Check Actions tab** - Look for workflow execution and view logs +3. **Check cooldown** - If another review was posted recently, wait for the cooldown period +4. 
**Check API key** - Ensure `GLM_API_KEY` is configured in repository secrets + +### API Errors + +If the GLM API call fails: +- Check the Actions tab for detailed error logs +- Verify the `GLM_API_KEY` secret is correctly configured +- Ensure sufficient API quota is available +- The bot will automatically retry up to 3 times with exponential backoff + +### Review Seems Truncated + +If the review appears incomplete: +- Large diffs may be truncated at 100,000 characters +- Check the logs for truncation warnings +- Consider breaking large PRs into smaller chunks + +## Configuration + +### Required Secrets + +The following secret must be configured in the repository settings: + +- `GLM_API_KEY` - Your GLM (BigModel) API key for accessing the GLM-4.7 API + +To add the secret: +1. Go to repository Settings → Secrets and variables → Actions +2. Click "New repository secret" +3. Name: `GLM_API_KEY` +4. Value: Your GLM API key + +### Optional Configuration + +The following optional environment variables can be set in the workflow file: + +| Variable | Default | Description | +|----------|---------|-------------| +| `GLM_API_URL` | `https://open.bigmodel.cn/api/paas/v4/chat/completions` | GLM API endpoint | +| `GLM_MODEL` | `glm-4.7` | Model to use for reviews | +| `PR_REVIEWER_COOLDOWN_MINUTES` | `5` | Cooldown period between reviews | +| `PR_REVIEWER_MAX_RETRIES` | `3` | Maximum API retry attempts | +| `PR_REVIEWER_RETRY_DELAY` | `1.0` | Base delay for retry backoff (seconds) | +| `PR_REVIEWER_MAX_DIFF_SIZE` | `100000` | Maximum diff size before truncation | + +### Workflow Customization + +The workflow can be customized in `.github/workflows/pr-reviewer.yml`: +- Change Python version (default: 3.11) +- Adjust timeout value (default: 10 minutes) +- Modify trigger conditions +- Add additional dependencies + +## Code Quality + +The PR reviewer script follows vllm-omni coding standards: + +- **Type hints**: All functions have complete type hints following mypy strict mode +- 
**Logging**: Uses Python's logging module for structured logging +- **Testing**: Comprehensive unit tests with pytest +- **Pre-commit**: Script is checked by pre-commit hooks (flake8) + +## Cost Estimate + +| Component | Cost | +|-----------|------| +| GitHub Actions (public repo) | Free | +| GLM API (glm-4.7) | ~0.50-5 CNY per PR (varies by size) | +| Total (20 PRs/month) | ~10-100 CNY/month (~$2-15 USD) | + +## Contributing + +To improve the PR reviewer bot: + +1. Edit `.github/scripts/pr_reviewer.py` for logic changes +2. Edit `.github/workflows/pr-reviewer.yml` for workflow changes +3. Add tests to `.github/tests/test_pr_reviewer.py` +4. Run `pre-commit run --files .github/scripts/pr_reviewer.py` to check code quality +5. Test thoroughly with a test PR before deploying to production + +## License + +This bot is part of the vLLM-Omni project and follows the same license terms. diff --git a/docs/serving/image_edit_api.md b/docs/serving/image_edit_api.md index 79303e1a690..d254ac06ad7 100644 --- a/docs/serving/image_edit_api.md +++ b/docs/serving/image_edit_api.md @@ -104,8 +104,6 @@ Content-Type: multipart/form-data | `guidance_scale` | float | model defaults | Classifier-free guidance scale (typically 0.0-20.0) | | `true_cfg_scale` | float | model defaults | True CFG scale (model-specific parameter, may be ignored if not supported) | | `seed` | integer | null | Random seed for reproducibility | -| `reference_image` | string or array | null | Reference image for inpainting | -| `mask_image` | string or array | null | Mask for inpainting (white areas will be inpainted) | ### Response Format diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md index 8f78d6a2001..ecbe8d9ac98 100644 --- a/docs/serving/speech_api.md +++ b/docs/serving/speech_api.md @@ -15,17 +15,28 @@ Each server instance runs a single model (specified at startup via `vllm serve < ```bash # Qwen3-TTS: CustomVoice model (predefined speakers) vllm serve
Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ --enforce-eager # Fish Speech S2 Pro -vllm serve fishaudio/s2-pro --omni --port 8091 +vllm-omni serve fishaudio/s2-pro \ + --stage-configs-path vllm_omni/model_executor/stage_configs/fish_speech_s2_pro.yaml \ + --omni \ + --port 8091 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 # Voxtral TTS -vllm serve mistralai/Voxtral-4B-TTS-2603 --omni --port 8091 +vllm serve mistralai/Voxtral-4B-TTS-2603 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ + --omni \ + --port 8091 \ + --trust-remote-code \ + --enforce-eager ``` ### Generate Speech @@ -289,7 +300,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ ```bash # Start server with VoiceDesign model first vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -311,7 +322,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ ```bash # Start server with Base model first vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -506,16 +517,15 @@ for result in response.json()["results"]: All items are fanned out to `generate()` concurrently. The engine's stage worker automatically batches them up to the configured `max_batch_size` and queues the rest — no client-side throttling needed. 
-For best throughput, set both stages' `max_num_seqs` to ≥4 via `--stage-overrides`: +For best throughput, use a batch-optimized stage config with `max_batch_size > 1`: ```bash vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --omni --port 8091 --trust-remote-code --enforce-eager \ - --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2}, - "1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml \ + --omni --port 8091 --trust-remote-code --enforce-eager ``` -The bundled `qwen3_tts.yaml` uses `max_num_seqs: 1` (single request) on both stages. Bumping to 4 yields roughly 4× throughput on the talker and lets stage 1 batch chunks across in-flight requests. +The default `qwen3_tts.yaml` uses `max_batch_size: 1` (single request). The `qwen3_tts_batch.yaml` config sets `max_batch_size: 4` for ~4x throughput. ## Supported Models @@ -607,7 +617,7 @@ Enable debug logging: ```bash vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ diff --git a/docs/serving/video_stream_api.md b/docs/serving/video_stream_api.md deleted file mode 100644 index 88f74affca9..00000000000 --- a/docs/serving/video_stream_api.md +++ /dev/null @@ -1,93 +0,0 @@ -# Streaming Video Input API - -vLLM-Omni provides a WebSocket API for streaming video frames and optional audio chunks into Qwen3-Omni, then asking questions over the buffered session context. - -Each server instance runs a single model specified at startup with `vllm serve --omni`. 
- -## Quick Start - -### Start the Server - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct \ - --deploy-config vllm_omni/deploy/qwen3_omni.yaml \ - --omni \ - --port 8000 \ - --trust-remote-code -``` - -### Run the Example Client - -```bash -python examples/online_serving/qwen3_omni/streaming_video_client.py \ - --url ws://localhost:8000/v1/video/chat/stream \ - --video /path/to/video.mp4 \ - --query "Describe what is happening in the video." -``` - -## API Reference - -### Endpoint - -```text -WebSocket /v1/video/chat/stream -``` - -### Protocol - -| Direction | Type | Required fields | Description | -|-----------|------|-----------------|-------------| -| Client -> Server | `session.config` | none | First message. Configures output modalities, frame sampling, EVS, and prompts. | -| Client -> Server | `video.frame` | `data` | Base64 JPEG/PNG frame. | -| Client -> Server | `audio.chunk` | `data` | Base64 PCM16 16 kHz mono audio bytes. | -| Client -> Server | `video.query` | `text` | Ask a question over the buffered frames and audio. | -| Client -> Server | `video.done` | none | End the WebSocket session. | -| Server -> Client | `response.start` | none | Query generation started. | -| Server -> Client | `response.text.delta` | `delta` | Incremental text output. | -| Server -> Client | `response.text.done` | `text` | Final text output for the query. | -| Server -> Client | `response.audio.delta` | `data`, `format` | Incremental generated audio, base64 WAV. | -| Server -> Client | `response.audio.done` | none | Audio output finished. | -| Server -> Client | `session.done` | none | Session closed. | -| Server -> Client | `error` | `message` | Recoverable protocol or generation error. | - -### `session.config` Fields - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `model` | string or null | null | Optional model name. Usually omitted because the server hosts one model. 
| -| `modalities` | list[string] | `["text", "audio"]` | Output modalities. Use `["text"]`, `["audio"]`, or both. | -| `num_frames` | integer, 1-128 | `4` | Number of buffered frames sampled for each query. | -| `max_frames` | integer, 1-256 | `50` | Maximum retained frame buffer size. Oldest frames are evicted first. | -| `system_prompt` | string or null | null | Optional custom system prompt. | -| `use_audio_in_video` | bool | `true` | Include streamed audio chunks in multimodal video understanding when audio is present. | -| `sampling_params_list` | list or null | null | Optional per-stage sampling parameter overrides. | -| `enable_frame_filter` | bool | `true` | Enable EVS near-duplicate frame filtering. | -| `frame_filter_threshold` | float, 0.0-1.0 | `0.95` | EVS similarity threshold. Higher keeps more frames; lower drops more near-duplicates. | - -### Legacy Aliases - -The server accepts these legacy field names and rewrites them before validation. New clients should send the canonical names above. - -| Legacy field | Canonical field | -|--------------|-----------------| -| `num_sample_frames` | `num_frames` | -| `evs_enabled` | `enable_frame_filter` | -| `evs_threshold` | `frame_filter_threshold` | - -### Environment Variables - -| Variable | Values | Default | Description | -|----------|--------|---------|-------------| -| `VLLM_VIDEO_ASYNC_CHUNK` | `on`, `off` | `on` | Wire-level streaming switch. `off` buffers server-side deltas and emits coalesced outputs at the end of a query. | -| `VLLM_VIDEO_AUDIO_DELTA_MODE` | `fast`, `slow` | `fast` | Audio delta extraction strategy. `fast` emits only newly produced chunks; `slow` recomputes from accumulated audio and exists for A/B verification. | - -## EVS Semantics - -EVS compares downsampled frames and drops near-duplicate frames before they enter the session frame buffer. 
`frame_filter_threshold` controls retention: higher values are more permissive and keep more frames; lower values are more aggressive and drop more similar frames. - -## Known Limitations - -- Session KV reuse and incremental prefill are not implemented in this PR. Each `video.query` rebuilds the model prompt from the retained frame and audio buffers. -- Back-to-back short replies can still expose an engine-layer scheduler race. The PR notes an observed workaround of at least 200 ms idle between turns when clients repeatedly see idle timeouts. -- If the audio buffer exceeds the server limit, the server emits `Audio buffer overflow` and clears the currently buffered audio for the session. -- The API is intended for Qwen3-Omni streaming video understanding; other models may not support the same multimodal processor arguments. diff --git a/docs/source/architecture/async-chunk-architecture.png b/docs/source/architecture/async-chunk-architecture.png index 7b3e95e4df9..249de53bfe3 100644 Binary files a/docs/source/architecture/async-chunk-architecture.png and b/docs/source/architecture/async-chunk-architecture.png differ diff --git a/docs/source/architecture/qwen3-omni-async-chunk.png b/docs/source/architecture/qwen3-omni-async-chunk.png index e73ca84b283..b2d98b80f33 100644 Binary files a/docs/source/architecture/qwen3-omni-async-chunk.png and b/docs/source/architecture/qwen3-omni-async-chunk.png differ diff --git a/docs/source/architecture/qwen3-omni-non-async-chunk.png b/docs/source/architecture/qwen3-omni-non-async-chunk.png index 47a9ba66a5e..da5610a11bb 100644 Binary files a/docs/source/architecture/qwen3-omni-non-async-chunk.png and b/docs/source/architecture/qwen3-omni-non-async-chunk.png differ diff --git a/docs/source/architecture/vllm-omni-dataflow-between-stages.png b/docs/source/architecture/vllm-omni-dataflow-between-stages.png index 74abc81ff07..cdbc9a8b7b3 100644 Binary files a/docs/source/architecture/vllm-omni-dataflow-between-stages.png and 
b/docs/source/architecture/vllm-omni-dataflow-between-stages.png differ diff --git a/docs/usage/faq.md b/docs/usage/faq.md index 0539e158b01..c080eae4023 100644 --- a/docs/usage/faq.md +++ b/docs/usage/faq.md @@ -4,6 +4,14 @@ A: Now, we support natively disaggregated deployment for different model stages within a model. There is a restriction that one chip can only have one AutoRegressive model stage. This is because the unified KV cache management of vLLM. Stages of other types can coexist within a chip. The restriction will be resolved in later version. +> Q: When trying to run the examples, I encounter an error about the librosa or soundfile backend. How can I solve it? + +A: If you encounter an error about the librosa backend, try installing ffmpeg with the commands below. +``` +sudo apt update +sudo apt install ffmpeg +``` + > Q: I see GPU OOM or "free memory is less than desired GPU memory utilization" errors. How can I fix it? A: Refer to [GPU memory calculation and configuration](../configuration/gpu_memory_utilization.md) for guidance on tuning `gpu_memory_utilization` and related settings. diff --git a/docs/user_guide/diffusion/attention_backends.md b/docs/user_guide/diffusion/attention_backends.md deleted file mode 100644 index 692bcc0f9d0..00000000000 --- a/docs/user_guide/diffusion/attention_backends.md +++ /dev/null @@ -1,120 +0,0 @@ -# Diffusion Attention Backends - -This document describes the diffusion attention backends available in vLLM-Omni, how to select them, and how to use SageAttention. - -## Overview - -Diffusion attention backend selection is controlled by the `DIFFUSION_ATTENTION_BACKEND` environment variable and resolved in `vllm_omni.diffusion.attention.selector`. - -This backend is used by diffusion attention layers such as the DiT attention in video and image generation models. - -On CUDA, the practical choices today are: - -- `FLASH_ATTN`: FlashAttention backend. This is the default on supported CUDA systems when FlashAttention is installed.
-- `TORCH_SDPA`: PyTorch `scaled_dot_product_attention`. -- `SAGE_ATTN`: SageAttention backend, if `sageattention` is installed. - -If `DIFFUSION_ATTENTION_BACKEND` is unset, vLLM-Omni asks the current platform to choose the default backend. On CUDA, that normally means `FLASH_ATTN` when available, otherwise `TORCH_SDPA`. - -## Backend Options - -| Value | Notes | -|---|---| -| `FLASH_ATTN` | Default on CUDA when FlashAttention is available. Good default for most diffusion workloads. | -| `TORCH_SDPA` | Most conservative fallback. Useful for debugging or compatibility. | -| `SAGE_ATTN` | Requires `sageattention`. Can improve performance on some workloads, but output quality must be validated model-by-model. | - -## Selection Priority - -Diffusion attention backend selection follows this order: - -1. `DIFFUSION_ATTENTION_BACKEND` -2. Platform default - -Example: - -```bash -export DIFFUSION_ATTENTION_BACKEND=SAGE_ATTN -``` - -## SageAttention Installation - -vLLM-Omni expects SageAttention to be installed into the same Python environment as vLLM-Omni. - -Build from source: - -```bash -git clone https://github.com/thu-ml/SageAttention.git -cd SageAttention - -export EXT_PARALLEL=4 NVCC_APPEND_FLAGS="--threads 8" MAX_JOBS=32 -pip install . --no-build-isolation -``` - -Quick check: - -```bash -python -c "import sageattention; print(sageattention.__file__)" -``` - -## Usage - -### Enable SageAttention - -Example: HunyuanVideo-1.5 text-to-video - -```bash -DIFFUSION_ATTENTION_BACKEND=SAGE_ATTN python examples/offline_inference/text_to_video/text_to_video.py \ - --model hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v \ - --prompt "A dog running across a field of golden wheat." 
\ - --height 480 --width 832 --num-frames 33 \ - --num-inference-steps 30 --seed 42 --guidance-scale 6.0 \ - --tensor-parallel-size 2 \ - --output ../tmp/hv15_modelopt_sage.mp4 -``` - -Example: Wan2.2 TI2V 5B - -```bash -DIFFUSION_ATTENTION_BACKEND=SAGE_ATTN python examples/offline_inference/text_to_video/text_to_video.py \ - --model Wan-AI/Wan2.2-TI2V-5B-Diffusers \ - --prompt "A dog running across a field of golden wheat." \ - --height 704 --width 1280 --num-frames 49 \ - --num-inference-steps 30 --seed 42 --guidance-scale 5.0 \ - --tensor-parallel-size 2 \ - --output outputs/wan22_sage.mp4 -``` - -### Compare Against FlashAttention - -Unset the backend override, or explicitly use `FLASH_ATTN`: - -```bash -python examples/offline_inference/text_to_video/text_to_video.py \ - --model Wan-AI/Wan2.2-TI2V-5B-Diffusers \ - --prompt "A dog running across a field of golden wheat." \ - --height 704 --width 1280 --num-frames 49 \ - --num-inference-steps 30 --seed 42 --guidance-scale 5.0 \ - --tensor-parallel-size 2 \ - --output outputs/wan22_fa3.mp4 -``` - -## Validation Guidance - -Do not assume that a faster attention backend is numerically interchangeable with `FLASH_ATTN`. - -Always compare: - -- End-to-end runtime -- DiT / diffusion stage runtime -- Output quality against a known-good baseline - -At minimum, keep the same: - -- model -- prompt -- seed -- resolution -- frame count -- inference steps -- parallel config diff --git a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md index eaaca84ad6d..824e8c93051 100644 --- a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md +++ b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md @@ -283,10 +283,3 @@ Using Cache-DiT acceleration: 1. ✅ **Enable Cache-DiT** - Set `cache_backend="cache_dit"` to get 1.5x-3x speedup with optimized defaults 2. 
✅ **(Optional) Customize** - Adjust `cache_config` parameters for specific speed/quality trade-offs - ---- - -## Additional Resources - -- [Cache-DiT documentation](https://cache-dit.readthedocs.io/en/latest/) -- [Cache-DiT API reference](https://cache-dit.readthedocs.io/en/latest/user_guide/CACHE_API/) diff --git a/docs/user_guide/diffusion/cpu_offload_diffusion.md b/docs/user_guide/diffusion/cpu_offload_diffusion.md index 1d3f1811aed..8786ae9649a 100644 --- a/docs/user_guide/diffusion/cpu_offload_diffusion.md +++ b/docs/user_guide/diffusion/cpu_offload_diffusion.md @@ -36,45 +36,6 @@ m = Omni(model="Wan-AI/Wan2.2-T2V-A14B-Diffusers", enable_cpu_offload=True) vllm-omni serve diffusion Wan-AI/Wan2.2-T2V-A14B-Diffusers --enable-cpu-offload ``` -### To Support a Model - -Implement the `SupportsModuleOffload` protocol to declare which -submodules participate in offloading: - -```python -from typing import ClassVar -from vllm_omni.diffusion.models.interface import SupportsModuleOffload - -class MyPipeline(nn.Module, SupportsModuleOffload): - _dit_modules: ClassVar[list[str]] = ["transformer"] - _encoder_modules: ClassVar[list[str]] = ["text_encoder", "vision_model"] - _vae_modules: ClassVar[list[str]] = ["vae"] - _resident_modules: ClassVar[list[str]] = [] # optional - - def __init__(self): - super().__init__() - self.transformer = ... # DiT — stays on GPU during denoising - self.text_encoder = ... # Encoder — offloaded to CPU during denoising - self.vision_model = ... # Encoder — offloaded to CPU during denoising - self.vae = ... # VAE — always on GPU -``` - -- `_dit_modules`: attribute names of denoising submodules (kept on GPU - during the diffusion loop). -- `_encoder_modules`: attribute names of encoder/vision submodules - (offloaded to CPU during the diffusion loop). -- `_vae_modules`: attribute names of VAE(s) (always kept on GPU, not - part of the mutual exclusion hooks). 
-- `_resident_modules`: attribute names of small submodules that must - stay on GPU during layerwise offloading (e.g. embedders, connectors). - Optional — defaults to `[]`. - -All attribute names support dotted paths for nested submodules -(e.g. `"pipe.transformer"`, `"bagel.time_embedder"`). - -Both DiT and encoder lists are needed because the offload hooks use -mutual exclusion: when one group runs, the other moves to CPU. - ### Limitations - Cold start latency increases - Adds overhead from CPU-GPU transfers between encoder and denoising phases @@ -130,19 +91,12 @@ Models must define the blocks attribute name for layerwise offloading: ```python class WanTransformer3DModel(nn.Module): - _layerwise_offload_blocks_attrs = ["blocks"] # Attribute names containing transformer blocks + _layerwise_offload_blocks_attr = "blocks" # Attribute name containing transformer blocks def __init__(self): self.blocks = nn.ModuleList([...]) # Transformer blocks ``` -For models with multiple block types: - -```python -class Flux2Transformer2DModel(nn.Module): - _layerwise_offload_blocks_attrs = ["transformer_blocks", "single_transformer_blocks"] -``` - ### Limitations - Cold start latency increases because of 1) components are loaded to CPU first at the very first during initialization, @@ -155,19 +109,11 @@ class Flux2Transformer2DModel(nn.Module): **Module Discovery** -The offloader discovers pipeline components in two ways: +The offloader automatically discovers pipeline components: -1. **Protocol-based** (preferred): If the pipeline implements - `SupportsModuleOffload`, its `_dit_modules`, `_encoder_modules`, - `_vae_modules`, and `_resident_modules` class variables are used - directly. All attribute names support dotted paths (e.g. - `"pipe.transformer"`, `"bagel.time_embedder"`) for nested submodules. - -2. 
**Fallback attribute scan**: Otherwise, the offloader scans for - well-known attribute names: - - **DiT modules**: `transformer`, `transformer_2`, `dit`, `sr_dit`, `language_model`, `transformer_blocks`, `model` - - **Encoders**: `text_encoder`, `text_encoder_2`, `text_encoder_3`, `image_encoder` - - **VAE**: `vae`, `audio_vae` +- **DiT modules**: `transformer`, `transformer_2`, `dit` +- **Encoders**: `text_encoder`, `text_encoder_2`, `text_encoder_3`, `image_encoder` +- **VAE**: `vae` **Hook System** @@ -186,17 +132,12 @@ Factory function `get_offload_backend()` selects the appropriate backend based o ## Supported Models -| Architecture | Example Models | DiT Class | Model-Level Offload | Layerwise Offload | Blocks Attrs (Layerwise specific) | -|--------------|----------------|-----------|---------------------|-------------------|-----------------------------------| -| LongCatImagePipeline | `meituan-longcat/LongCat-Image` | `LongCatImageTransformer2DModel` | - | ✓ | `"transformer_blocks"`, `"single_transformer_blocks"` | -| NextStep11Pipeline | `stepfun-ai/NextStep-1.1` | `NextStepModel` | - | ✓ | `"layers"` | -| OvisImagePipeline | `AIDC-AI/Ovis-Image-7B` | `OvisImageTransformer2DModel` | - | ✓ | `"transformer"` | -| QwenImagePipeline | `Qwen/Qwen-Image` | `QwenImageTransformer2DModel` | ✓ | ✓ | `"transformer_blocks"` | -| StableDiffusion3Pipeline | `stabilityai/stable-diffusion-3.5-medium` | `SD3Transformer2DModel` | - | ✓ | `"transformer_blocks"` | -| Wan22I2VPipeline | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` | +| Architecture | Example Models | DiT Class | Model-Level Offload | Layerwise Offload | Blocks Attr (Layerwise specific) | +|--------------|----------------|-----------|---------------------|-------------------|-------------| | Wan22Pipeline | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` | -| BagelPipeline | `ByteDance-Seed/BAGEL-7B-MoT` | `Qwen2MoTModel` | - | ✓ | 
`"layers"`, `"customized modules"` | +| Wan22I2VPipeline | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` | +| QwenImagePipeline | `Qwen/Qwen-Image` | `QwenImageTransformer2DModel` | ✓ | ✓ | `"transformer_blocks"` | **Notes:** - Model-Level Offloading is expected to be supported by all common diffusion models (DiT and encoders) naturally -- Layerwise Offloading requires DiT class to define `_layerwise_offload_blocks_attrs` pointing to transformer blocks +- Layerwise Offloading requires DiT class to define `_layerwise_offload_blocks_attr` pointing to transformer blocks diff --git a/docs/user_guide/diffusion/frame_interpolation.md b/docs/user_guide/diffusion/frame_interpolation.md deleted file mode 100644 index 349af50c51c..00000000000 --- a/docs/user_guide/diffusion/frame_interpolation.md +++ /dev/null @@ -1,92 +0,0 @@ -# Frame Interpolation - -## Overview - -vLLM-Omni supports post-generation frame interpolation for supported video -diffusion pipelines. This feature inserts synthesized intermediate frames -between adjacent generated frames to improve temporal smoothness without -rerunning the diffusion denoising loop. - -Frame interpolation runs in the diffusion worker post-processing path instead -of the API server encoding path. This allows the interpolation step to reuse -the worker's current accelerator device and keeps the FastAPI event loop free -from heavy synchronous PyTorch work. - -For an input video with `N` generated frames and interpolation exponent `exp`, -the output frame count is: - -```text -(N - 1) * 2**exp + 1 -``` - -The output FPS is multiplied by `2**exp` so the clip duration remains close to -the original generated video. 
- -## Supported Pipelines - -Frame interpolation is currently supported for: - -- `WanPipeline` (Wan2.2 text-to-video) -- `WanImageToVideoPipeline` -- `Wan22TI2VPipeline` - -## Request Parameters - -The video APIs `/v1/videos` and `/v1/videos/sync` accept: - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `enable_frame_interpolation` | bool | `false` | Enable post-generation frame interpolation | -| `frame_interpolation_exp` | int | `1` | Interpolation exponent. `1=2x`, `2=4x`, etc. | -| `frame_interpolation_scale` | float | `1.0` | RIFE inference scale | -| `frame_interpolation_model_path` | str | `None` | Local directory or Hugging Face repo ID containing `flownet.pkl` | - -## Execution Flow - -For supported Wan2.2 pipelines, the execution order is: - -1. Diffusion worker finishes denoising and decodes the raw video tensor. -2. Worker-side model-specific post-processing runs. -3. If frame interpolation is enabled, RIFE interpolates the decoded video - tensor on the worker side and records a FPS multiplier in `custom_output`. -4. The API server receives the already-interpolated video and only performs - MP4 export. - -This design keeps interpolation close to the generated tensor and avoids -introducing another heavyweight GPU context in the API server process. - -## Example - -Start the server: - -```bash -vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni --port 8091 -``` - -Run a sync request with interpolation enabled: - -```bash -curl -X POST http://localhost:8091/v1/videos/sync \ - -F "prompt=A dog running through a park" \ - -F "num_frames=81" \ - -F "width=832" \ - -F "height=480" \ - -F "fps=16" \ - -F "num_inference_steps=40" \ - -F "guidance_scale=1.0" \ - -F "guidance_scale_2=1.0" \ - -F "enable_frame_interpolation=true" \ - -F "frame_interpolation_exp=1" \ - -F "frame_interpolation_scale=1.0" \ - -F "seed=42" \ - -o sync_t2v_interpolated.mp4 -``` - -## Notes - -- This is a post-processing feature. 
It does not modify the diffusion denoising - schedule. -- Higher interpolation exponents increase post-processing time and memory usage. -- If the interpolation model weights are not available locally, - `frame_interpolation_model_path` may point to a Hugging Face repo containing - `flownet.pkl`. diff --git a/docs/user_guide/diffusion/lora.md b/docs/user_guide/diffusion/lora.md index 256698752a1..e45c033b848 100644 --- a/docs/user_guide/diffusion/lora.md +++ b/docs/user_guide/diffusion/lora.md @@ -56,92 +56,6 @@ outputs = omni.generate( !!! note "Server-side Path Requirement" The LoRA adapter path (`local_path`) must be readable on the **server** machine. If your client and server are on different machines, ensure the LoRA adapter is accessible via a shared mount or copied to the server. -## Wan2.2 LightX2V Offline Assembly - -This workflow is LoRA-adjacent: it uses external LightX2V conversion plus -`Wan2.2-Distill-Loras` to bake converted Wan2.2 I2V checkpoints into a local -Diffusers directory, instead of loading LoRA adapters at runtime. - -### Required assets - -- Base model: `Wan-AI/Wan2.2-I2V-A14B` -- Diffusers skeleton: `Wan-AI/Wan2.2-I2V-A14B-Diffusers` -- Optional external converter from the LightX2V project (not shipped in this repository) -- Optional LoRA weights: `lightx2v/Wan2.2-Distill-Loras` - -### Step 1: Optional - convert high/low-noise DiT weights with LightX2V - -Install or clone LightX2V from the upstream repository -(`https://github.com/ModelTC/LightX2V`). After cloning, the converter used -below is available at `/tools/convert/converter.py`. 
- -```bash -python /path/to/lightx2v/tools/convert/converter.py \ - --source /path/to/Wan2.2-I2V-A14B/high_noise_model \ - --output /tmp/wan22_lightx2v/high_noise_out \ - --output_ext .safetensors \ - --output_name diffusion_pytorch_model \ - --model_type wan_dit \ - --direction forward \ - --lora_path /path/to/wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_1022.safetensors \ - --lora_key_convert auto \ - --single_file - -python /path/to/lightx2v/tools/convert/converter.py \ - --source /path/to/Wan2.2-I2V-A14B/low_noise_model \ - --output /tmp/wan22_lightx2v/low_noise_out \ - --output_ext .safetensors \ - --output_name diffusion_pytorch_model \ - --model_type wan_dit \ - --direction forward \ - --lora_path /path/to/wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_1022.safetensors \ - --lora_key_convert auto \ - --single_file -``` - -If you are not using LightX2V, skip this step and either keep the original -Diffusers weights from the skeleton or point Step 2 at any other converted -`transformer/` and `transformer_2/` checkpoints. - -### Step 2: Assemble a final Diffusers-style directory - -```bash -python tools/wan22/assemble_wan22_i2v_diffusers.py \ - --diffusers-skeleton /path/to/Wan2.2-I2V-A14B-Diffusers \ - --transformer-weight /tmp/wan22_lightx2v/high_noise_out \ - --transformer-2-weight /tmp/wan22_lightx2v/low_noise_out \ - --output-dir /path/to/Wan2.2-I2V-A14B-Custom-Diffusers \ - --asset-mode symlink \ - --overwrite -``` - -`--transformer-weight` and `--transformer-2-weight` are optional. If you omit -them, the tool keeps the original weights from the Diffusers skeleton. 
- -### Step 3: Run offline inference - -```bash -python examples/offline_inference/image_to_video/image_to_video.py \ - --model /path/to/Wan2.2-I2V-A14B-Custom-Diffusers \ - --image /path/to/input.jpg \ - --prompt "A cat playing with yarn" \ - --num-frames 81 \ - --num-inference-steps 4 \ - --tensor-parallel-size 4 \ - --height 480 \ - --width 832 \ - --flow-shift 12 \ - --sample-solver euler \ - --guidance-scale 1.0 \ - --guidance-scale-high 1.0 \ - --boundary-ratio 0.875 -``` - -Notes: - -- This route avoids runtime LoRA loading changes in vLLM-Omni when you choose to bake converted weights into a local Diffusers directory. -- Output quality and speed depend on the replacement checkpoints and sampling params you choose. - ## See Also diff --git a/docs/user_guide/diffusion/quantization/autoround.md b/docs/user_guide/diffusion/quantization/autoround.md index d06627d40a3..48df176b037 100644 --- a/docs/user_guide/diffusion/quantization/autoround.md +++ b/docs/user_guide/diffusion/quantization/autoround.md @@ -72,8 +72,6 @@ At load time: | Model | HF Checkpoint | Scheme | Group Size | Backend | |-------|--------------|--------|------------|---------| | FLUX.1-dev | `vllm-project-org/FLUX.1-dev-AutoRound-w4a16` | W4A16 | 128 | GPTQ-Marlin | -| Qwen2.5-Omni-7B | `Intel/Qwen2.5-Omni-7B-int4-AutoRound` | W4A16 | 128 | GPTQ-Marlin | -| Qwen3-Omni-30B-A3B-Instruct | `Intel/Qwen3-Omni-30B-A3B-Instruct-int4-AutoRound` | W4A16 | 128 | GPTQ-Marlin | ## Creating a Quantized Checkpoint diff --git a/docs/user_guide/diffusion/quantization/fp8.md b/docs/user_guide/diffusion/quantization/fp8.md index ceb3d006c2e..9906631b625 100644 --- a/docs/user_guide/diffusion/quantization/fp8.md +++ b/docs/user_guide/diffusion/quantization/fp8.md @@ -65,7 +65,6 @@ The available `ignored_layers` names depend on the model architecture (e.g., `to | Flux | `black-forest-labs/FLUX.1-dev` | All layers | None | | HunyuanImage-3 | `tencent/HunyuanImage3` | All layers | None | | HunyuanVideo-1.5 | 
`hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | All layers | None | -| GLM-Image | `zai-org/GLM-Image` | All layers | None | | Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | All layers | None | ## Combining with Other Features diff --git a/docs/user_guide/diffusion/quantization/msmodelslim.md b/docs/user_guide/diffusion/quantization/msmodelslim.md deleted file mode 100644 index 5492cd9272b..00000000000 --- a/docs/user_guide/diffusion/quantization/msmodelslim.md +++ /dev/null @@ -1,56 +0,0 @@ -# msModelSlim Quantization - -## Overview - -[msModelSlim](https://github.com/Ascend/msmodelslim) is an Ascend-friendly compression tool focused on acceleration, using compression techniques, and built for Ascend hardware. It includes a series of inference optimization technologies such as quantization and compression, aiming to accelerate large language dense models, MoE models, multimodal understanding models, multimodal generation models, etc. - -Once you have a quantized model which is generated by **msModelSlim**, you can use vLLM Omni for inference by specifying the --quantization ascend parameter to enable quantization features. - -### Supported Schemes - -| Scheme | Bits | Status | -|--------|------|--------| -| W8A8 | 8 | ✅ Supported | -| W4A4 | 4 | Planned | - -W8A8 is the first supported scheme. Additional schemes will be added in future releases. - -## Model Quantization - -The following example shows how to generate W8A8 quantized weights for the [Wan2_2 model](https://gitcode.com/Ascend/msmodelslim/blob/master/example/multimodal_sd/Wan2_2/README.md). 
- -**Quantization Script:** - -```bash -msmodelslim quant \ - --model_path /path/to/wan2_2_t2v_float_weights \ - --save_path /path/to/wan2_2_t2v_quantized_weights \ - --device npu \ - --model_type Wan2_2 \ - --config_path /lab_practice/wan2_2/wan2_2_w8a8f8_mxfp_t2v.yaml \ - --trust_remote_code True -``` - -After quantization completes, the output directory will contain the quantized model files. - -For more examples, refer to the [official examples](https://gitcode.com/Ascend/msit/tree/master/msmodelslim/example). - -## Configuration - -1. **CLI**: pass `--quantization ascend`. - -```bash -# Offline inference -python text_to_image.py --model --quantization ascend - -# Online serving -vllm serve --omni --quantization ascend -``` - -## Supported Models - -| Model | HF Models | Recommendation | `ignored_layers` | -|-------|-----------|---------------|------------------| -| HunyuanImage-3.0 | - | All layers | None | - -Currently, quantized HunyuanImage-3.0 weights have not been uploaded to public model platforms such as Hugging Face. You can use a [HunyuanImage-3.0-adapted msModelSlim version](https://gitcode.com/betta18/msmodelslim/tree/hyimage3_mxfp8) to generate the quantized weights manually. We will upload the quantized weights as soon as possible. 
diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index be1602788b7..9cd407d377a 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -12,9 +12,9 @@ vLLM-Omni supports various advanced features for diffusion models: -- Acceleration: **cache methods**, **parallelism methods**, **startup optimizations** +- Acceleration: **cache methods**, **parallelism methods** - Memory optimization: **cpu offloading**, **quantization** -- Extensions: **LoRA inference**, **frame interpolation** +- Extensions: **LoRA inference** - Execution modes: **step execution** ## Supported Features @@ -44,12 +44,6 @@ Parallelism methods distribute computation across GPUs without quality loss (mat | **[HSDP](diffusion/parallelism/hsdp.md)** | Weight sharding via FSDP2, redistributed on-demand at runtime | Very large models (14B+) on limited VRAM, combinable with SP | | **[Expert Parallelism](diffusion/parallelism/expert_parallel.md)** | Shards MoE expert MLP blocks across devices | MoE diffusion models (e.g., HunyuanImage3.0) | -#### Startup Optimization - -| Method | Description | Best For | -|--------|-------------|----------| -| **[Multi-Thread Weight Loading](#multi-thread-weight-loading)** | Loads safetensors shards in parallel using a thread pool | All diffusion models; reduces startup from minutes to seconds | - **Note:** Some acceleration methods can be combined together for optimized performance. See [Feature Compatibility Table](#feature-compatibility) and [Feature Compatibility Tutorial](feature_compatibility.md) for detailed configuration examples. 
### Memory Optimization @@ -69,7 +63,6 @@ Extension methods add specialized capabilities to diffusion models beyond standa | Method | Description | Best For | |--------|-------------|----------| | **[LoRA Inference](diffusion/lora.md)** | Enables inference with Low-Rank Adaptation (LoRA) adapters weights | Reinforcement learning extensions | -| **[Frame Interpolation](diffusion/frame_interpolation.md)** | Inserts intermediate video frames after generation for smoother motion | Video generation pipelines that need higher temporal smoothness | ### Execution Modes @@ -107,55 +100,47 @@ The following tables show which models support each feature: | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| -| **Bagel** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | +| **Bagel** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **FLUX.1-dev** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | -| **FLUX.1-schnell** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.2-klein** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.1-Kontext-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **FLUX.2-dev** | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| **FLUX.2-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | -| **LongCat-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | -| **LongCat-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | -| **MagiHuman** | ❌ | ❌ | ❌ | ❓ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | +| **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **MammothModa2(T2I)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | 
❌ | ❌ | ❌ | ❌ | -| **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | -| **OmniGen2** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | -| **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ✅ | -| **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ✅ | -| **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | -| **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ | ❌ | -| **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | -| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ (decode) | ❌ | ❌ | -| **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ✅ | ❌ | ✅ (decode) | ✅ | ❌ | +| **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **OmniGen2** | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | +| **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ✅ | ❌ | ✅ | ✅ | ❌ | > Notes: > 1. Nextstep_1(T2I) does not support cache acceleration methods such as TeaCache or Cache-DiT. -> 2. `Tongyi-MAI/Z-Image-Turbo` and `SII-GAIR/daVinci-MagiHuman-Base-1080p` are distilled models with minimal NFEs; CFG-Parallel is not necessary. +> 2. `Tongyi-MAI/Z-Image-Turbo` is a distilled model with minimal NFEs; CFG-Parallel is not necessary. 
### VideoGen | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| -| **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (encode/decode) | ❌ | ❌ | -| **Wan2.1-VACE** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | -| **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | -| **LTX-2.3** | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **Wan2.1-VACE** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Helios** | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | -| **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ | -| **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | - -**Frame Interpolation Support** - -- **Supported**: Wan2.2 text-to-video, image-to-video, and TI2V pipelines -- **Not supported**: Wan2.1-VACE, LTX-2, LTX-2.3, Helios, HunyuanVideo-1.5, DreamID-Omni +| **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ### AudioGen | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| -| **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | +| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ## Feature Compatibility @@ -193,59 +178,6 @@ The following tables show which models support each feature: 6. 
Step Execution is not compatible with cache backends (TeaCache, Cache-DiT) or LoRA. -## Multi-Thread Weight Loading - -Large diffusion models can take several minutes to load weights at startup (e.g., ~3 min for Qwen-Image, ~5 min for Wan2.2 I2V 14B). Multi-thread weight loading speeds up this process by loading safetensors shards in parallel using a thread pool instead of sequentially. - -This optimization is **enabled by default** with 4 threads. No configuration is needed for the default behavior. - -### Configuration - -| Parameter | CLI Flag | Default | Description | -|-----------|----------|---------|-------------| -| `enable_multithread_weight_load` | `--disable-multithread-weight-load` | `True` (enabled) | Pass the flag to disable multi-thread loading | -| `num_weight_load_threads` | `--num-weight-load-threads` | `4` | Number of threads for parallel weight loading | - -!!! tip - The default of 4 threads balances speed and disk I/O contention. On fast NVMe storage you may benefit from more threads (e.g., 8). On HDD or network storage, the default of 4 avoids saturating I/O bandwidth. 
- -### Online Serving - -```bash -# Default (multi-thread enabled, 4 threads) -vllm serve Qwen/Qwen-Image --omni --port 8091 - -# Custom thread count -vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers --omni --num-weight-load-threads 8 - -# Disable multi-thread loading -vllm serve Qwen/Qwen-Image --omni --disable-multithread-weight-load -``` - -### Offline Inference - -```python -from vllm_omni import Omni - -# Default (multi-thread enabled, 4 threads) -omni = Omni(model="Qwen/Qwen-Image") - -# Custom thread count -omni = Omni( - model="Wan-AI/Wan2.2-I2V-A14B-Diffusers", - num_weight_load_threads=8, -) -``` - -### Benchmarks - -Measured on NVIDIA H800: - -| Model | Before | After | Speedup | -|-------|--------|-------|---------| -| **Qwen/Qwen-Image** (53.7 GiB) | 168s | 27s | **6.2x** | -| **Wan-AI/Wan2.2-I2V-A14B-Diffusers** (64.5 GiB) | 283s | 56s | **5.1x** | - ## Learn More **Cache Acceleration:** @@ -266,16 +198,11 @@ Measured on NVIDIA H800: **Extensions:** - **[LoRA Inference Guide](diffusion/lora.md)** - Low-Rank Adaptation for style customization and fine-tuning -- **[Frame Interpolation Guide](diffusion/frame_interpolation.md)** - Worker-side post-generation video frame interpolation for smoother motion **Execution Modes:** - **[Step Execution Guide](diffusion/step_execution.md)** - Per-step denoise execution with mid-request abort support -**Startup Optimization:** - -- **[Multi-Thread Weight Loading](#multi-thread-weight-loading)** - Speed up model startup by loading safetensors shards in parallel - **Advanced Topics:** - **[Feature Compatibility](feature_compatibility.md)** - How to combine multiple features for maximum performance diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md index 0d3498b28d9..5f458750b44 100644 --- a/docs/user_guide/examples/offline_inference/bagel.md +++ b/docs/user_guide/examples/offline_inference/bagel.md @@ -2,61 +2,46 @@ Source . 
-## Setup -Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. - -## Architecture +## Set up -BAGEL-7B-MoT is a Mixture-of-Transformers (MoT) model supporting both image generation and understanding. It offers two deployment topologies: +Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. -| Topology | Stages | Description | -| :------- | :----- | :---------- | -| **Two-stage** (default) | Stage 0 (Thinker, AR) + Stage 1 (DiT, Diffusion) | Thinker handles text/understanding via vLLM AR engine; DiT handles image generation. KV cache is transferred between stages. | -| **Single-stage** | Stage 0 (DiT, Diffusion) only | The DiT stage contains a full LLM, ViT, VAE, and tokenizer internally. All modalities are handled within a single diffusion process. | +## Run examples -Both topologies support all four modalities: `text2img`, `img2img`, `img2text`, `text2text`. +**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices. -## Quick Start +Get into the bagel folder ```bash cd examples/offline_inference/bagel - -# Default two-stage mode (auto-detected) -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" - -# Single-stage mode -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" \ - --deploy-config vllm_omni/deploy/bagel_single_stage.yaml ``` -> **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. 
For dual-GPU setups, modify the deploy YAML to distribute stages across devices. +### Modality Control -## Modality Control +BAGEL-7B-MoT supports multiple modality modes. You can control the mode using the `--modality` argument: -Control the mode using the `--modality` argument: +#### Text to Image (text2img) -| Modality | Input | Output | Description | -| :------- | :---- | :----- | :---------- | -| `text2img` | Text | Image | Generate images from text prompts | -| `img2img` | Image + Text | Image | Transform images using text guidance | -| `img2text` | Image + Text | Text | Generate text descriptions from images | -| `text2text` | Text | Text | Pure text generation (language model mode) | +- **Pipeline**: Text → Thinker → DiT → VAE Decode → Image +- **Stages Used**: Stage 0 (Thinker) + Stage 1 (DiT) +- **KV Transfer**: Thinker sends KV cache to DiT for conditioned generation -### Text to Image (text2img) +Generate images from text prompts: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ - --prompts "A cute cat" \ - --steps 50 + --prompts "A cute cat" ``` -### Image to Image (img2img) +#### Image to Image (img2img) + +- **Pipeline**: Image → VAE Encode → DiT → VAE Decode → New Image +- **Stages Used**: Stage 1 (DiT) only +- **Special**: Bypasses the Thinker stage, direct image-to-image transformation + +Transform images based on text prompts: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ @@ -65,7 +50,13 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --prompts "Let the woman wear a blue dress" ``` -### Image to Text (img2text) +#### Image to Text (img2text) + +- **Pipeline**: Image → ViT + VAE Encode → Thinker → Text Output +- **Stages Used**: Stage 0 (Thinker) only +- **Special**: Uses both VAE latent encoding AND ViT semantic encoding for comprehensive image understanding + +Generate text descriptions from images: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ @@ -74,210 +65,205 
@@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --prompts "Describe this image in detail" ``` -### Text to Text (text2text) +#### Text to Text (text2text) + +- **Pipeline**: Text → Thinker → Text Output +- **Stages Used**: Stage 0 (Thinker) only +- **Special**: No visual components involved, operates as pure language model + +Pure text generation: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2text \ --prompts "What is the capital of France?" -# Load prompts from a text file (one prompt per line): +# You can load prompts from a text file (one prompt per line): python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2text \ --txt-prompts /path/to/prompts.txt ``` -## Think Mode - -Think mode enables the model to generate `...` planning/reasoning tokens before producing the final output. This improves generation quality for complex prompts. +### Inference Steps -- **Two-stage**: The Thinker (AR) stage decodes think tokens, then transfers the augmented KV cache to the DiT stage for image generation. -- **Single-stage**: The DiT's internal LLM generates think tokens in-place before proceeding to denoise. 
+Control the number of inference steps for image generation: ```bash -# Think + text2img: plan before generating +# You can adjust steps to 100 to improve image quality python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ - --prompts "A futuristic city with flying cars" \ - --think \ - --max-think-tokens 1000 + --steps 50 \ + --prompts "A cute cat" +``` -# Think + img2img: reason about the edit -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality img2img \ - --image-path /path/to/image.jpg \ - --prompts "Make it look like a watercolor painting" \ - --think +### Key arguments -# Think + img2text: reason before describing -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality img2text \ - --image-path /path/to/image.jpg \ - --prompts "What is happening in this image?" \ - --think +BAGEL-7B-MoT supports **multiple modality modes** for different use cases. -# Think + text2text: chain-of-thought reasoning -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2text \ - --prompts "Solve: 23 * 47" \ - --think -``` +The default yaml configuration deploys Thinker and DiT on the same GPU. You can use the default configuration file: [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml) -Think mode parameters: +#### 📌 Command Line Arguments (end2end.py) -| Argument | Default | Description | -| :------- | :------ | :---------- | -| `--think` | `False` | Enable thinking mode | -| `--max-think-tokens` | `1000` | Maximum tokens for think generation | -| `--do-sample` | `False` | Enable sampling (vs. 
greedy) for text generation | -| `--text-temperature` | `0.3` | Temperature for text generation sampling | +| Argument | Type | Default | Description | +| :--------------------- | :----- | :---------------------------- | :----------------------------------------------------------- | +| `--model` | string | `ByteDance-Seed/BAGEL-7B-MoT` | Model path or name | +| `--modality` | choice | `text2img` | Modality mode: `text2img`, `img2img`, `img2text`, `text2text` | +| `--prompts` | list | `None` | Input text prompts directly | +| `--txt-prompts` | string | `None` | Path to txt file with one prompt per line | +| `--image-path` | string | `None` | Input image path (for `img2img`/`img2text`) | +| `--steps` | int | `50` | Number of inference steps | +| `--stage-configs-path` | string | `None` | Custom stage config file path | +| `--worker-backend` | choice | `process` | Worker backend: `process` or `ray` | +| `--ray-address` | string | `None` | Ray cluster address | +| `--enable-stats` | flag | `False` | Enable statistics logging | +| `--init-sleep-seconds` | int | `20` | Initialization sleep time | +| `--batch-timeout` | int | `5` | Batch timeout | +| `--init-timeout` | int | `300` | Initialization timeout | -## Classifier-Free Guidance (CFG) +------ -CFG controls the trade-off between prompt fidelity and diversity. These parameters apply to image generation modalities (`text2img`, `img2img`). 
+#### ⚙️ Stage Configuration Parameters (bagel.yaml) -```bash -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A photorealistic portrait" \ - --cfg-text-scale 6.0 \ - --cfg-img-scale 2.0 \ - --negative-prompt "blurry, low quality, distorted" \ - --cfg-interval 0.4 1.0 \ - --cfg-renorm-type global \ - --cfg-renorm-min 0.0 -``` + **Stage 0 - Thinker (LLM Stage)** -| Argument | Default | Description | -| :------- | :------ | :---------- | -| `--cfg-text-scale` | `4.0` | Text CFG scale (higher = more prompt-adherent) | -| `--cfg-img-scale` | `1.5` | Image CFG scale (for img2img) | -| `--negative-prompt` | `None` | Negative prompt for CFG conditioning | -| `--cfg-interval` | pipeline default | CFG active interval `[start, end]` as fractions of total timesteps | -| `--cfg-renorm-type` | `None` | Renormalization type: `global`, `text_channel`, `channel` | -| `--cfg-renorm-min` | `None` | Minimum renormalization value | -| `--cfg-parallel-size` | `1` | CFG parallel size: `1` = batched (single GPU), `2` = 2-branch parallel, `3` = full 3-GPU parallel | +| Parameter | Value | Description | +| :------------------------------- | :------------------------------ | :----------------------- | +| `stage_type` | `llm` | Stage type | +| `devices` | `"0"` | GPU device ID | +| `max_num_seqs` | `1` | Maximum batch size | +| `model_stage` | `thinker` | Model stage identifier | +| `model_arch` | `BagelForConditionalGeneration` | Model architecture | +| `gpu_memory_utilization` | `0.4` | GPU memory utilization | +| `tensor_parallel_size` | `1` | Tensor parallel size | +| `max_num_batched_tokens` | `32768` | Maximum batched tokens | +| `omni_kv_config.need_send_cache` | `true` | Whether to send KV cache | -## Deployment Topologies +------ -### Two-Stage (Default) +**Stage 1 - DiT (Diffusion Stage)** -The default topology auto-detected from the model. No extra flags needed. 
+| Parameter | Value | Description | +| :------------------------------- | :---------- | :-------------------------- | +| `stage_type` | `diffusion` | Stage type | +| `devices` | `"0"` | GPU device ID | +| `max_num_seqs` | `1` | Maximum batch size | +| `model_stage` | `dit` | Model stage identifier | +| `gpu_memory_utilization` | `0.4` | GPU memory utilization | +| `omni_kv_config.need_recv_cache` | `true` | Whether to receive KV cache | +| `engine_input_source` | `[0]` | Input source from Stage 0 | -```bash -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" -``` +------ -The pipeline is defined in [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml). Stage 0 (Thinker) and Stage 1 (DiT) share GPU 0 by default. For dual-GPU setups, customize the deploy YAML and set `devices: "1"` for stage 1. +#### Tensor Parallelism (TP) -### Single-Stage +For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) by modifying the stage configuration (e.g., [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)). -Pass the single-stage deploy config via `--deploy-config`: +1. **Set `tensor_parallel_size`**: Increase this value (e.g., to `2` or `4`). +2. **Set `devices`**: Specify the comma-separated GPU IDs to be used for the stage (e.g., `"0,1"`). -```bash -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" \ - --deploy-config vllm_omni/deploy/bagel_single_stage.yaml +Example configuration for TP=2 on GPUs 0 and 1: +```yaml + engine_args: + tensor_parallel_size: 2 + ... + runtime: + devices: "0,1" ``` -See [`bagel_single_stage.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel_single_stage.yaml) for configuration details. 
The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. +------ -### Tensor Parallelism (TP) +#### 🔗 Runtime Configuration -For larger models or multi-GPU environments, customize the deploy YAML (see [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml)) and set per-stage `tensor_parallel_size` and `devices`: +| Parameter | Value | Description | +| :-------------------- | :------ | :------------------------------- | +| `window_size` | `-1` | Window size (-1 means unlimited) | +| `max_inflight` | `1` | Maximum inflight requests | +| `shm_threshold_bytes` | `65536` | Shared memory threshold (64KB) | -```yaml -# Example: TP=2 on GPUs 0,1 for the Thinker stage -stages: - - stage_id: 0 - tensor_parallel_size: 2 - devices: "0,1" +## Using Mooncake Connector + +[Mooncake](https://github.com/kvcache-ai/Mooncake) is a high-performance distributed KV cache transfer engine that enables efficient cross-node data movement via TCP or RDMA, making it ideal for multi-node disaggregated inference. + +By default, BAGEL uses `SharedMemoryConnector` for inter-stage communication. You can switch to the Mooncake connector for better performance on multi-GPU setups and to enable multi-node deployment. + +### Prerequisites + +Install the Mooncake transfer engine: + +```bash +# For CUDA-enabled systems (recommended) +pip install mooncake-transfer-engine + +# For non-CUDA systems +pip install mooncake-transfer-engine-non-cuda ``` -Then pass the custom deploy YAML: +### Step 1: Start the Mooncake Master + +On the **primary node**, start the Mooncake master service (run in a separate terminal or background with `&`): ```bash -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" \ - --deploy-config /path/to/custom_bagel.yaml +# Optional: enable disk-backed storage by creating a directory and passing --root_fs_dir. 
+# Without it, Mooncake runs in memory-only mode, which is sufficient for KV cache transfer. +mkdir -p ./mc_storage + +mooncake_master \ + --rpc_port=50051 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=8080 \ + --metrics_port=9003 \ + --root_fs_dir=./mc_storage/ \ + --cluster_id=mc-local-1 & ``` -### FP8 Quantization +### Step 2: Run Offline Inference with Mooncake + +Use the provided Mooncake stage config [`bagel_multiconnector.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml). Before launching, update the `metadata_server` and `master` addresses in the YAML to match your Mooncake master node's IP (use `127.0.0.1` for single-node testing). ```bash +cd examples/offline_inference/bagel + +# Text to Image with Mooncake python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --quantization fp8 + --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml + +# Image to Text with Mooncake +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality img2text \ + --image-path /path/to/image.jpg \ + --prompts "Describe this image" \ + --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml + +# Text to Text with Mooncake +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2text \ + --prompts "What is the capital of France?" 
\ + --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml ``` -## Command Line Reference - -### Core Arguments - -| Argument | Type | Default | Description | -| :------- | :--- | :------ | :---------- | -| `--model` | string | `ByteDance-Seed/BAGEL-7B-MoT` | Model path or HuggingFace name | -| `--modality` | choice | `text2img` | `text2img`, `img2img`, `img2text`, `text2text` | -| `--prompts` | list | `None` | Input text prompts | -| `--txt-prompts` | string | `None` | Path to text file with one prompt per line | -| `--image-path` | string | `None` | Input image path (required for `img2img`/`img2text`) | -| `--output` | string | `.` | Output directory for saved images | -| `--steps` | int | `50` | Number of diffusion inference steps | -| `--seed` | int | `None` | Random seed for reproducibility | - -### Think Mode Arguments - -| Argument | Type | Default | Description | -| :------- | :--- | :------ | :---------- | -| `--think` | flag | `False` | Enable `...` planning/reasoning | -| `--max-think-tokens` | int | `1000` | Maximum tokens for think generation | -| `--do-sample` | flag | `False` | Use sampling instead of greedy decoding | -| `--text-temperature` | float | `0.3` | Sampling temperature for text generation | - -### CFG Arguments - -| Argument | Type | Default | Description | -| :------- | :--- | :------ | :---------- | -| `--cfg-text-scale` | float | `4.0` | Text CFG guidance scale | -| `--cfg-img-scale` | float | `1.5` | Image CFG guidance scale | -| `--negative-prompt` | string | `None` | Negative prompt for CFG | -| `--cfg-parallel-size` | int | `1` | CFG parallel GPU count (1, 2, or 3) | -| `--cfg-interval` | float[2] | pipeline default | CFG active window `[start, end]` | -| `--cfg-renorm-type` | string | `None` | `global`, `text_channel`, or `channel` | -| `--cfg-renorm-min` | float | `None` | Minimum renormalization value | - -### Engine Arguments - -| Argument | Type | Default | Description | -| :------- | 
:--- | :------ | :---------- | -| `--deploy-config` | string | `None` | Path to deploy YAML (auto-detected if omitted) | -| `--stage-configs-path` | string | `None` | [Deprecated] Legacy path to `stage_args` YAML; prefer `--deploy-config` | -| `--worker-backend` | choice | `process` | `process` or `ray` | -| `--ray-address` | string | `None` | Ray cluster address | -| `--quantization` | string | `None` | Quantization method (e.g. `fp8`) | -| `--log-stats` | flag | `False` | Enable statistics logging | -| `--init-timeout` | int | `300` | Initialization timeout (seconds) | -| `--batch-timeout` | int | `5` | Batch timeout (seconds) | -| `--enable-diffusion-pipeline-profiler` | flag | `False` | Profile diffusion stage durations | +For more details on the Mooncake connector and multi-node setup, see the [Mooncake Store Connector documentation](https://github.com/vllm-project/vllm-omni/tree/main/docs/design/feature/omni_connectors/mooncake_store_connector.md). + +------ ## FAQ -- If you encounter OOM errors, try decreasing `max_model_len` or `gpu_memory_utilization` in the deploy YAML. +- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. -**Two-stage VRAM usage:** +```bash +sudo apt update +sudo apt install ffmpeg +``` -| Stage | VRAM | -| :---- | :--- | -| Stage 0 (Thinker) | **15.04 GiB + KV Cache** | -| Stage 1 (DiT) | **26.50 GiB** | -| Total | **~42 GiB + KV Cache** | +- If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. -**Single-stage VRAM usage:** The DiT loads the full model (~42 GiB) in one process. 
+| Stage | VRAM | +| :------------------ | :--------------------------- | +| Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** | +| Stage-1 (DiT) | **26.50 GiB** | +| Total | **~42 GiB + KV Cache** | ## Example materials diff --git a/docs/user_guide/examples/offline_inference/cosyvoice3.md b/docs/user_guide/examples/offline_inference/cosyvoice3.md index d0638f4140f..d912f1c62eb 100644 --- a/docs/user_guide/examples/offline_inference/cosyvoice3.md +++ b/docs/user_guide/examples/offline_inference/cosyvoice3.md @@ -10,7 +10,7 @@ Install dependencies: uv pip install -e . ``` -> **Note:** This includes required libraries such as `soundfile`, +> **Note:** This includes required libraries such as `librosa`, `soundfile`, > `onnxruntime`, `x-transformers`, and `einops` via > `requirements/common.txt` and platform-specific requirements files. @@ -61,17 +61,10 @@ Key components live in `vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py - Stage 0 uses `CosyVoice3LM` and outputs speech tokens + conditioning features. - Stage 1 runs the flow model (DiT-based CFM) and HiFiGAN to synthesize waveform. -Pipeline topology lives in `vllm_omni/model_executor/models/cosyvoice3/pipeline.py`; -runtime tunables (batch size, memory limits, sampling) live in -`vllm_omni/deploy/cosyvoice3.yaml`. The deploy config auto-loads by -HF `model_type` and defaults to `async_chunk: true` (shared-memory -streaming). Pass `--no-async-chunk` on `vllm serve` to switch to the -legacy sync path where stage 1 runs `text2flow` over the full -speech-token sequence. +Stage wiring is configured in `vllm_omni/model_executor/stage_configs/cosyvoice3.yaml`. - Stage 0 emits latent speech tokens. -- Stage 1 consumes them via `sync_process_input_func` (sync mode) or the - shared-memory connector (async-chunk mode) and outputs audio. +- Stage 1 consumes them via `custom_process_input_func` and outputs audio. 
## Example materials diff --git a/docs/user_guide/examples/offline_inference/glm_image.md b/docs/user_guide/examples/offline_inference/glm_image.md index c6ac6e33ffd..4519e26fda6 100644 --- a/docs/user_guide/examples/offline_inference/glm_image.md +++ b/docs/user_guide/examples/offline_inference/glm_image.md @@ -1,87 +1,154 @@ -# GLM-Image Offline Inference +# GLM-Image Multistage End-to-End Inference -GLM-Image is a 2-stage image generation model (AR + Diffusion) supported by vLLM-Omni's -declarative config system. The pipeline topology and stage structure are declared in -`vllm_omni/model_executor/models/glm_image/pipeline.py`; deployment knobs live in -`vllm_omni/deploy/glm_image.yaml`. +Source . + + +This example demonstrates how to run GLM-Image with the vLLM-Omni multistage architecture. ## Architecture +GLM-Image uses a 2-stage pipeline: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ GLM-Image Pipeline │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ Stage 0 (AR Model) Stage 1 (Diffusion) │ +│ ┌─────────────────┐ ┌─────────────────────┐ │ +│ │ vLLM-optimized │ │ GlmImagePipeline │ │ +│ │ GlmImageFor │ prior │ ┌───────────────┐ │ │ +│ │ Conditional │──tokens───►│ │ DiT Denoiser │ │ │ +│ │ Generation │ │ └───────────────┘ │ │ +│ │ (9B AR model) │ │ │ │ │ +│ └─────────────────┘ │ ▼ │ │ +│ ▲ │ ┌───────────────┐ │ │ +│ │ │ │ VAE Decode │──┼──► Image +│ Text/Image │ └───────────────┘ │ │ +│ Input └─────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Features + +- **vLLM-optimized AR**: Uses PagedAttention and tensor parallelism for faster prior token generation +- **Flexible deployment**: AR and Diffusion stages can run on different GPUs +- **Text-to-Image**: Generate images from text descriptions +- **Image-to-Image**: Edit existing images with text prompts + +## Usage + +### Text-to-Image + +```bash +python end2end.py \ + --config-path 
../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "A beautiful sunset over the ocean with sailing boats" \ + --height 1024 \ + --width 1024 \ + --output output_t2i.png ``` -Stage 0 (AR Model) Stage 1 (Diffusion) -┌───────────────────┐ ┌─────────────────────┐ -│ vLLM-optimized │ prior │ GlmImagePipeline │ -│ GlmImageFor │──tokens──►│ ┌───────────────┐ │ -│ Conditional │ │ │ DiT Denoiser │ │ -│ Generation │ │ └───────┬───────┘ │ -│ (9B AR model) │ │ ▼ │ -└───────────────────┘ │ ┌───────────────┐ │ - ▲ │ │ VAE Decode │──┼──► Image - │ │ └───────────────┘ │ - Text / Image └─────────────────────┘ - Input + +### Image-to-Image (Image Editing) + +```bash +python end2end.py \ + --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "Transform this scene into a winter wonderland" \ + --image input.png \ + --output output_i2i.png ``` -## Text-to-Image - -```python -from vllm_omni.entrypoints.omni import Omni - -if __name__ == "__main__": - omni = Omni(model="zai-org/GLM-Image") - outputs = omni.generate( - "A photorealistic mountain landscape at sunset", - sampling_params={ - "height": 1024, - "width": 1024, - "num_inference_steps": 50, - "guidance_scale": 1.5, - "seed": 42, - }, - ) - outputs[0].request_output.images[0].save("output.png") +### With Custom Parameters + +```bash +python end2end.py \ + --model-path /path/to/glm-image \ + --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "A photorealistic cat sitting on a window sill" \ + --height 1024 \ + --width 1024 \ + --num-inference-steps 50 \ + --guidance-scale 1.5 \ + --seed 42 \ + --output output.png ``` -## Image-to-Image (Image Editing) - -```python -from vllm_omni.entrypoints.omni import Omni - -if __name__ == "__main__": - omni = Omni(model="zai-org/GLM-Image") - outputs = omni.generate( - { - "prompt": "Convert this image to watercolor style", - "multi_modal_data": { - "image": "input.png", - }, - }, - 
sampling_params={ - "height": 1024, - "width": 1024, - "num_inference_steps": 50, - "guidance_scale": 1.5, - "seed": 42, - }, - ) - outputs[0].request_output.images[0].save("output.png") +## Shell Scripts + +### Run Text-to-Image + +```bash +./run_t2i.sh +``` + +### Run Image-to-Image + +```bash +./run_i2i.sh --image /path/to/input.png +``` + +## Stage Configuration + +The stage config (`glm_image.yaml`) defines: + +- **Stage 0 (AR)**: Uses `GPUARWorker` with vLLM engine + + - Model: `GlmImageForConditionalGeneration` + - Output: `token_ids` (prior tokens) + +- **Stage 1 (Diffusion)**: Uses diffusion engine + - Model: `GlmImagePipeline` + - Output: Generated image + +See `vllm_omni/model_executor/stage_configs/glm_image.yaml` for full configuration. + +## Comparison with Single-Stage + +| Aspect | Single-Stage (transformers) | Multistage (vLLM) | +| ----------- | --------------------------- | ------------------- | +| AR Model | transformers native | vLLM PagedAttention | +| Memory | Higher (no KV cache opt) | Lower (optimized) | +| Throughput | Lower | Higher | +| Flexibility | Single GPU | Multi-GPU support | + +## Troubleshooting + +### OOM Error + +Try reducing memory usage: + +```bash +# In glm_image.yaml, adjust: +gpu_memory_utilization: 0.5 # Reduce from 0.6 +``` + +### Slow Initialization + +The first run loads model weights. 
Subsequent runs are faster: + +```bash +--stage-init-timeout 900 # Increase timeout for slow storage ``` -## Generation Parameters +## Requirements -| Parameter | Type | Default | Description | -| --------------------- | ----- | ------- | ----------------------------------- | -| `height` | int | 1024 | Image height in pixels | -| `width` | int | 1024 | Image width in pixels | -| `num_inference_steps` | int | 50 | Number of diffusion denoising steps | -| `guidance_scale` | float | 1.5 | Classifier-free guidance scale | -| `seed` | int | None | Optional random seed | -| `negative_prompt` | str | None | Negative prompt | +- vLLM-Omni with GLM-Image support +- CUDA-capable GPU (recommended: H100/A100 with 80GB) +- GLM-Image model weights -## VRAM Requirements +## Example materials -| Stage | VRAM | -| :---------------- | :--------------------- | -| Stage-0 (AR) | **~18 GiB + KV Cache** | -| Stage-1 (DiT+VAE) | **~20 GiB** | -| Total | **~38 GiB + KV Cache** | +??? abstract "end2end.py" + ``````py + --8<-- "examples/offline_inference/glm_image/end2end.py" + `````` +??? abstract "run_i2i.sh" + ``````sh + --8<-- "examples/offline_inference/glm_image/run_i2i.sh" + `````` +??? abstract "run_t2i.sh" + ``````sh + --8<-- "examples/offline_inference/glm_image/run_t2i.sh" + `````` diff --git a/docs/user_guide/examples/offline_inference/image_to_video.md b/docs/user_guide/examples/offline_inference/image_to_video.md index 6e105741a7e..7a750aeff3b 100644 --- a/docs/user_guide/examples/offline_inference/image_to_video.md +++ b/docs/user_guide/examples/offline_inference/image_to_video.md @@ -62,13 +62,12 @@ Key arguments: - `--negative-prompt`: Optional list of artifacts to suppress. - `--boundary-ratio`: Boundary split ratio for two-stage MoE models. - `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p). -- `--sample-solver`: Wan2.2 sampling solver. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. 
- `--num-inference-steps`: Number of denoising steps (default 50). - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video). - `--output`: Path to save the generated video. - `--vae-use-slicing`: Enable VAE slicing for memory optimization. - `--vae-use-tiling`: Enable VAE tiling for memory optimization. -- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism/cfg_parallel.md). +- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel). - `--tensor-parallel-size`: tensor parallel size (effective for models that support TP, e.g. LTX2). - `--enable-cpu-offload`: enable CPU offloading for diffusion models. - `--use-hsdp`: Enable Hybrid Sharded Data Parallel to shard model weights across GPUs. @@ -79,9 +78,6 @@ Key arguments: > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. -For Wan2.2 LightX2V-converted local Diffusers directories and related LoRA -assets, see the [LoRA guide](../../diffusion/lora.md#wan22-lightx2v-offline-assembly). - ## Example materials ??? 
abstract "image_to_video.py" diff --git a/docs/user_guide/examples/offline_inference/mimo_audio.md b/docs/user_guide/examples/offline_inference/mimo_audio.md index 1cba2f77dcf..1a3be15d69a 100644 --- a/docs/user_guide/examples/offline_inference/mimo_audio.md +++ b/docs/user_guide/examples/offline_inference/mimo_audio.md @@ -38,6 +38,7 @@ Run a single sample for basic TTS: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft ``` @@ -46,6 +47,7 @@ Run batch samples for basic TTS: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft \ --num-prompts {batch_size} @@ -63,6 +65,7 @@ Generate speech from text input: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft \ --text "The weather is so nice today." @@ -74,6 +77,7 @@ Generate speech with explicit voice style instructions: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft_with_instruct \ --text "The weather is so nice today." \ @@ -86,6 +90,7 @@ Generate speech using an audio reference for voice cloning: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft_with_audio \ --text "The weather is so nice today." 
\ @@ -98,6 +103,7 @@ Generate speech from text containing natural voice descriptions: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft_with_natural_instruction \ --text "In a panting young male voice, he said: I can't run anymore, wait for me!" @@ -109,6 +115,7 @@ Transcribe audio to text: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type audio_trancribing_sft \ --audio-path "./spoken_dialogue_assistant_turn_1.wav" @@ -120,6 +127,7 @@ Understand and analyze audio content with text queries: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type audio_understanding_sft \ --text "Summarize the audio." \ @@ -132,6 +140,7 @@ Audio understanding with reasoning chain: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type audio_understanding_sft_with_thinking \ --text "Summarize the audio." 
\ @@ -144,6 +153,7 @@ Multi-turn dialogue with audio input and output: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type spoken_dialogue_sft_multiturn \ --audio-path "./prompt_speech_zh_m.wav" @@ -157,6 +167,7 @@ Multi-turn dialogue converting speech to text: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type speech2text_dialogue_sft_multiturn ``` @@ -169,6 +180,7 @@ Multi-turn text-only dialogue: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type text_dialogue_sft_multiturn ``` @@ -177,6 +189,29 @@ Note: This task uses hardcoded message lists in the script. ## Troubleshooting +### Audio dependencies (soundfile, librosa) + +This example depends on **soundfile** (read/write WAV) and **librosa** (load audio including MP3). Install the project requirements first: + +```bash +pip install -r requirements/common.txt +# or at least: pip install soundfile>=0.13.1 librosa>=0.11.0 +``` + +- **`soundfile` / libsndfile not found** + `soundfile` uses the C library **libsndfile**. On Linux, install the system package before pip: + - Debian/Ubuntu: `sudo apt-get install libsndfile1` + - For development builds: `sudo apt-get install libsndfile1-dev` + - Then: `pip install soundfile` + +- **`librosa` fails to load MP3 or reports "No backend available"** + Loading MP3 (e.g. in `spoken_dialogue_sft_multiturn` with `.mp3` files) uses **ffmpeg** as the backend. Install ffmpeg: + - Debian/Ubuntu: `sudo apt-get install ffmpeg` + - macOS: `brew install ffmpeg` + +- **`ImportError: No module named 'soundfile'` or `ModuleNotFoundError: ... 
librosa`** + Ensure you are in the same Python environment where vLLM Omni and the example dependencies are installed, and that `requirements/common.txt` (or the packages above) are installed. + ### Tokenizer path - **`MIMO_AUDIO_TOKENIZER_PATH` not set or model fails to find tokenizer** diff --git a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md index c54976b540d..07a56cf9a06 100644 --- a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md +++ b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md @@ -64,6 +64,14 @@ If media file paths are not provided, the script will use default assets. Suppor - `use_audio_in_video`: Extract audio from video - `text`: Text-only query +### FAQ + +If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. +``` +sudo apt update +sudo apt install ffmpeg +``` + ## Example materials ??? abstract "end2end.py" diff --git a/docs/user_guide/examples/offline_inference/qwen3_omni.md b/docs/user_guide/examples/offline_inference/qwen3_omni.md index 2d856f7380a..6577092bbfe 100644 --- a/docs/user_guide/examples/offline_inference/qwen3_omni.md +++ b/docs/user_guide/examples/offline_inference/qwen3_omni.md @@ -112,6 +112,14 @@ python end2end_async_chunk.py \ > async_chunk example when you need the stage-level concurrency semantics > described in PR #962 / #1151. +### FAQ + +If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. +``` +sudo apt update +sudo apt install ffmpeg +``` + ## Example materials ???
abstract "end2end.py" diff --git a/docs/user_guide/examples/offline_inference/qwen3_tts.md b/docs/user_guide/examples/offline_inference/qwen3_tts.md index 7226ac1fe4b..19fea4132ce 100644 --- a/docs/user_guide/examples/offline_inference/qwen3_tts.md +++ b/docs/user_guide/examples/offline_inference/qwen3_tts.md @@ -18,11 +18,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro ### ROCm Dependencies -You will need to install the dependency `onnxruntime-rocm`. +You will need to install these two dependencies `onnxruntime-rocm` and `sox`. ``` pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm -pip install onnxruntime-rocm +pip install onnxruntime-rocm sox ``` ## Quick Start @@ -144,13 +144,13 @@ completes. This demonstrates that audio data is available progressively rather t ## Batched Decoding -The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, set `max_num_seqs > 1` on both stages via `--stage-overrides` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. +The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, provide a stage config with `max_num_seqs > 1` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. ``` python end2end.py --query-type CustomVoice \ --txt-prompts benchmark_prompts.txt \ --batch-size 4 \ - --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2},"1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml ``` **Important:** `--batch-size` must match a CUDA graph capture size (1, 2, 4, 8, 16...) because the Talker's code predictor KV cache is sized to `max_num_seqs`, and CUDA graphs pad the batch to the next capture size. 
Both stages need `max_num_seqs >= batch_size` in the stage config for batching to take effect. If only stage 1 has a higher `max_num_seqs`, it won't help — stage 1 can only batch chunks from requests that are in-flight simultaneously, which requires stage 0 to also process multiple requests concurrently. diff --git a/docs/user_guide/examples/offline_inference/text_to_audio.md b/docs/user_guide/examples/offline_inference/text_to_audio.md index a31a4d7a4d5..62a70e5254d 100644 --- a/docs/user_guide/examples/offline_inference/text_to_audio.md +++ b/docs/user_guide/examples/offline_inference/text_to_audio.md @@ -29,22 +29,6 @@ python text_to_audio.py \ --output stable_audio_output.wav ``` -To reduce per-GPU memory for multi-GPU inference, launch with HSDP: - -```bash -python text_to_audio.py \ - --model stabilityai/stable-audio-open-1.0 \ - --prompt "The sound of a hammer hitting a wooden surface" \ - --negative-prompt "Low quality" \ - --seed 42 \ - --guidance-scale 7.0 \ - --audio-length 10.0 \ - --num-inference-steps 100 \ - --use-hsdp \ - --hsdp-shard-size 2 \ - --output stable_audio_output.wav -``` - Key arguments: - `--prompt`: text description (string). @@ -53,9 +37,6 @@ Key arguments: - `--guidance-scale`: classifier-free guidance scale. - `--audio-length`: audio duration in seconds. - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower). -- `--use-hsdp`: enable HSDP weight sharding for the Stable Audio DiT. -- `--hsdp-shard-size`: number of GPUs used for HSDP sharding. -- `--hsdp-replicate-size`: number of HSDP replica groups. - `--output`: path to save the generated WAV file. 
## Example materials diff --git a/docs/user_guide/examples/offline_inference/text_to_video.md b/docs/user_guide/examples/offline_inference/text_to_video.md index a09dbfc979f..4288c089c60 100644 --- a/docs/user_guide/examples/offline_inference/text_to_video.md +++ b/docs/user_guide/examples/offline_inference/text_to_video.md @@ -5,8 +5,6 @@ Source : In the frame, a woman with black long hair is identified as .\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n maintains eye contact, swallows as if choosing her words, and says, I keep telling myself I’m fine,but some nights it feels like I’m just performing calm." \ - --image-path 9.png \ - --audio-path 9.wav \ - --video-negative-prompt "jitter, bad hands, blur, distortion" \ - --audio-negative-prompt "robotic, muffled, echo, distorted" \ - --cfg-parallel-size 2 \ - --num-inference-steps 45 \ - --height 704 \ - --width 1280 \ - --output out_dreamid_omni_oneip.mp4 -``` - - Key arguments: - `--prompt`: text description (string). - `--model`: path to the model local directory. diff --git a/docs/user_guide/examples/online_serving/bagel.md b/docs/user_guide/examples/online_serving/bagel.md index aa8b33de802..4a6094c0894 100644 --- a/docs/user_guide/examples/online_serving/bagel.md +++ b/docs/user_guide/examples/online_serving/bagel.md @@ -2,112 +2,147 @@ Source . 
-## Installation -Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md) - -## Architecture - -BAGEL-7B-MoT is a Mixture-of-Transformers (MoT) model supporting both image generation and understanding. It offers two deployment topologies: +## 🛠️ Installation -| Topology | Stages | Description | -| :------- | :----- | :---------- | -| **Two-stage** (default) | Stage 0 (Thinker, AR) + Stage 1 (DiT, Diffusion) | Thinker handles text/understanding via vLLM AR engine; DiT handles image generation. KV cache is transferred between stages. | -| **Single-stage** | Stage 0 (DiT, Diffusion) only | The DiT stage contains a full LLM, ViT, VAE, and tokenizer internally. All modalities are handled within a single diffusion process. | - -Both topologies support all four modalities: `text2img`, `img2img`, `img2text`, `text2text`. - -> **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. +Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md) -## Launch the Server +## Run examples (BAGEL-7B-MoT) -### Two-Stage (Default) +**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices. -The default pipeline is auto-detected from the model. 
No extra flags needed: +### Launch the Server ```bash +# Use default configuration vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 ``` Or use the convenience script: ```bash -cd examples/online_serving/bagel +cd /workspace/vllm-omni/examples/online_serving/bagel bash run_server.sh +``` -# Launch a single stage per terminal -bash run_server_stage_cli.sh --stage 0 -bash run_server_stage_cli.sh --stage 1 +```bash +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/stage_configs_file ``` -To use a custom deploy YAML (note: `--stage-configs-path` is deprecated in favor of `--deploy-config`): +#### 🚀 Tensor Parallelism (TP) + +For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) for the server. + +1. **Modify Stage Config**: Create or modify a stage configuration yaml (e.g., [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)). Set `tensor_parallel_size` to `2` (or more) and update `devices` to include multiple GPU IDs (e.g., `"0,1"`). +```yaml + engine_args: + tensor_parallel_size: 2 + ... + runtime: + devices: "0,1" +``` + +2. **Launch Server**: ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ - --deploy-config /path/to/deploy_config.yaml +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/your/custom_bagel.yaml ``` -See [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml) for the default two-stage deploy configuration. +#### Using Mooncake Connector -### Single-Stage +By default, BAGEL uses `SharedMemoryConnector` for inter-stage communication. You can use the [Mooncake](https://github.com/kvcache-ai/Mooncake) connector to transfer KV cache between stages, which also enables multi-node deployment. 
-The DiT stage contains a full LLM, ViT, VAE, and tokenizer, so it can handle all modalities (text2img, img2img, img2text, text2text, think) without a separate Thinker stage: +**1. Install Mooncake** ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ - --deploy-config vllm_omni/deploy/bagel_single_stage.yaml +# For CUDA-enabled systems (recommended) +pip install mooncake-transfer-engine + +# For non-CUDA systems +pip install mooncake-transfer-engine-non-cuda ``` -See [`bagel_single_stage.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel_single_stage.yaml) for configuration. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. +**2. Start Mooncake Master** on the primary node: -### Tensor Parallelism (TP) +```bash +# Optional: enable disk-backed storage by creating a directory and passing --root_fs_dir. +# Without it, Mooncake runs in memory-only mode, which is sufficient for KV cache transfer. +mkdir -p ./mc_storage + +mooncake_master \ + --rpc_port=50051 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=8080 \ + --metrics_port=9003 \ + --root_fs_dir=./mc_storage/ \ + --cluster_id=mc-local-1 & +``` -For larger models or multi-GPU environments, enable TP via CLI: +**3. Launch the server** with the Mooncake stage config: ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --tensor-parallel-size 2 +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml ``` -Or set `tensor_parallel_size` per stage in a custom deploy YAML. +> **Note**: Before launching, edit [`bagel_multiconnector.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml) and replace the `metadata_server` and `master` addresses with your Mooncake master node's actual IP. 
For single-node testing, `127.0.0.1` works. + +The client-side usage is identical to the default setup -- the Mooncake connector is transparent to the API. See the requests section below. + +For more details on the Mooncake connector configuration, see the [Mooncake Store Connector documentation](https://github.com/vllm-project/vllm-omni/tree/main/docs/design/feature/omni_connectors/mooncake_store_connector.md). -### Multi-Node Deployment +#### Multi-Node Deployment -Deploy each stage on a **separate node** for better resource utilization. Replace `` with the actual IP address of your orchestrator node. +You can deploy each stage on a **separate node** for better resource utilization. In this example, the orchestrator (Stage 0 / Thinker) and Stage 1 (DiT) run on different machines, connected via Mooncake. -**1. Launch Stage 0 (Thinker / Orchestrator)** on the orchestrator node: +Replace `` below with the actual IP address of your orchestrator node (e.g., `10.244.227.244`). + +> [!WARNING] +> **Before launching**, edit [`bagel_multiconnector.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml) and replace the `metadata_server` and `master` addresses with your Mooncake master node's actual IP. Mismatched addresses will cause silent connection failures. + +**1. Start Mooncake Master** (on the orchestrator node): + +```bash +mooncake_master \ + --rpc_port=50051 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host= \ + --http_metadata_server_port=8080 \ + --metrics_port=9003 +``` + +**2. 
Launch Stage 0 (Thinker / Orchestrator)** on the orchestrator node: ```bash -# API server port for client requests: 8000 vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ - --port 8000 \ + --port 8000 \ # API server port for client requests + --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml \ --stage-id 0 \ - --omni-master-address \ - --omni-master-port 8091 + -oma \ + -omp 8091 ``` -**2. Launch Stage 1 (DiT)** on the remote node in headless mode: +**3. Launch Stage 1 (DiT)** on the remote node in headless mode: ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ + --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml \ --stage-id 1 \ --headless \ - --omni-master-address \ - --omni-master-port 8091 + -oma \ + -omp 8091 ``` -Or use the convenience script: - -```bash -# Terminal 1: Stage 0 -bash run_server_stage_cli.sh --stage 0 +**Mooncake Master arguments:** -# Terminal 2: Stage 1 -bash run_server_stage_cli.sh --stage 1 - -# With extra args -bash run_server_stage_cli.sh --stage 0 -- --tensor-parallel-size 2 -bash run_server_stage_cli.sh --stage 1 -- --gpu-memory-utilization 0.9 -``` +| Argument | Description | +| :------- | :---------- | +| `--rpc_port` | Mooncake RPC port for control-plane coordination between stages | +| `--enable_http_metadata_server` | Enable the HTTP metadata server for service discovery | +| `--http_metadata_server_host` | IP address to bind the metadata server (use the orchestrator node's IP) | +| `--http_metadata_server_port` | Port for the HTTP metadata server | +| `--metrics_port` | Port for Prometheus-compatible metrics endpoint | **vllm serve arguments:** @@ -115,31 +150,85 @@ bash run_server_stage_cli.sh --stage 1 -- --gpu-memory-utilization 0.9 | :------- | :---------- | | `--stage-id` | Which stage this process runs (0 = Thinker, 1 = DiT) | | `--headless` | Run without the API server (worker-only mode) | -| `-oma` / `--omni-master-address` | Orchestrator master 
address | -| `-omp` / `--omni-master-port` | Orchestrator master port | +| `-oma` | Orchestrator master address | +| `-omp` | Orchestrator master port for Stage 1 to connect to Stage 0 for task coordination | > [!IMPORTANT] > **Startup Order**: Stage 0 (orchestrator) must be launched **before** Stage 1 (headless). > Stage 0 will appear to hang on startup until Stage 1 (worker) connects — this is expected behavior. -### Inter-Stage Connectors +**Network Requirements** + +All nodes must have network connectivity to each other. Ensure the following ports are open **between all participating nodes**: -When deploying stages across nodes, configure the connector type in the deploy YAML: +| Port | Protocol | Service | Direction | +| :--- | :------- | :------ | :-------- | +| 50051 | TCP | Mooncake Master RPC | Worker → Orchestrator | +| 8080 | TCP | Mooncake HTTP Metadata Server | Worker → Orchestrator | +| 8091 | TCP | Orchestrator Master (`-omp`) | Worker → Orchestrator | +| 8000 | TCP | API Server (`--port`) | Client → Orchestrator | +| 9003 | TCP | Metrics (optional) | Monitoring → Orchestrator | -- **SharedMemoryConnector** (default): Used for single-node deployments. No explicit configuration needed. -- **MooncakeTransferEngineConnector**: For multi-node setups with RDMA hardware. Defined in [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml) under `connectors.rdma_connector`. +> **Tip**: If nodes are behind a firewall or in different VPCs/security groups, make sure the above ports are allowed in ingress/egress rules. All nodes should be reachable via their IP addresses (no NAT). Using nodes on the same subnet or VPC is recommended to minimize latency for Mooncake KV cache transfers. -To use Mooncake, create a custom deploy YAML that binds `output_connectors` / `input_connectors` on each stage to the `rdma_connector` defined in the `connectors` section. 
+### Send Multi-modal Request -## Send Requests +Get into the bagel folder: ```bash cd examples/online_serving/bagel ``` +Send request via Python + +```bash +python openai_chat_client.py --prompt "A cute cat" --modality text2img +``` + +The Python client supports the following command-line arguments: + +- `--prompt` (or `-p`): Text prompt for generation (default: `A cute cat`) +- `--output` (or `-o`): Output file path for image results (default: `bagel_output.png`) +- `--server` (or `-s`): Server URL (default: `http://localhost:8091`) +- `--image-url` (or `-i`): Input image URL or local file path (for img2img/img2text modes) +- `--modality` (or `-m`): Task modality (default: `text2img`). Options: `text2img`, `img2img`, `img2text`, `text2text` +- `--height`: Image height in pixels (default: 512) +- `--width`: Image width in pixels (default: 512) +- `--steps`: Number of inference steps (default: 25) +- `--seed`: Random seed (default: 42) +- `--negative`: Negative prompt for image generation + +Example with custom parameters: + +```bash +python openai_chat_client.py \ + --prompt "A futuristic city" \ + --modality text2img \ + --height 768 \ + --width 768 \ + --steps 50 \ + --seed 42 \ + --negative "blurry, low quality" +``` + +## Modality Control + +BAGEL-7B-MoT supports **multiple modality modes** for different use cases. + +The default yaml configuration deploys Thinker and DiT on the same GPU. 
You can use the default configuration file: [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml) + +| Modality | Input | Output | Description | +| ----------- | ------------ | ------ | -------------------------------------- | +| `text2img` | Text | Image | Generate images from text prompts | +| `img2img` | Image + Text | Image | Transform images using text guidance | +| `img2text` | Image + Text | Text | Generate text descriptions from images | +| `text2text` | Text | Text | Pure text generation | + ### Text to Image (text2img) -**Python client:** +Generate images from text prompts: + +**Using Python client** ```bash python openai_chat_client.py \ @@ -149,7 +238,7 @@ python openai_chat_client.py \ --steps 50 ``` -**curl:** +**Using curl** ```bash curl http://localhost:8091/v1/chat/completions \ @@ -164,9 +253,12 @@ curl http://localhost:8091/v1/chat/completions \ }' ``` + ### Image to Image (img2img) -**Python client:** +Transform images based on text prompts: + +**Using Python client** ```bash python openai_chat_client.py \ @@ -176,7 +268,7 @@ python openai_chat_client.py \ --output transformed.png ``` -**curl:** +**Using curl** ```bash IMAGE_BASE64=$(base64 -w 0 cat.jpg) @@ -201,11 +293,14 @@ EOF curl http://localhost:8091/v1/chat/completions \ -H "Content-Type: application/json" \ -d @payload.json + ``` ### Image to Text (img2text) -**Python client:** +Generate text descriptions from images: + +**Using Python client** ```bash python openai_chat_client.py \ @@ -214,7 +309,7 @@ python openai_chat_client.py \ --image-url /path/to/image.jpg ``` -**curl:** +**Using curl** ```bash IMAGE_BASE64=$(base64 -w 0 cat.jpg) @@ -239,7 +334,9 @@ curl http://localhost:8091/v1/chat/completions \ ### Text to Text (text2text) -**Python client:** +Pure text generation: + +**Using Python client** ```bash python openai_chat_client.py \ @@ -247,81 +344,33 @@ python openai_chat_client.py \ --modality text2text ``` 
-**curl:** +**Using curl** ```bash curl http://localhost:8091/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}], + "messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}], "modalities": ["text"] }' ``` -### Python Client Arguments - -| Argument | Default | Description | -| :------- | :------ | :---------- | -| `--prompt` / `-p` | `A cute cat` | Text prompt | -| `--output` / `-o` | `bagel_output.png` | Output file path | -| `--server` / `-s` | `http://localhost:8091` | Server URL | -| `--image-url` / `-i` | `None` | Input image URL or local path (img2img/img2text) | -| `--modality` / `-m` | `text2img` | `text2img`, `img2img`, `img2text`, `text2text` | -| `--height` | `512` | Image height in pixels | -| `--width` | `512` | Image width in pixels | -| `--steps` | `25` | Number of inference steps | -| `--seed` | `42` | Random seed | -| `--negative` | `None` | Negative prompt for CFG | +## FAQ -Example with custom parameters: +- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. 
```bash -python openai_chat_client.py \ - --prompt "A futuristic city" \ - --modality text2img \ - --height 768 \ - --width 768 \ - --steps 50 \ - --seed 42 \ - --negative "blurry, low quality" +sudo apt update +sudo apt install ffmpeg ``` -## Configuration Reference - -### Deploy YAML Files - -| File | Description | -| :--- | :---------- | -| [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml) | Two-stage default (Thinker + DiT on GPU 0) | -| [`bagel_single_stage.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel_single_stage.yaml) | Single-stage (DiT only) | - -### Key Deploy YAML Fields - -| Field | Scope | Description | -| :---- | :---- | :---------- | -| `pipeline` | top-level | Override auto-detected pipeline (e.g. `bagel_single_stage`) | -| `stages[].stage_id` | per-stage | Stage identifier (0, 1, ...) | -| `stages[].devices` | per-stage | GPU device IDs (e.g. `"0"`, `"0,1"`) | -| `stages[].max_num_seqs` | per-stage | Maximum concurrent sequences | -| `stages[].gpu_memory_utilization` | per-stage | Fraction of GPU memory to use | -| `stages[].enforce_eager` | per-stage | Disable CUDA graphs | -| `stages[].tensor_parallel_size` | per-stage | TP degree for this stage | -| `connectors` | top-level | Define available connector instances (SHM, Mooncake) | -| `platforms` | top-level | Platform-specific overrides (e.g. `xpu`) | - -## FAQ - -- If you encounter OOM errors, try decreasing `max_model_len` or `gpu_memory_utilization` in the deploy YAML. - -**Two-stage VRAM usage:** - -| Stage | VRAM | -| :---- | :--- | -| Stage 0 (Thinker) | **15.04 GiB + KV Cache** | -| Stage 1 (DiT) | **26.50 GiB** | -| Total | **~42 GiB + KV Cache** | +- If you are unsure how much VRAM the model needs, or you encounter an OOM error, try decreasing `max_model_len`. -**Single-stage VRAM usage:** The DiT loads the full model (~42 GiB) in one process. 
+| Stage | VRAM | +| :------------------ | :--------------------------- | +| Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** | +| Stage-1 (DiT) | **26.50 GiB** | +| Total | **~42 GiB + KV Cache** | ## Example materials diff --git a/docs/user_guide/examples/online_serving/diffusers_pipeline_adapter.md b/docs/user_guide/examples/online_serving/diffusers_pipeline_adapter.md deleted file mode 100644 index ac88071d53f..00000000000 --- a/docs/user_guide/examples/online_serving/diffusers_pipeline_adapter.md +++ /dev/null @@ -1,93 +0,0 @@ -# Diffusers Backend Adapter Example - -Source . - - -This example demonstrates how to serve any 🤗 Diffusers pipeline through vLLM-Omni -using the `diffusers` load format. - -## Supported Models - -Any model loadable via `DiffusionPipeline.from_pretrained()` should be supported, including text-to-image, image-to-image, text-to-video, image-to-video, and text-to-audio. - -## Limitations - -The diffusers backend is a black-box adapter. The following features are NOT yet supported. -It is not guaranteed whether they will be supported in the future. - -- CFG parallel execution -- Sequence parallel execution -- TeaCache / Cache-DiT acceleration -- Step-wise execution (continuous batching) - -For these features, it is recommended to use natively supported pipelines instead. - -## Usage - -### Option 1: CLI arguments - -```bash -vllm serve "stable-diffusion-v1-5/stable-diffusion-v1-5" \ - --omni \ - --diffusion-load-format diffusers \ - --diffusers-load-kwargs '{"use_safetensors": true}' \ - --diffusers-call-kwargs '{"num_inference_steps": 30, "guidance_scale": 7.5}' -``` - -`--diffusers-load-kwargs` and `--diffusers-call-kwargs` are only valid together with `--diffusion-load-format diffusers`. 
- -### Option 2: Stage config YAML - -```bash -vllm serve stable-diffusion-v1-5/stable-diffusion-v1-5 --stage-configs-path examples/online_serving/diffusers_pipeline_adapter/stage_config.yaml --omni -``` - -The particular fields of interest are `model`, `diffusion_load_format`, `diffusers_load_kwargs`, and `diffusers_call_kwargs` under `engine_args`. They are the same as the CLI arguments. - -## Send a Request - -```bash -curl http://localhost:8000/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{ - "model": "stable-diffusion-v1-5/stable-diffusion-v1-5", - "prompt": "a photo of an astronaut riding a horse on mars", - "n": 1, - "size": "512x512" - }' -``` - -Or refer to other documentation pages on how to request a particular input/output modality, such as `examples/online_serving/text_to_image/openai_chat_client.py`. - -## Configuration Reference - -For the diffusers adapter, set options under **`engine_args`**: - -### `diffusion_load_format: "diffusers"` - -This field selects the Hugging Face diffusers adapter path (see `DiffusersPipelineLoader`). - -### `diffusers_load_kwargs` - -Passed to `DiffusionPipeline.from_pretrained()`. - -This is suitable for model-specific configurations not available through the vLLM-Omni interface (such as `Omni.__init__()`, `vllm serve` CLI arguments, and stage config YAML fields outside `diffusers_load_kwargs`). - -When a parameter is available in the vLLM-Omni interface, it will be adapted here. -But if that parameter is simultaneously set in both the vLLM-Omni interface and `diffusers_load_kwargs`, the **latter** will take precedence. - -### `diffusers_call_kwargs` - -Passed to `pipeline.__call__()`. - -This is suitable for sampling parameters not available through the vLLM-Omni interface (such as `Omni.generate()` and online serving payloads). - -When a parameter is available in the vLLM-Omni interface, it will be adapted here. 
-But if that parameter is simultaneously set in both the vLLM-Omni interface and `diffusers_call_kwargs`, the **former** will take precedence (because it is set at request time). - -## Example materials - -??? abstract "stage_config.yaml" - ``````yaml - --8<-- "examples/online_serving/diffusers_pipeline_adapter/stage_config.yaml" - `````` diff --git a/docs/user_guide/examples/online_serving/fish_speech.md b/docs/user_guide/examples/online_serving/fish_speech.md index 2a15ef44ac8..7322d06aaaf 100644 --- a/docs/user_guide/examples/online_serving/fish_speech.md +++ b/docs/user_guide/examples/online_serving/fish_speech.md @@ -41,11 +41,15 @@ Features: ## Launch the Server ```bash -vllm serve fishaudio/s2-pro --omni --port 8091 +vllm-omni serve fishaudio/s2-pro \ + --stage-configs-path vllm_omni/model_executor/stage_configs/fish_speech_s2_pro.yaml \ + --omni \ + --port 8091 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 ``` -The deploy config is auto-loaded from `vllm_omni/deploy/fish_qwen3_omni.yaml`. - Or use the convenience script: ```bash diff --git a/docs/user_guide/examples/online_serving/glm_image.md b/docs/user_guide/examples/online_serving/glm_image.md index 4cc49e84602..f7027b906db 100644 --- a/docs/user_guide/examples/online_serving/glm_image.md +++ b/docs/user_guide/examples/online_serving/glm_image.md @@ -1,96 +1,99 @@ # GLM-Image Online Serving -GLM-Image is a 2-stage image generation model (AR + Diffusion) supported by vLLM-Omni's -declarative config system. The pipeline topology and stage structure are declared in -`vllm_omni/model_executor/models/glm_image/pipeline.py`; deployment knobs (GPU placement, -memory, sampling params) live in `vllm_omni/deploy/glm_image.yaml`. +Source . -## Start Server + +This example demonstrates how to deploy GLM-Image for online image generation using vLLM-Omni. 
+ +## 🛠️ Installation + +Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md) + +## Run examples (GLM-Image) + +**Note**: These examples work with the default configuration on **2× NVIDIA A100 (80GB)** or equivalent. Stage 0 (AR) and Stage 1 (Diffusion) each use one GPU by default. For single-GPU setups, modify the stage configuration to share the same device. + +### Launch the Server ```bash +# Use default configuration vllm serve zai-org/GLM-Image --omni --port 8091 ``` -The config system auto-detects the pipeline from the model's `model_index.json` — no -manual `--stage-configs-path` or `--deploy-config` needed. +Or use the convenience script: + +```bash +cd examples/online_serving/glm_image +bash run_server.sh +``` + +If you have a custom stage configs file: + +```bash +vllm serve zai-org/GLM-Image --omni --port 8091 --stage-configs-path /path/to/glm_image.yaml +``` + +### Send Requests -By default, stage 0 (AR) runs on GPU 0 and stage 1 (Diffusion) on GPU 1. 
To colocate -both stages on a single GPU, override per stage: +Get into the glm_image folder: ```bash -vllm serve zai-org/GLM-Image --omni --port 8091 \ - --stage-0-devices 0 --stage-1-devices 0 +cd examples/online_serving/glm_image ``` -## API Calls +Send request via Python: + +```bash +python openai_chat_client.py --prompt "A cute cat sitting on a window sill" +``` + +The Python client supports the following command-line arguments: + +- `--prompt` (or `-p`): Text prompt for generation (default: `A beautiful sunset over the ocean with sailing boats`) +- `--output` (or `-o`): Output file path (default: `glm_image_output.png`) +- `--server` (or `-s`): Server URL (default: `http://localhost:8091`) +- `--image` (or `-i`): Input image path (for image-to-image editing) +- `--height`: Image height in pixels (default: 1024) +- `--width`: Image width in pixels (default: 1024) +- `--steps`: Number of inference steps (default: 50) +- `--guidance-scale`: Classifier-free guidance scale (default: 1.5) +- `--seed`: Random seed (default: 42) +- `--negative`: Negative prompt + +## Modality Control + +GLM-Image supports **text-to-image** and **image-to-image** modes. + +The default yaml configuration deploys AR on GPU 0 and DiT on GPU 1. 
You can use the default configuration file: [`glm_image.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/glm_image.yaml) + +| Mode | Input | Output | Description | +| -------------- | ------------ | ------ | ---------------------------------- | +| Text-to-Image | Text | Image | Generate images from text prompts | +| Image-to-Image | Image + Text | Image | Edit images with text instructions | ### Text-to-Image ```bash -curl -s http://localhost:8091/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "messages": [ - {"role": "user", "content": "A photorealistic mountain landscape at sunset"} - ], - "extra_body": { - "height": 1024, - "width": 1024, - "num_inference_steps": 50, - "guidance_scale": 1.5, - "seed": 42 - } - }' | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png +python openai_chat_client.py \ + --prompt "A photorealistic mountain landscape at sunset" \ + --height 1024 \ + --width 1024 \ + --output landscape.png + +# Or use the curl script: +bash run_curl_text_to_image.sh "A futuristic city skyline at night" ``` ### Image-to-Image (Image Editing) ```bash -curl -s http://localhost:8091/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Convert this image to watercolor style"}, - {"type": "image_url", "image_url": {"url": "data:image/png;base64,$(base64 -w0 input.png)}"} - ] - } - ], - "extra_body": { - "height": 1024, - "width": 1024, - "num_inference_steps": 50, - "guidance_scale": 1.5 - } - }' | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png -``` +python openai_chat_client.py \ + --prompt "Convert this image to watercolor style" \ + --image input.png \ + --output watercolor.png -### Using the OpenAI Python SDK - -```python -from openai import OpenAI -import base64 - -client = 
OpenAI(base_url="http://localhost:8091/v1", api_key="none") - -response = client.chat.completions.create( - model="zai-org/GLM-Image", - messages=[{"role": "user", "content": "A beautiful sunset over the ocean"}], - extra_body={ - "height": 1024, - "width": 1024, - "num_inference_steps": 50, - "guidance_scale": 1.5, - "seed": 42, - }, -) - -img_url = response.choices[0].message.content[0].image_url.url -_, b64_data = img_url.split(",", 1) -with open("output.png", "wb") as f: - f.write(base64.b64decode(b64_data)) +# Or use the curl script: +bash run_curl_image_edit.sh input.png "Convert to watercolor style" ``` For general-purpose request methods (curl, OpenAI SDK, Python `requests`), see @@ -101,9 +104,9 @@ guides. When using `/v1/chat/completions`, pass these inside `extra_body` in the curl JSON, or via the `extra_body` keyword argument in the OpenAI Python SDK (see the -[Diffusion Chat API guide](../../../serving/diffusion_chat_api.md)). -When using the dedicated [`/v1/images/generations`](../../../serving/image_generation_api.md) -or [`/v1/images/edits`](../../../serving/image_edit_api.md) endpoints, pass +[Diffusion Chat API guide](../../../../serving/diffusion_chat_api.md)). +When using the dedicated [`/v1/images/generations`](../../../../serving/image_generation_api.md) +or [`/v1/images/edits`](../../../../serving/image_edit_api.md) endpoints, pass the supported generation controls as top-level fields directly. For image dimensions and count, use `size` and `n` rather than `height` or `width`. @@ -113,7 +116,7 @@ dimensions and count, use `size` and `n` rather than `height` or `width`. 
| `width` | int | 1024 | Image width in pixels | | `num_inference_steps` | int | 50 | Number of diffusion denoising steps | | `guidance_scale` | float | 1.5 | Classifier-free guidance scale | -| `seed` | int | None | Optional random seed | +| `seed` | int | None | Optional random seed; `/v1/images/*` generates one server-side if omitted | | `negative_prompt` | str | None | Negative prompt | ## Response Format @@ -147,12 +150,13 @@ dimensions and count, use `size` and `n` rather than `height` or `width`. ## Extract Image ```bash +# From a saved JSON response cat response.json | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png ``` ## Architecture -GLM-Image uses a 2-stage pipeline: +GLM-Image uses a 2-stage multistage pipeline: ``` Stage 0 (AR Model) Stage 1 (Diffusion) @@ -177,13 +181,41 @@ Stage 0 (AR Model) Stage 1 (Diffusion) | Stage-1 (DiT+VAE) | **~20 GiB** | | Total | **~38 GiB + KV Cache** | +## File Description + +| File | Description | +| --------------------------- | ------------------------------------- | +| `run_server.sh` | Server startup script | +| `run_curl_text_to_image.sh` | Text-to-image curl example | +| `run_curl_image_edit.sh` | Image-to-image (editing) curl example | +| `openai_chat_client.py` | Python client (t2i + i2i) | + ## FAQ -- If you encounter OOM errors, adjust `gpu_memory_utilization` in the deploy config: +- If you encounter OOM errors, adjust `gpu_memory_utilization` in the stage config: ```yaml -# In vllm_omni/deploy/glm_image.yaml, reduce from default 0.6: +# In glm_image.yaml, reduce from default 0.6: gpu_memory_utilization: 0.5 ``` - The first request may be slow due to model warmup. Subsequent requests will be faster. + +## Example materials + +??? abstract "openai_chat_client.py" + ``````py + --8<-- "examples/online_serving/glm_image/openai_chat_client.py" + `````` +??? 
abstract "run_curl_image_edit.sh" + ``````sh + --8<-- "examples/online_serving/glm_image/run_curl_image_edit.sh" + `````` +??? abstract "run_curl_text_to_image.sh" + ``````sh + --8<-- "examples/online_serving/glm_image/run_curl_text_to_image.sh" + `````` +??? abstract "run_server.sh" + ``````sh + --8<-- "examples/online_serving/glm_image/run_server.sh" + `````` diff --git a/docs/user_guide/examples/online_serving/image_to_video.md b/docs/user_guide/examples/online_serving/image_to_video.md index 781f0c2a5ed..00b67d74e26 100644 --- a/docs/user_guide/examples/online_serving/image_to_video.md +++ b/docs/user_guide/examples/online_serving/image_to_video.md @@ -72,9 +72,6 @@ curl -X POST http://localhost:8091/v1/videos/sync \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ - -F "enable_frame_interpolation=true" \ - -F "frame_interpolation_exp=1" \ - -F "frame_interpolation_scale=1.0" \ -F "seed=42" \ -o sync_i2v_output.mp4 ``` @@ -117,9 +114,6 @@ create_response=$(curl -s http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ - -F "enable_frame_interpolation=true" \ - -F "frame_interpolation_exp=1" \ - -F "frame_interpolation_scale=1.0" \ -F "seed=42") video_id=$(echo "$create_response" | jq -r '.id') @@ -178,35 +172,9 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ - -F "enable_frame_interpolation=true" \ - -F "frame_interpolation_exp=1" \ - -F "frame_interpolation_scale=1.0" \ -F "seed=42" ``` -Frame interpolation is also available for supported Wan2.2 I2V requests. See -[Frame Interpolation](../../diffusion/frame_interpolation.md) for worker-side -execution details and feature constraints. 
- -### Frame Interpolation Example - -```bash -curl -X POST http://localhost:8091/v1/videos/sync \ - -F "prompt=A bear playing with yarn, smooth motion" \ - -F "input_reference=@/path/to/qwen-bear.png" \ - -F "width=832" \ - -F "height=480" \ - -F "num_frames=33" \ - -F "fps=16" \ - -F "num_inference_steps=40" \ - -F "guidance_scale=1.0" \ - -F "guidance_scale_2=1.0" \ - -F "enable_frame_interpolation=true" \ - -F "frame_interpolation_exp=1" \ - -F "frame_interpolation_scale=1.0" \ - -o sync_i2v_interpolated.mp4 -``` - ## Create Response Format `POST /v1/videos` returns a job record, not inline base64 video data. diff --git a/docs/user_guide/examples/online_serving/mimo_audio.md b/docs/user_guide/examples/online_serving/mimo_audio.md index c8752f5782e..4737eca3664 100644 --- a/docs/user_guide/examples/online_serving/mimo_audio.md +++ b/docs/user_guide/examples/online_serving/mimo_audio.md @@ -13,10 +13,10 @@ Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/ ```bash export MIMO_AUDIO_TOKENIZER_PATH="XiaomiMiMo/MiMo-Audio-Tokenizer" -vllm serve XiaomiMiMo/MiMo-Audio-7B-Instruct --omni \ - --served-model-name "MiMo-Audio-7B-Instruct" \ - --port 18091 \ - --chat-template ./examples/online_serving/mimo_audio/chat_template.jinja +vllm-omni serve XiaomiMiMo/MiMo-Audio-7B-Instruct --omni \ +--served-model-name "MiMo-Audio-7B-Instruct" \ +--port 18091 --stage-configs-path ./vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ +--chat-template ./examples/online_serving/mimo_audio/chat_template.jinja ``` > ⚠️ **Important** > **MiMo-Audio is not compatible with the default chat template.** diff --git a/docs/user_guide/examples/online_serving/qwen2_5_omni.md b/docs/user_guide/examples/online_serving/qwen2_5_omni.md index b3a2c9f2ac9..43576469242 100644 --- a/docs/user_guide/examples/online_serving/qwen2_5_omni.md +++ b/docs/user_guide/examples/online_serving/qwen2_5_omni.md @@ -218,6 +218,14 @@ The gradio script supports the following 
arguments: - `--port`: Port for Gradio server (default: 7861) - `--share`: Share the Gradio demo publicly (creates a public link) +### FAQ + +If you encounter an error about the librosa backend, install ffmpeg with the commands below. +``` +sudo apt update +sudo apt install ffmpeg +``` + ## Example materials ??? abstract "gradio_demo.py" diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md index 22d89ee8018..69de24852f6 100644 --- a/docs/user_guide/examples/online_serving/qwen3_omni.md +++ b/docs/user_guide/examples/online_serving/qwen3_omni.md @@ -15,72 +15,15 @@ Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 ``` -The default deployment configuration situated at `vllm_omni/deploy/qwen3_omni_moe.yaml` is resolved and loaded -automatically via the model registry, obviating the necessity for the `--deploy-config` flag in standard deployment topologies. -Asynchronous chunk streaming is **enabled by default** within the bundled configuration. +To enable async chunking for Qwen3-Omni, launch the server with the command below: -To explicitly utilize a custom deployment YAML, specify the configuration path: ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --deploy-config /path/to/deploy_config_file +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml ``` -### Launch individual stages (stage-based CLI) - -Adopt the stage-based CLI architecture to independently instantiate execution processes per functional stage. - -**1. Stage 0 (Thinker + API server)** - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --port 8091 \ - --stage-id 0 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -**2.
Stage 1 (Talker)** - +If you have a custom stage configs file, launch the server with the command below: ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 1 \ - --headless \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -**3. Stage 2 (Code2Wav)** - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 2 \ - --headless \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -Add `--deploy-config /path/to/deploy_config_file` to every command if you want -to override the bundled deploy YAML. - -For the regular one-process launch, stage-specific CLI tuning is usually done -with `--stage-overrides`, for example: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --stage-overrides '{"1": {"gpu_memory_utilization": 0.5}}' -``` - -For the stage-based CLI, you usually do **not** need `--stage-overrides` for -that kind of change. Since each command launches one stage, just pass the knob -directly on that stage command: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 1 \ - --headless \ - --gpu-memory-utilization 0.5 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file ``` ### Send Multi-modal Request @@ -121,6 +64,15 @@ python openai_chat_completion_client_for_multimodal_generation.py \ bash run_curl_multimodal_generation.sh use_image ``` + +### FAQ + +If you encounter an error about the librosa backend, install ffmpeg with the commands below. +``` +sudo apt update +sudo apt install ffmpeg +``` + ## Modality control You can control output modalities to specify which types of output the model should generate. This is useful when you only need text output and want to skip audio generation stages for better performance.
@@ -244,7 +196,7 @@ The script supports the following arguments: - `--model`: Model name/path (default: Qwen/Qwen3-Omni-30B-A3B-Instruct) - `--server-port`: Port for vLLM server (default: 8091) - `--gradio-port`: Port for Gradio demo (default: 7861) -- `--deploy-config`: Path to custom deploy config YAML file (optional) +- `--stage-configs-path`: Path to custom stage configs YAML file (optional) - `--server-host`: Host for vLLM server (default: 0.0.0.0) - `--gradio-ip`: IP for Gradio demo (default: 127.0.0.1) - `--share`: Share Gradio demo publicly (creates a public link) @@ -259,7 +211,7 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 If you have custom stage configs file: ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /path/to/deploy_config_file +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file ``` **Step 2: Run the Gradio demo** diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md index 95f234f02de..156c4942cd9 100644 --- a/docs/user_guide/examples/online_serving/qwen3_tts.md +++ b/docs/user_guide/examples/online_serving/qwen3_tts.md @@ -58,7 +58,7 @@ Then open http://localhost:7860 in your browser. 
```bash # CustomVoice model (predefined speakers) vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -66,7 +66,7 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ # VoiceDesign model vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -74,7 +74,7 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ # Base model (voice cloning) vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -211,6 +211,14 @@ with open("output.wav", "wb") as f: f.write(response.content) ``` +### FAQ + +If you encounter an error about the librosa backend, install ffmpeg with the commands below. +``` +sudo apt update +sudo apt install ffmpeg +``` + ## API Reference ### Voices Endpoint diff --git a/docs/user_guide/examples/online_serving/text_to_video.md b/docs/user_guide/examples/online_serving/text_to_video.md index b918aac19d0..d58296fcc78 100644 --- a/docs/user_guide/examples/online_serving/text_to_video.md +++ b/docs/user_guide/examples/online_serving/text_to_video.md @@ -3,28 +3,17 @@ Source . -This example demonstrates how to deploy text-to-video models for online video generation using vLLM-Omni. +This example demonstrates how to deploy the Wan2.2 text-to-video model for online video generation using vLLM-Omni.
-## Supported Models +## Start Server -| Model | Model ID | -|-------|----------| -| Wan2.1 T2V (1.3B) | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | -| Wan2.1 T2V (14B) | `Wan-AI/Wan2.1-T2V-14B-Diffusers` | -| Wan2.2 T2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | -| LTX-2 | `Lightricks/LTX-2` | - -## Wan2.2 T2V - -### Start Server - -#### Basic Start +### Basic Start ```bash vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni --port 8091 ``` -#### Start with Parameters +### Start with Parameters Or use the startup script: @@ -165,9 +154,6 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=4.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=5.0" \ - -F "enable_frame_interpolation=true" \ - -F "frame_interpolation_exp=1" \ - -F "frame_interpolation_scale=1.0" \ -F "seed=42" ``` @@ -190,35 +176,6 @@ curl -X POST http://localhost:8091/v1/videos \ | `flow_shift` | float | None | Scheduler flow shift (Wan2.2) | | `seed` | int | None | Random seed (reproducible) | | `lora` | object | None | LoRA configuration | -| `enable_frame_interpolation` | bool | false | Enable RIFE frame interpolation before MP4 encoding | -| `frame_interpolation_exp` | int | 1 | Interpolation exponent; 1=2x temporal resolution, 2=4x | -| `frame_interpolation_scale` | float | 1.0 | RIFE inference scale; use 0.5 for high-resolution inputs | -| `frame_interpolation_model_path` | str | None | Local directory or Hugging Face repo ID with `flownet.pkl`; defaults to `elfgum/RIFE-4.22.lite` | - -## Frame Interpolation - -Frame interpolation is an optional post-processing step for `/v1/videos` and -`/v1/videos/sync`. It synthesizes intermediate frames between generated frames -without rerunning the diffusion model. If the generated video has `N` frames, -the interpolated output frame count is `(N - 1) * 2**exp + 1`. The encoder FPS -is multiplied by `2**exp` so the output duration remains close to the original. 
- -Frame interpolation runs in the diffusion worker post-processing path instead of -the API server encoding path, so it can reuse the worker's current accelerator -device without blocking the FastAPI event loop. - -Example: generate 5 frames and interpolate to 9 frames: - -```bash -curl -X POST http://localhost:8091/v1/videos/sync \ - -F "prompt=A dog running through a park" \ - -F "num_frames=5" \ - -F "fps=8" \ - -F "enable_frame_interpolation=true" \ - -F "frame_interpolation_exp=1" \ - -F "frame_interpolation_scale=1.0" \ - -o sync_t2v_interpolated.mp4 -``` ## Create Response Format @@ -277,102 +234,8 @@ while true; do done ``` -## LTX-2 - -### Start Server - -#### Basic Start - -```bash -vllm serve Lightricks/LTX-2 --omni --port 8098 \ - --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0 -``` - -For multi-GPU memory reduction, you can enable HSDP: - -```bash -vllm serve Lightricks/LTX-2 --omni --port 8098 \ - --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0 \ - --use-hsdp --hsdp-shard-size 2 -``` - -#### Start with Optimization Presets - -Use the LTX-2 startup script with built-in optimization presets: - -```bash -# Baseline (1 GPU, eager) -bash run_server_ltx2.sh baseline - -# 4-GPU Ulysses sequence parallelism (lossless) -bash run_server_ltx2.sh ulysses4 - -# Cache-DiT lossy acceleration (1 GPU, ~1.4× speedup) -bash run_server_ltx2.sh cache-dit - -# Best combo: 4-GPU Ulysses SP + Cache-DiT (~2.2× speedup) -bash run_server_ltx2.sh best-combo -``` - -#### Optimization Benchmarks - -Benchmarked on H800, online serving (480×768, 41 frames, 20 steps, `seed=42`). -"Inference" is the server-reported inference time; excludes HTTP/poll overhead. 
- -| Preset | Server Command | Inference (s) | Speedup | Type | -|--------|---------------|---------------|---------|------| -| `baseline` | `--enforce-eager` | 10.3 | 1.00× | — | -| `compile` | *(default, no --enforce-eager)* | ~10.3 (warm) | ~1.00× | Lossless | -| `ulysses4` | `--enforce-eager --usp 4` | ~10.3 | ~1.00× | Lossless | -| `cache-dit` | `--enforce-eager --cache-backend cache_dit` | 7.4 avg | ~1.4× | Lossy | -| `best-combo` | `--enforce-eager --usp 4 --cache-backend cache_dit` | 4.7 avg | **~2.2×** | Lossless + Lossy | - -**Observations**: -- **torch.compile**: On H800, warm-request inference time matches the eager baseline (~10.3s). - The first request pays ~6s compilation overhead. Benefit depends on model architecture and GPU. -- **Ulysses SP (4 GPU)**: No measurable speedup alone for 41-frame generation at this resolution. - Communication overhead outweighs gains at this sequence length. -- **Cache-DiT**: Inference varies per request (6–10s) due to dynamic caching decisions. - Average is ~7.4s (~1.4× speedup) with slight quality tradeoff. -- **Best combo**: 4-GPU Ulysses SP + Cache-DiT synergize well — Cache-DiT reduces per-step - computation, making the communication overhead of Ulysses SP worthwhile. Average ~4.7s - (~2.2× speedup). -- **FP8 quantization**: Reduces VRAM but does not speed up LTX-2 on H800 (compute-bound). - -**Deployment Recommendations**: -- For **production with quality priority**: use `baseline` with `--enforce-eager` -- For **maximum throughput** (4 GPUs, quality tradeoff): use `best-combo` (~2.2× speedup) -- For **single-GPU throughput**: use `cache-dit` (~1.4× speedup) -- `--enforce-eager` is recommended to avoid torch.compile warmup latency on first request - -### Send Requests (curl) - -```bash -# Using the provided script -bash run_curl_ltx2.sh - -# Or directly -curl -sS -X POST http://localhost:8098/v1/videos \ - -H "Accept: application/json" \ - -F "prompt=A serene lakeside sunrise with mist over the water." 
\ - -F "width=768" \ - -F "height=480" \ - -F "num_frames=41" \ - -F "fps=24" \ - -F "num_inference_steps=20" \ - -F "guidance_scale=3.0" \ - -F "seed=42" -``` - ## Example materials -??? abstract "response.json" - ``````json - --8<-- "examples/online_serving/text_to_video/response.json" - `````` -??? abstract "run_curl_ltx2.sh" - ``````sh - --8<-- "examples/online_serving/text_to_video/run_curl_ltx2.sh" ??? abstract "run_curl_hunyuan_video_15.sh" ``````sh --8<-- "examples/online_serving/text_to_video/run_curl_hunyuan_video_15.sh" @@ -385,9 +248,6 @@ curl -sS -X POST http://localhost:8098/v1/videos \ ``````sh --8<-- "examples/online_serving/text_to_video/run_server.sh" `````` -??? abstract "run_server_ltx2.sh" - ``````sh - --8<-- "examples/online_serving/text_to_video/run_server_ltx2.sh" ??? abstract "run_server_hunyuan_video_15.sh" ``````sh --8<-- "examples/online_serving/text_to_video/run_server_hunyuan_video_15.sh" diff --git a/examples/offline_inference/bagel/README.md b/examples/offline_inference/bagel/README.md index 9955fd90db9..226c009f792 100644 --- a/examples/offline_inference/bagel/README.md +++ b/examples/offline_inference/bagel/README.md @@ -1,60 +1,44 @@ # BAGEL-7B-MoT -## Setup +## Set up Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. -## Architecture +## Run examples -BAGEL-7B-MoT is a Mixture-of-Transformers (MoT) model supporting both image generation and understanding. It offers two deployment topologies: +**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices. 
-| Topology | Stages | Description | -| :------- | :----- | :---------- | -| **Two-stage** (default) | Stage 0 (Thinker, AR) + Stage 1 (DiT, Diffusion) | Thinker handles text/understanding via vLLM AR engine; DiT handles image generation. KV cache is transferred between stages. | -| **Single-stage** | Stage 0 (DiT, Diffusion) only | The DiT stage contains a full LLM, ViT, VAE, and tokenizer internally. All modalities are handled within a single diffusion process. | - -Both topologies support all four modalities: `text2img`, `img2img`, `img2text`, `text2text`. - -## Quick Start +Get into the bagel folder ```bash cd examples/offline_inference/bagel - -# Default two-stage mode (auto-detected) -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" - -# Single-stage mode -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" \ - --deploy-config vllm_omni/deploy/bagel_single_stage.yaml ``` -> **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. +### Modality Control -## Modality Control +BAGEL-7B-MoT supports multiple modality modes. 
You can control the mode using the `--modality` argument: -Control the mode using the `--modality` argument: +#### Text to Image (text2img) -| Modality | Input | Output | Description | -| :------- | :---- | :----- | :---------- | -| `text2img` | Text | Image | Generate images from text prompts | -| `img2img` | Image + Text | Image | Transform images using text guidance | -| `img2text` | Image + Text | Text | Generate text descriptions from images | -| `text2text` | Text | Text | Pure text generation (language model mode) | +- **Pipeline**: Text → Thinker → DiT → VAE Decode → Image +- **Stages Used**: Stage 0 (Thinker) + Stage 1 (DiT) +- **KV Transfer**: Thinker sends KV cache to DiT for conditioned generation -### Text to Image (text2img) +Generate images from text prompts: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ - --prompts "A cute cat" \ - --steps 50 + --prompts "A cute cat" ``` -### Image to Image (img2img) +#### Image to Image (img2img) + +- **Pipeline**: Image → VAE Encode → DiT → VAE Decode → New Image +- **Stages Used**: Stage 1 (DiT) only +- **Special**: Bypasses the Thinker stage, direct image-to-image transformation + +Transform images based on text prompts: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ @@ -63,7 +47,13 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --prompts "Let the woman wear a blue dress" ``` -### Image to Text (img2text) +#### Image to Text (img2text) + +- **Pipeline**: Image → ViT + VAE Encode → Thinker → Text Output +- **Stages Used**: Stage 0 (Thinker) only +- **Special**: Uses both VAE latent encoding AND ViT semantic encoding for comprehensive image understanding + +Generate text descriptions from images: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ @@ -72,206 +62,202 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --prompts "Describe this image in detail" ``` -### Text to Text (text2text) +#### Text to Text (text2text) + +- 
**Pipeline**: Text → Thinker → Text Output +- **Stages Used**: Stage 0 (Thinker) only +- **Special**: No visual components involved, operates as pure language model + +Pure text generation: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2text \ --prompts "What is the capital of France?" -# Load prompts from a text file (one prompt per line): +# You can load prompts from a text file (one prompt per line): python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2text \ --txt-prompts /path/to/prompts.txt ``` -## Think Mode +### Inference Steps -Think mode enables the model to generate `...` planning/reasoning tokens before producing the final output. This improves generation quality for complex prompts. - -- **Two-stage**: The Thinker (AR) stage decodes think tokens, then transfers the augmented KV cache to the DiT stage for image generation. -- **Single-stage**: The DiT's internal LLM generates think tokens in-place before proceeding to denoise. +Control the number of inference steps for image generation: ```bash -# Think + text2img: plan before generating +# You can adjust steps to 100 to improve image quality python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ - --prompts "A futuristic city with flying cars" \ - --think \ - --max-think-tokens 1000 + --steps 50 \ + --prompts "A cute cat" +``` -# Think + img2img: reason about the edit -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality img2img \ - --image-path /path/to/image.jpg \ - --prompts "Make it look like a watercolor painting" \ - --think +### Key arguments -# Think + img2text: reason before describing -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality img2text \ - --image-path /path/to/image.jpg \ - --prompts "What is happening in this image?" \ - --think +BAGEL-7B-MoT supports **multiple modality modes** for different use cases. 
-# Think + text2text: chain-of-thought reasoning -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2text \ - --prompts "Solve: 23 * 47" \ - --think -``` +The default yaml configuration deploys Thinker and DiT on the same GPU. You can use the default configuration file: [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml) -Think mode parameters: +#### 📌 Command Line Arguments (end2end.py) -| Argument | Default | Description | -| :------- | :------ | :---------- | -| `--think` | `False` | Enable thinking mode | -| `--max-think-tokens` | `1000` | Maximum tokens for think generation | -| `--do-sample` | `False` | Enable sampling (vs. greedy) for text generation | -| `--text-temperature` | `0.3` | Temperature for text generation sampling | +| Argument | Type | Default | Description | +| :--------------------- | :----- | :---------------------------- | :----------------------------------------------------------- | +| `--model` | string | `ByteDance-Seed/BAGEL-7B-MoT` | Model path or name | +| `--modality` | choice | `text2img` | Modality mode: `text2img`, `img2img`, `img2text`, `text2text` | +| `--prompts` | list | `None` | Input text prompts directly | +| `--txt-prompts` | string | `None` | Path to txt file with one prompt per line | +| `--image-path` | string | `None` | Input image path (for `img2img`/`img2text`) | +| `--steps` | int | `50` | Number of inference steps | +| `--stage-configs-path` | string | `None` | Custom stage config file path | +| `--worker-backend` | choice | `process` | Worker backend: `process` or `ray` | +| `--ray-address` | string | `None` | Ray cluster address | +| `--enable-stats` | flag | `False` | Enable statistics logging | +| `--init-sleep-seconds` | int | `20` | Initialization sleep time | +| `--batch-timeout` | int | `5` | Batch timeout | +| `--init-timeout` | int | `300` | Initialization timeout | -## Classifier-Free Guidance (CFG) +------ -CFG controls the trade-off between prompt fidelity 
and diversity. These parameters apply to image generation modalities (`text2img`, `img2img`). +#### ⚙️ Stage Configuration Parameters (bagel.yaml) -```bash -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A photorealistic portrait" \ - --cfg-text-scale 6.0 \ - --cfg-img-scale 2.0 \ - --negative-prompt "blurry, low quality, distorted" \ - --cfg-interval 0.4 1.0 \ - --cfg-renorm-type global \ - --cfg-renorm-min 0.0 -``` + **Stage 0 - Thinker (LLM Stage)** -| Argument | Default | Description | -| :------- | :------ | :---------- | -| `--cfg-text-scale` | `4.0` | Text CFG scale (higher = more prompt-adherent) | -| `--cfg-img-scale` | `1.5` | Image CFG scale (for img2img) | -| `--negative-prompt` | `None` | Negative prompt for CFG conditioning | -| `--cfg-interval` | pipeline default | CFG active interval `[start, end]` as fractions of total timesteps | -| `--cfg-renorm-type` | `None` | Renormalization type: `global`, `text_channel`, `channel` | -| `--cfg-renorm-min` | `None` | Minimum renormalization value | -| `--cfg-parallel-size` | `1` | CFG parallel size: `1` = batched (single GPU), `2` = 2-branch parallel, `3` = full 3-GPU parallel | +| Parameter | Value | Description | +| :------------------------------- | :------------------------------ | :----------------------- | +| `stage_type` | `llm` | Stage type | +| `devices` | `"0"` | GPU device ID | +| `max_num_seqs` | `1` | Maximum batch size | +| `model_stage` | `thinker` | Model stage identifier | +| `model_arch` | `BagelForConditionalGeneration` | Model architecture | +| `gpu_memory_utilization` | `0.4` | GPU memory utilization | +| `tensor_parallel_size` | `1` | Tensor parallel size | +| `max_num_batched_tokens` | `32768` | Maximum batched tokens | +| `omni_kv_config.need_send_cache` | `true` | Whether to send KV cache | -## Deployment Topologies +------ -### Two-Stage (Default) +**Stage 1 - DiT (Diffusion Stage)** -The default topology auto-detected from the model. 
No extra flags needed. +| Parameter | Value | Description | +| :------------------------------- | :---------- | :-------------------------- | +| `stage_type` | `diffusion` | Stage type | +| `devices` | `"0"` | GPU device ID | +| `max_num_seqs` | `1` | Maximum batch size | +| `model_stage` | `dit` | Model stage identifier | +| `gpu_memory_utilization` | `0.4` | GPU memory utilization | +| `omni_kv_config.need_recv_cache` | `true` | Whether to receive KV cache | +| `engine_input_source` | `[0]` | Input source from Stage 0 | -```bash -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" -``` +------ -The pipeline is defined in [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml). Stage 0 (Thinker) and Stage 1 (DiT) share GPU 0 by default. For dual-GPU setups, customize the deploy YAML and set `devices: "1"` for stage 1. +#### Tensor Parallelism (TP) -### Single-Stage +For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) by modifying the stage configuration (e.g., [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml)). -Pass the single-stage deploy config via `--deploy-config`: +1. **Set `tensor_parallel_size`**: Increase this value (e.g., to `2` or `4`). +2. **Set `devices`**: Specify the comma-separated GPU IDs to be used for the stage (e.g., `"0,1"`). -```bash -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" \ - --deploy-config vllm_omni/deploy/bagel_single_stage.yaml +Example configuration for TP=2 on GPUs 0 and 1: +```yaml + engine_args: + tensor_parallel_size: 2 + ... + runtime: + devices: "0,1" ``` -See [`bagel_single_stage.yaml`](../../../vllm_omni/deploy/bagel_single_stage.yaml) for configuration details. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. 
+------ -### Tensor Parallelism (TP) +#### 🔗 Runtime Configuration -For larger models or multi-GPU environments, customize the deploy YAML (see [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml)) and set per-stage `tensor_parallel_size` and `devices`: +| Parameter | Value | Description | +| :-------------------- | :------ | :------------------------------- | +| `window_size` | `-1` | Window size (-1 means unlimited) | +| `max_inflight` | `1` | Maximum inflight requests | +| `shm_threshold_bytes` | `65536` | Shared memory threshold (64KB) | -```yaml -# Example: TP=2 on GPUs 0,1 for the Thinker stage -stages: - - stage_id: 0 - tensor_parallel_size: 2 - devices: "0,1" +## Using Mooncake Connector + +[Mooncake](https://github.com/kvcache-ai/Mooncake) is a high-performance distributed KV cache transfer engine that enables efficient cross-node data movement via TCP or RDMA, making it ideal for multi-node disaggregated inference. + +By default, BAGEL uses `SharedMemoryConnector` for inter-stage communication. You can switch to the Mooncake connector for better performance on multi-GPU setups and to enable multi-node deployment. + +### Prerequisites + +Install the Mooncake transfer engine: + +```bash +# For CUDA-enabled systems (recommended) +pip install mooncake-transfer-engine + +# For non-CUDA systems +pip install mooncake-transfer-engine-non-cuda ``` -Then pass the custom deploy YAML: +### Step 1: Start the Mooncake Master + +On the **primary node**, start the Mooncake master service (run in a separate terminal or background with `&`): ```bash -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2img \ - --prompts "A cute cat" \ - --deploy-config /path/to/custom_bagel.yaml +# Optional: enable disk-backed storage by creating a directory and passing --root_fs_dir. +# Without it, Mooncake runs in memory-only mode, which is sufficient for KV cache transfer. 
+mkdir -p ./mc_storage + +mooncake_master \ + --rpc_port=50051 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=8080 \ + --metrics_port=9003 \ + --root_fs_dir=./mc_storage/ \ + --cluster_id=mc-local-1 & ``` -### FP8 Quantization +### Step 2: Run Offline Inference with Mooncake + +Use the provided Mooncake stage config [`bagel_multiconnector.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml). Before launching, update the `metadata_server` and `master` addresses in the YAML to match your Mooncake master node's IP (use `127.0.0.1` for single-node testing). ```bash +cd examples/offline_inference/bagel + +# Text to Image with Mooncake python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --quantization fp8 + --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml + +# Image to Text with Mooncake +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality img2text \ + --image-path /path/to/image.jpg \ + --prompts "Describe this image" \ + --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml + +# Text to Text with Mooncake +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2text \ + --prompts "What is the capital of France?" 
\ + --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml ``` -## Command Line Reference - -### Core Arguments - -| Argument | Type | Default | Description | -| :------- | :--- | :------ | :---------- | -| `--model` | string | `ByteDance-Seed/BAGEL-7B-MoT` | Model path or HuggingFace name | -| `--modality` | choice | `text2img` | `text2img`, `img2img`, `img2text`, `text2text` | -| `--prompts` | list | `None` | Input text prompts | -| `--txt-prompts` | string | `None` | Path to text file with one prompt per line | -| `--image-path` | string | `None` | Input image path (required for `img2img`/`img2text`) | -| `--output` | string | `.` | Output directory for saved images | -| `--steps` | int | `50` | Number of diffusion inference steps | -| `--seed` | int | `None` | Random seed for reproducibility | - -### Think Mode Arguments - -| Argument | Type | Default | Description | -| :------- | :--- | :------ | :---------- | -| `--think` | flag | `False` | Enable `...` planning/reasoning | -| `--max-think-tokens` | int | `1000` | Maximum tokens for think generation | -| `--do-sample` | flag | `False` | Use sampling instead of greedy decoding | -| `--text-temperature` | float | `0.3` | Sampling temperature for text generation | - -### CFG Arguments - -| Argument | Type | Default | Description | -| :------- | :--- | :------ | :---------- | -| `--cfg-text-scale` | float | `4.0` | Text CFG guidance scale | -| `--cfg-img-scale` | float | `1.5` | Image CFG guidance scale | -| `--negative-prompt` | string | `None` | Negative prompt for CFG | -| `--cfg-parallel-size` | int | `1` | CFG parallel GPU count (1, 2, or 3) | -| `--cfg-interval` | float[2] | pipeline default | CFG active window `[start, end]` | -| `--cfg-renorm-type` | string | `None` | `global`, `text_channel`, or `channel` | -| `--cfg-renorm-min` | float | `None` | Minimum renormalization value | - -### Engine Arguments - -| Argument | Type | Default | Description | -| :------- | 
:--- | :------ | :---------- | -| `--deploy-config` | string | `None` | Path to deploy YAML (auto-detected if omitted) | -| `--worker-backend` | choice | `process` | `process` or `ray` | -| `--ray-address` | string | `None` | Ray cluster address | -| `--quantization` | string | `None` | Quantization method (e.g. `fp8`) | -| `--log-stats` | flag | `False` | Enable statistics logging | -| `--init-timeout` | int | `300` | Initialization timeout (seconds) | -| `--batch-timeout` | int | `5` | Batch timeout (seconds) | -| `--enable-diffusion-pipeline-profiler` | flag | `False` | Profile diffusion stage durations | +For more details on the Mooncake connector and multi-node setup, see the [Mooncake Store Connector documentation](../../../docs/design/feature/omni_connectors/mooncake_store_connector.md). + +------ ## FAQ -- If you encounter OOM errors, try decreasing `max_model_len` or `gpu_memory_utilization` in the deploy YAML. +- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. -**Two-stage VRAM usage:** +```bash +sudo apt update +sudo apt install ffmpeg +``` -| Stage | VRAM | -| :---- | :--- | -| Stage 0 (Thinker) | **15.04 GiB + KV Cache** | -| Stage 1 (DiT) | **26.50 GiB** | -| Total | **~42 GiB + KV Cache** | +- If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. -**Single-stage VRAM usage:** The DiT loads the full model (~42 GiB) in one process. 
+| Stage | VRAM | +| :------------------ | :--------------------------- | +| Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** | +| Stage-1 (DiT) | **26.50 GiB** | +| Total | **~42 GiB + KV Cache** | diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index a6ce1f1314f..2153a31ba70 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -2,10 +2,7 @@ import os from vllm_omni.inputs.data import OmniPromptType -from vllm_omni.model_executor.stage_input_processors.bagel import ( - GEN_THINK_SYSTEM_PROMPT, - VLM_THINK_SYSTEM_PROMPT, -) +from vllm_omni.model_executor.stage_input_processors.bagel import GEN_THINK_SYSTEM_PROMPT def parse_args(): @@ -53,12 +50,7 @@ def parse_args(): parser.add_argument("--shm-threshold-bytes", type=int, default=65536) parser.add_argument("--worker-backend", type=str, default="process", choices=["process", "ray"]) parser.add_argument("--ray-address", type=str, default=None) - parser.add_argument( - "--deploy-config", - type=str, - default=None, - help="Path to deploy YAML. If unset, auto-loads vllm_omni/deploy/bagel.yaml based on the HF model_type.", - ) + parser.add_argument("--stage-configs-path", type=str, default=None) parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.") parser.add_argument("--cfg-text-scale", type=float, default=4.0, help="Text CFG scale (default: 4.0)") @@ -102,28 +94,7 @@ def parse_args(): default=False, help="Enable thinking mode: AR stage decodes ... 
planning tokens before image generation.", ) - parser.add_argument( - "--max-think-tokens", - type=int, - default=1000, - help="Maximum number of tokens for thinking text generation (default: 1000).", - ) - parser.add_argument( - "--do-sample", - action="store_true", - default=False, - help="Enable sampling for text generation (default: greedy).", - ) - parser.add_argument( - "--text-temperature", - type=float, - default=0.3, - help="Temperature for text generation sampling (default: 0.3).", - ) - - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - nullify_stage_engine_defaults(parser) args = parser.parse_args() return args @@ -134,6 +105,7 @@ def main(): model_name = args.model prompts: list[OmniPromptType] = [] try: + # Preferred: load from txt file (one prompt per line) if getattr(args, "txt_prompts", None) and args.prompt_type == "text": with open(args.txt_prompts, encoding="utf-8") as f: lines = [ln.strip() for ln in f.readlines()] @@ -146,20 +118,22 @@ def main(): raise if not prompts: + # Default prompt for text2img test if none provided prompts = ["A cute cat"] print(f"[Info] No prompts provided, using default: {prompts}") + omni_outputs = [] from PIL import Image from vllm_omni.entrypoints.omni import Omni omni_kwargs = {} - deploy_config = args.deploy_config - if args.think and deploy_config is None: - deploy_config = "vllm_omni/deploy/bagel_think.yaml" - print(f"[Info] Think mode enabled, using deploy config: {deploy_config}") - if deploy_config: - omni_kwargs["deploy_config"] = deploy_config + stage_configs_path = args.stage_configs_path + if args.think and stage_configs_path is None: + stage_configs_path = "vllm_omni/model_executor/stage_configs/bagel_think.yaml" + print(f"[Info] Think mode enabled, using stage config: {stage_configs_path}") + if stage_configs_path: + omni_kwargs["stage_configs_path"] = stage_configs_path omni_kwargs.update( { @@ -176,7 +150,7 @@ def main(): if args.quantization: omni_kwargs["quantization_config"] 
= args.quantization - omni = Omni.from_cli_args(args, model=model_name, **omni_kwargs) + omni = Omni(model=model_name, **omni_kwargs) formatted_prompts = [] for p in prompts: @@ -197,10 +171,7 @@ def main(): elif args.modality == "img2text": if args.image_path: loaded_image = Image.open(args.image_path).convert("RGB") - think_prefix = f"<|im_start|>system\n{VLM_THINK_SYSTEM_PROMPT}<|im_end|>\n" if args.think else "" - final_prompt_text = ( - f"{think_prefix}<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n" - ) + final_prompt_text = f"<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n" prompt_dict = { "prompt": final_prompt_text, "multi_modal_data": {"image": loaded_image}, @@ -208,8 +179,7 @@ def main(): } formatted_prompts.append(prompt_dict) elif args.modality == "text2text": - think_prefix = f"<|im_start|>{VLM_THINK_SYSTEM_PROMPT}<|im_end|>" if args.think else "" - final_prompt_text = f"{think_prefix}<|im_start|>{p}<|im_end|><|im_start|>" + final_prompt_text = f"<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n" prompt_dict = {"prompt": final_prompt_text, "modalities": ["text"]} formatted_prompts.append(prompt_dict) else: @@ -221,63 +191,44 @@ def main(): formatted_prompts.append(prompt_dict) params_list = omni.default_sampling_params_list - # Bagel exposes 1 sampling param set for single-stage (DiT-only) and - # 2 for two-stage (Thinker + DiT). This heuristic may need updating - # if future pipelines break that 1:1 mapping. 
- is_single_stage = len(params_list) == 1 - - diffusion_params_idx = 0 if is_single_stage else (1 if len(params_list) > 1 else 0) - diffusion_params = params_list[diffusion_params_idx] - if args.modality in ("text2img", "img2img"): - diffusion_params.num_inference_steps = args.steps # type: ignore - diffusion_params.cfg_parallel_size = args.cfg_parallel_size # type: ignore - if args.seed is not None: - diffusion_params.seed = args.seed # type: ignore - - extra = getattr(diffusion_params, "extra_args", {}) or {} - extra["cfg_text_scale"] = args.cfg_text_scale - extra["cfg_img_scale"] = args.cfg_img_scale - if args.cfg_interval is not None: - extra["cfg_interval"] = tuple(args.cfg_interval) - if args.cfg_renorm_type is not None: - extra["cfg_renorm_type"] = args.cfg_renorm_type - if args.cfg_renorm_min is not None: - extra["cfg_renorm_min"] = args.cfg_renorm_min - if args.negative_prompt is not None: - extra["negative_prompt"] = args.negative_prompt - - needs_text_gen = is_single_stage and (args.think or args.modality in ("text2text", "img2text")) - if needs_text_gen: - if args.think: - extra["think"] = True - extra["max_think_tokens"] = args.max_think_tokens - extra["do_sample"] = args.do_sample - extra["text_temperature"] = args.text_temperature - diffusion_params.extra_args = extra # type: ignore + if len(params_list) > 1: + diffusion_params = params_list[1] + diffusion_params.num_inference_steps = args.steps # type: ignore + diffusion_params.cfg_parallel_size = args.cfg_parallel_size # type: ignore + if args.seed is not None: + diffusion_params.seed = args.seed # type: ignore + extra = { + "cfg_text_scale": args.cfg_text_scale, + "cfg_img_scale": args.cfg_img_scale, + } + if args.cfg_interval is not None: + extra["cfg_interval"] = tuple(args.cfg_interval) + if args.cfg_renorm_type is not None: + extra["cfg_renorm_type"] = args.cfg_renorm_type + if args.cfg_renorm_min is not None: + extra["cfg_renorm_min"] = args.cfg_renorm_min + if args.negative_prompt is not 
None: + extra["negative_prompt"] = args.negative_prompt + diffusion_params.extra_args = extra # type: ignore omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list)) img_idx = 0 for req_output in omni_outputs: - # 2-stage think mode: text output from thinker stage - ro = getattr(req_output, "request_output", None) - if ro and getattr(ro, "outputs", None): - txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs) - if txt: - if args.think: - print(f"[Think]\n{txt}") - else: - print(f"[Output] Text:\n{txt}") - - # Single-stage DiT: text from custom_output - custom = getattr(req_output, "_custom_output", {}) or {} - if custom.get("think_text"): - print(f"[Think]\n{custom['think_text']}") - if custom.get("text_output"): - print(f"[Output] Text:\n{custom['text_output']}") + if args.think: + text_output = getattr(req_output, "text", None) or getattr(req_output, "outputs", None) + if text_output: + if isinstance(text_output, list) and text_output: + for out in text_output: + txt = getattr(out, "text", str(out)) + if txt: + print(f"[Think] {txt}") + elif isinstance(text_output, str): + print(f"[Think] {text_output}") images = getattr(req_output, "images", None) + if not images: continue @@ -287,6 +238,8 @@ def main(): print(f"[Output] Saved image to {save_path}") img_idx += 1 + print(omni_outputs) + if __name__ == "__main__": main() diff --git a/examples/offline_inference/cosyvoice3/README.md b/examples/offline_inference/cosyvoice3/README.md index 704b49614fb..895d3f660f0 100644 --- a/examples/offline_inference/cosyvoice3/README.md +++ b/examples/offline_inference/cosyvoice3/README.md @@ -7,7 +7,7 @@ Install dependencies: uv pip install -e . ``` -> **Note:** This includes required libraries such as `soundfile`, +> **Note:** This includes required libraries such as `librosa`, `soundfile`, > `onnxruntime`, `x-transformers`, and `einops` via > `requirements/common.txt` and platform-specific requirements files. 
@@ -58,14 +58,7 @@ Key components live in `vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py - Stage 0 uses `CosyVoice3LM` and outputs speech tokens + conditioning features. - Stage 1 runs the flow model (DiT-based CFM) and HiFiGAN to synthesize waveform. -Pipeline topology lives in `vllm_omni/model_executor/models/cosyvoice3/pipeline.py`; -runtime tunables (batch size, memory limits, sampling) live in -`vllm_omni/deploy/cosyvoice3.yaml`. The deploy config auto-loads by -HF `model_type` and defaults to `async_chunk: true` (shared-memory -streaming). Pass `--no-async-chunk` on `vllm serve` to switch to the -legacy sync path where stage 1 runs `text2flow` over the full -speech-token sequence. +Stage wiring is configured in `vllm_omni/model_executor/stage_configs/cosyvoice3.yaml`. - Stage 0 emits latent speech tokens. -- Stage 1 consumes them via `sync_process_input_func` (sync mode) or the - shared-memory connector (async-chunk mode) and outputs audio. +- Stage 1 consumes them via `custom_process_input_func` and outputs audio. 
diff --git a/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py b/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py index a5dc564ec3b..68ab72b3870 100644 --- a/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py +++ b/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py @@ -2,12 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os +from pathlib import Path +import librosa import numpy as np import soundfile as sf from vllm import SamplingParams from vllm.assets.audio import AudioAsset -from vllm.multimodal.media.audio import load_audio from vllm_omni.entrypoints.omni import Omni from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config @@ -15,6 +16,22 @@ from vllm_omni.model_executor.models.cosyvoice3.utils import extract_text_token +def _ensure_mel_filters_asset() -> None: + repo_root = Path(__file__).resolve().parents[3] + filters_path = repo_root / "vllm_omni" / "model_executor" / "models" / "cosyvoice3" / "assets" / "mel_filters.npz" + if filters_path.exists(): + return + + source_url = "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz" + raise FileNotFoundError( + "Missing CosyVoice3 mel filter asset:\n" + f" {filters_path}\n" + "Download it with:\n" + f" mkdir -p {filters_path.parent} && " + f"curl -L {source_url} -o {filters_path}" + ) + + def run_e2e(): parser = argparse.ArgumentParser() # ""FunAudioLLM/Fun-CosyVoice3-0.5B-2512 @@ -24,13 +41,7 @@ def run_e2e(): required=True, help="Path to CosyVoice3 model directory (e.g., pretrained_models/Fun-CosyVoice3-0.5B/).", ) - parser.add_argument( - "--deploy-config", - type=str, - default=None, - help="Override the deploy config path. 
If unset, auto-loads " - "vllm_omni/deploy/cosyvoice3.yaml based on the HF model_type.", - ) + parser.add_argument("--stage-config", type=str, default="vllm_omni/model_executor/stage_configs/cosyvoice3.yaml") parser.add_argument("--prompt", type=str, default="Hello, this is a test of the CosyVoice system capability.") parser.add_argument( "--prompt-text", @@ -45,18 +56,24 @@ def run_e2e(): help="Path to tokenizer directory (e.g., /CosyVoice-BlankEN).", ) args = parser.parse_args() + _ensure_mel_filters_asset() # Ensure tokenizer directory exists if not os.path.exists(args.tokenizer): raise FileNotFoundError(f"{args.tokenizer} does not exist!") - if args.deploy_config is not None and not os.path.exists(args.deploy_config): - raise FileNotFoundError(f"{args.deploy_config} does not exist!") + # Ensure stage config exists + if not os.path.exists(args.stage_config): + raise FileNotFoundError(f"{args.stage_config} does not exist!") print(f"Initializing cosyvoice E2E with model={args.model}") + # Initialize Omni + # This spins up the engine(s) based on the stage config + # We pass trust_remote_code=True same as Qwen examples omni = Omni( model=args.model, - deploy_config=args.deploy_config, + stage_configs_path=args.stage_config, + trust_remote_code=True, tokenizer=args.tokenizer, log_stats=True, ) @@ -68,7 +85,7 @@ def run_e2e(): if not os.path.exists(args.audio_path): raise FileNotFoundError(f"Audio file not found: {args.audio_path}") # Load at native sample rate - audio_signal, sr = load_audio(args.audio_path, sr=None) + audio_signal, sr = librosa.load(args.audio_path, sr=None) # Validate sample rate before processing (similar to original CosyVoice) min_sr = 16000 diff --git a/examples/offline_inference/custom_pipeline/image_to_image/image_edit.py b/examples/offline_inference/custom_pipeline/image_to_image/image_edit.py index dc1085c28ef..8ab5e0d9a6c 100644 --- a/examples/offline_inference/custom_pipeline/image_to_image/image_edit.py +++ 
b/examples/offline_inference/custom_pipeline/image_to_image/image_edit.py @@ -44,11 +44,9 @@ import argparse import asyncio -import json import os import time from pathlib import Path -from typing import Any import torch from PIL import Image @@ -60,16 +58,6 @@ from vllm_omni.platforms import current_omni_platform -def parse_profiler_config(value: str) -> dict[str, Any]: - try: - config = json.loads(value) - except json.JSONDecodeError as e: - raise argparse.ArgumentTypeError(f"--profiler-config must be valid JSON: {e}") from e - if not isinstance(config, dict): - raise argparse.ArgumentTypeError("--profiler-config must be a JSON object") - return config - - # =========================== # Argument Parser # =========================== @@ -111,16 +99,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--vae-use-slicing", action="store_true") parser.add_argument("--vae-use-tiling", action="store_true") parser.add_argument("--enable-cpu-offload", action="store_true") - parser.add_argument( - "--profiler-config", - type=parse_profiler_config, - default=None, - help='JSON profiler config for torch/cuda profiling, e.g. 
\'{"profiler":"torch","torch_profiler_dir":"./perf"}\'.', - ) - - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - nullify_stage_engine_defaults(parser) return parser.parse_args() @@ -179,13 +158,12 @@ async def main(): enable_cpu_offload=args.enable_cpu_offload, diffusion_load_format="dummy", custom_pipeline_args={"pipeline_class": "custom_pipeline.CustomPipeline"}, - profiler_config=args.profiler_config, ) print(">>> Pipeline loaded successfully") # ---- Profiling + Info ---- - profiler_enabled = args.profiler_config is not None + profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) print(f"\n{'=' * 60}") print("Generation Configuration") print(f"Model: {args.model}") diff --git a/examples/offline_inference/dynin_omni/README.md b/examples/offline_inference/dynin_omni/README.md deleted file mode 100644 index d28b360714e..00000000000 --- a/examples/offline_inference/dynin_omni/README.md +++ /dev/null @@ -1,110 +0,0 @@ -# Dynin-Omni Offline End2End Example - -This folder contains a unified offline inference entrypoint: - -- `end2end.py` - -## 1. Environment Setup - -Run from repository root: - -```bash -cd -``` - -If needed, install this repo in editable mode: - -```bash -pip install -e . -``` - -## 2. Extra Dependencies (EMOVA) - -Install the following packages for EMOVA-related components: - -```bash -pip install \ - "phonemizer==3.3.0" \ - "Unidecode==1.4.0" \ - "hydra-core==1.3.2" \ - "pytorch-lightning==1.1.0" \ - "wget==3.2" \ - "wrapt==2.1.1" \ - "onnx==1.20.1" \ - "frozendict==2.4.7" \ - "inflect==7.5.0" \ - "braceexpand==0.1.7" \ - "webdataset==1.0.2" \ - "torch-stft==0.1.4" \ - "editdistance==0.8.1" -``` - -## 3. 
Hardware and VRAM Requirements - -This example uses a 3-stage pipeline on one GPU by default -([`dynin_omni.yaml`](../../../vllm_omni/model_executor/stage_configs/dynin_omni.yaml)): - -- Stage-0 (`token2text`): `gpu_memory_utilization: 0.5` -- Stage-1 (`token2image`): `gpu_memory_utilization: 0.1` -- Stage-2 (`token2audio`): `gpu_memory_utilization: 0.1` - -### Requested GPU Memory Budget from `gpu_memory_utilization` - -| Stage | Utilization | A100 80GB | H200 141GB | -| :-- | :-- | :-- | :-- | -| Stage-0 (token2text) | 0.5 | ~40.0 GB | ~70.5 GB | -| Stage-1 (token2image) | 0.1 | ~8.0 GB | ~14.1 GB | -| Stage-2 (token2audio) | 0.1 | ~8.0 GB | ~14.1 GB | -| Total requested budget | 0.7 | ~56.0 GB | ~98.7 GB | - -### Observed Runtime Signal (from your log) - -- Stage-0 reported: `Model loading took 15.12 GiB memory` (weights footprint signal). -- Stages 1/2 can still add runtime memory depending on task path and backend allocations. -- Keep extra headroom for CUDA/PyTorch overhead and temporary allocations. - -### GPU Compatibility - -- Confirmed target GPUs for this setup: **NVIDIA H200**, **NVIDIA A100**. -- CI/e2e coverage in this repo also includes CUDA **L4** markers for Dynin tests. - -## 4. End2End Run Examples - -```bash -# t2t -python /examples/offline_inference/dynin_omni/end2end.py \ - --task t2t --model snu-aidas/Dynin-Omni --text - -# i2t -python /examples/offline_inference/dynin_omni/end2end.py \ - --task i2t --model snu-aidas/Dynin-Omni --image --text "Please describe this image in detail." - -# s2t -python /examples/offline_inference/dynin_omni/end2end.py \ - --task s2t --model snu-aidas/Dynin-Omni --audio --text "Transcribe the given audio." - -# t2i -python /examples/offline_inference/dynin_omni/end2end.py \ - --task t2i --model snu-aidas/Dynin-Omni --text - -# v2t -python /examples/offline_inference/dynin_omni/end2end.py \ - --task v2t --model snu-aidas/Dynin-Omni --video --text "Describe this video in detail." 
- -# i2i -python /examples/offline_inference/dynin_omni/end2end.py \ - --task i2i --model snu-aidas/Dynin-Omni --image --text - -# t2s -python /examples/offline_inference/dynin_omni/end2end.py \ - --task t2s --model snu-aidas/Dynin-Omni --text -``` - -## 5. Notes - -- Outputs are saved under task-specific directories in `/tmp` by default. -- You can override output path with `--output-dir`. -- If you want to force local config resolution, pass `--dynin-config-path `. -- If you see the warning - `max_num_batched_tokens (32768) exceeds max_num_seqs * max_model_len (4096)`, - reduce `max_num_batched_tokens` in stage config (for example, `4096` in CI config). diff --git a/examples/offline_inference/dynin_omni/end2end.py b/examples/offline_inference/dynin_omni/end2end.py deleted file mode 100644 index 82cff0c0015..00000000000 --- a/examples/offline_inference/dynin_omni/end2end.py +++ /dev/null @@ -1,1451 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import argparse -import json -import os -import re -import sys -import time -import types -from importlib.machinery import ModuleSpec -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from PIL import Image - -TASK_CHOICES = ("t2t", "t2i", "t2s", "i2i", "i2t", "s2t", "v2t") - -TASK_DEFAULT_RUNTIME = { - "t2t": ("mmu", "mmu", 0, "text"), - "t2i": ("t2i", "t2i_gen", 2, "image"), - "t2s": ("t2s_mmu_like", "t2s_gen", 1, "audio"), - "i2i": ("i2i", "i2i", 2, "image"), - "i2t": ("mmu", "mmu", 0, "text"), - "s2t": ("s2t", "s2t", 0, "text"), - "v2t": ("v2t", "v2t", 0, "text"), -} - -TASK_RUNTIME_FALLBACKS: dict[str, dict[str, Any]] = { - "t2t": { - "output_dir": "/tmp/dynin_end2end_outputs", - "prompt_max_text_len": 1024, - "max_new_tokens": 1024, - "steps": 1024, - "block_length": 16, - "temperature": 0.0, - "cfg_scale": 0.0, - }, - "t2i": { - "output_dir": "/tmp/dynin_t2i_outputs", - "prompt_max_text_len": 128, - 
"image_token_count": 1024, - "mask_token_id": 126336, - "codebook_size": 8192, - "timesteps": 20, - "guidance_scale": 3.5, - "temperature": 1.0, - }, - "i2i": { - "output_dir": "/tmp/dynin_i2i_outputs", - "prompt_max_text_len": 128, - "mask_token_id": 126336, - "codebook_size": 8192, - "timesteps": 64, - "guidance_scale": 3.5, - "temperature": 1.0, - "image_resolution": 336, - "use_train_i2i_prompt": True, - }, - "i2t": { - "output_dir": "/tmp/dynin_i2t_outputs", - "prompt_max_text_len": 128, - "max_new_tokens": 128, - "steps": 128, - "block_length": 2, - "temperature": 0.0, - "cfg_scale": 0.0, - "mask_token_id": 126336, - "codebook_size": 8192, - "image_resolution": 480, - "remasking": "low_confidence", - }, - "s2t": { - "output_dir": "/tmp/dynin_s2t_outputs", - "prompt_max_text_len": 1024, - "max_new_tokens": 128, - "steps": 128, - "block_length": 2, - "temperature": 0.0, - "cfg_scale": 0.0, - "mask_token_id": 126336, - "codebook_size": 8192, - "remasking": "low_confidence", - }, - "t2s": { - "output_dir": "/tmp/dynin_t2s_outputs", - "runtime_task": "t2s_mmu_like", - "prompting_task": "t2s_gen", - "prompt_max_text_len": 1024, - "t2s_token_length": 512, - "mask_token_id": 126336, - "codebook_size": 8192, - "audio_codebook_size": 4096, - "steps": 512, - "block_length": 128, - "temperature": 1.0, - "cfg_scale": 2.5, - "t2s_condition": "gender-female_emotion-neutral_speed-normal_pitch-normal", - }, - "v2t": { - "output_dir": "/tmp/dynin_v2t_outputs", - "prompt_max_text_len": 1024, - "max_new_tokens": 128, - "steps": 128, - "block_length": 2, - "temperature": 0.0, - "cfg_scale": 0.0, - "mask_token_id": 126336, - "codebook_size": 8192, - "image_resolution": 224, - "num_frames": 5, - "remasking": "low_confidence", - }, -} - -DEFAULT_I2T_QUESTION = "Please describe this image in detail." -DEFAULT_S2T_INSTRUCTION = "Transcribe the given audio." -DEFAULT_V2T_QUESTION = "Please provide a detailed description of the video." 
-DEFAULT_T2T_PROMPT = "Explain multimodal LLM inference in 3 sentences." -DEFAULT_T2S_INSTRUCTION = "Convert the given text into spoken audio." -DEFAULT_T2S_PROMPT = "Hello. This is a default text-to-speech sample." - -DYNIN_SPECIAL_TOKENS = ( - "<|soi|>", - "<|eoi|>", - "<|sov|>", - "<|eov|>", - "<|t2i|>", - "<|mmu|>", - "<|t2v|>", - "<|v2v|>", - "<|lvg|>", - "<|i2i|>", - "<|ti2ti|>", - "<|v2t|>", - "<|v2s|>", - "<|s2t|>", - "<|t2s|>", - "<|s2s|>", - "<|soa|>", - "<|eoa|>", -) - - -def bootstrap_repo_path() -> Path: - repo_root = Path(__file__).resolve().parents[3] - repo_root_str = str(repo_root) - if repo_root_str not in sys.path: - sys.path.insert(0, repo_root_str) - return repo_root - - -def ensure_safe_import_for_vllm() -> None: - os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1") - try: - import torchvision # noqa: F401 - - return - except Exception: - pass - - import enum - - class _InterpolationMode(enum.Enum): - NEAREST = 0 - BILINEAR = 2 - BICUBIC = 3 - LANCZOS = 1 - HAMMING = 4 - BOX = 5 - - tv_mod = types.ModuleType("torchvision") - tv_mod.__dict__["__version__"] = "0.0-stub" - tv_mod.__spec__ = ModuleSpec(name="torchvision", loader=None) - transforms_mod = types.ModuleType("torchvision.transforms") - transforms_mod.__spec__ = ModuleSpec(name="torchvision.transforms", loader=None) - transforms_mod.InterpolationMode = _InterpolationMode - tv_mod.transforms = transforms_mod - sys.modules["torchvision"] = tv_mod - sys.modules["torchvision.transforms"] = transforms_mod - - -def sanitize_repo_id(repo_id: str) -> str: - return re.sub(r"[^a-zA-Z0-9._-]+", "_", repo_id) - - -def is_hf_repo_id(value: str) -> bool: - return isinstance(value, str) and value.count("/") == 1 and all(value.split("/", 1)) - - -def ensure_local_model_dir(model: str, cache_dir: Path, localize: bool) -> Path: - model_path = Path(model).expanduser() - if model_path.is_dir(): - return model_path.resolve() - if not localize: - return Path(model) - - from huggingface_hub import 
snapshot_download - - cache_dir.mkdir(parents=True, exist_ok=True) - os.environ.setdefault("HF_HOME", str(cache_dir / ".hf_home")) - local_dir = cache_dir / sanitize_repo_id(model) - if not local_dir.exists(): - print(f"[end2end] Downloading model into local cache: {local_dir}") - snapshot_download( - repo_id=model, - local_dir=str(local_dir), - local_dir_use_symlinks=True, - resume_download=True, - ) - return local_dir.resolve() - - -def resolve_local_only( - override: bool | None, - source: str, - default: bool, -) -> bool: - if override is not None: - return bool(override) - return default or Path(source).expanduser().is_dir() - - -def load_text_tokenizer(tokenizer_source: str, local_files_only: bool): - from transformers import AutoTokenizer - - kwargs = { - "trust_remote_code": True, - "padding_side": "left", - "local_files_only": bool(local_files_only), - } - try: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, **kwargs) - except TypeError: - kwargs.pop("local_files_only", None) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, **kwargs) - return tokenizer - - -def preprocess_image(image: Image.Image, resolution: int) -> torch.Tensor: - w, h = image.size - short_side = min(w, h) - scale = resolution / short_side - new_w, new_h = round(w * scale), round(h * scale) - image = image.resize((new_w, new_h), Image.BICUBIC) - left = (new_w - resolution) // 2 - top = (new_h - resolution) // 2 - image = image.crop((left, top, left + resolution, top + resolution)) - arr = np.array(image, dtype=np.float32) / 255.0 - tensor = torch.from_numpy(arr).permute(2, 0, 1) - return (tensor - 0.5) / 0.5 - - -def load_vq_image_encoder(source: str, local_files_only: bool, device: torch.device) -> Any: - from vllm_omni.model_executor.models.dynin_omni.dynin_omni_common import get_dynin_magvit_attr - - MAGVITv2 = get_dynin_magvit_attr("MAGVITv2", source=source, local_files_only=local_files_only) - vq_model = MAGVITv2.from_pretrained(source, 
local_files_only=local_files_only).to(device) - vq_model.requires_grad_(False) - vq_model.eval() - return vq_model - - -def encode_image_tokens( - image_path: Path, - vq_model: Any, - device: torch.device, - resolution: int, -) -> torch.Tensor: - image = Image.open(image_path).convert("RGB") - image_tensor = preprocess_image(image, resolution=resolution).unsqueeze(0).to(device) - with torch.no_grad(): - token_ids = vq_model.get_code(image_tensor) - token_ids = torch.as_tensor(token_ids, dtype=torch.long).detach().cpu() - if token_ids.ndim == 2 and token_ids.shape[0] == 1: - token_ids = token_ids[0] - return token_ids.contiguous() - - -def encode_video_tokens( - video_path: Path, - vq_model: Any, - device: torch.device, - resolution: int, - num_frames: int, -) -> torch.Tensor: - import cv2 - - cap = cv2.VideoCapture(str(video_path)) - frames: list[np.ndarray] = [] - while True: - ok, frame = cap.read() - if not ok: - break - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frames.append(frame) - cap.release() - if not frames: - raise ValueError(f"Video has no readable frames: {video_path}") - if len(frames) < num_frames: - raise ValueError(f"Video has {len(frames)} frames, requires >= {num_frames}: {video_path}") - - indices = np.linspace(0, len(frames) - 1, num_frames).astype(int) - token_list: list[torch.Tensor] = [] - for idx in indices: - pil = Image.fromarray(frames[int(idx)]) - frame_tensor = preprocess_image(pil, resolution=resolution).unsqueeze(0).to(device) - with torch.no_grad(): - token_list.append(torch.as_tensor(vq_model.get_code(frame_tensor), dtype=torch.long)) - merged = torch.cat(token_list, dim=1).detach().cpu() - if merged.ndim == 2 and merged.shape[0] == 1: - merged = merged[0] - return merged.contiguous() - - -def load_vq_audio_encoder(source: str, local_files_only: bool, device: torch.device) -> Any: - from transformers import AutoModel - - kwargs = { - "trust_remote_code": True, - "local_files_only": bool(local_files_only), - 
"low_cpu_mem_usage": False, - } - try: - model = AutoModel.from_pretrained(source, **kwargs) - except TypeError: - kwargs.pop("low_cpu_mem_usage", None) - try: - model = AutoModel.from_pretrained(source, **kwargs) - except TypeError: - kwargs.pop("local_files_only", None) - model = AutoModel.from_pretrained(source, **kwargs) - model.requires_grad_(False) - model.eval() - if hasattr(model, "to"): - model = model.to(device) - return model - - -def encode_audio_tokens(audio_path: Path, vq_audio_model: Any) -> torch.Tensor: - encoded = vq_audio_model.encode(str(audio_path)) - if isinstance(encoded, dict): - for key in ("input_ids", "token_ids", "codes", "tokens"): - if key in encoded: - encoded = encoded[key] - break - encoded = torch.as_tensor(encoded, dtype=torch.long).detach().cpu() - if encoded.ndim == 1: - encoded = encoded.unsqueeze(0) - elif encoded.ndim > 2: - encoded = encoded.view(encoded.shape[0], -1) - return encoded.contiguous() - - -def build_chat_prompt(content: str) -> str: - return ( - f"<|start_header_id|>user<|end_header_id|>\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" - ) - - -def resolve_task_text( - *, - task_name: str, - text: str, - instruction: str = "", - raw_prompt: bool = False, -) -> str: - text = str(text or "").strip() - - if task_name == "t2t" and not text: - return DEFAULT_T2T_PROMPT - if task_name == "i2t" and not text: - return DEFAULT_I2T_QUESTION - if task_name == "s2t" and not text: - return DEFAULT_S2T_INSTRUCTION - if task_name == "v2t" and not text: - return DEFAULT_V2T_QUESTION - if task_name in {"t2i", "i2i"} and not text: - return "A high quality detailed image." 
- - if task_name != "t2s": - return text - - if not text: - text = DEFAULT_T2S_PROMPT - - if raw_prompt: - return text - - instruction = str(instruction or "").strip() or DEFAULT_T2S_INSTRUCTION - return build_chat_prompt(f"{instruction}\n{text}") - - -def load_universal_prompting( - *, - tokenizer: Any, - tokenizer_source: str, - max_text_len: int, - cond_dropout_prob: float, - local_files_only: bool, - max_audio_len: int = 512, - max_audio_len_short: int = 256, -) -> Any: - from vllm_omni.model_executor.models.dynin_omni.dynin_omni_common import ( - DYNIN_REMOTE_SETTINGS, - resolve_remote_attr, - ) - - UniversalPrompting = resolve_remote_attr( - "UniversalPrompting", - module_name="prompting_utils", - settings=DYNIN_REMOTE_SETTINGS, - source=tokenizer_source, - local_files_only=bool(local_files_only), - fallback_module_names=("modeling_dynin_omni",), - ) - init_kwargs: dict[str, Any] = { - "max_text_len": int(max_text_len), - "special_tokens": DYNIN_SPECIAL_TOKENS, - "ignore_id": -100, - "cond_dropout_prob": float(cond_dropout_prob), - "use_reserved_token": True, - "max_audio_len": int(max_audio_len), - "max_audio_len_short": int(max_audio_len_short), - } - try: - return UniversalPrompting(tokenizer, **init_kwargs) - except TypeError: - init_kwargs.pop("max_audio_len", None) - init_kwargs.pop("max_audio_len_short", None) - return UniversalPrompting(tokenizer, **init_kwargs) - - -def _runtime_fallback(task: str, key: str, value: Any) -> Any: - if isinstance(value, str): - if value.strip() != "": - return value - elif value is not None: - return value - return TASK_RUNTIME_FALLBACKS.get(task, {}).get(key) - - -def _validate_generation_args(*, task: str, max_new_tokens: int, steps: int, block_length: int) -> None: - # Keep i2t/v2t generation constraints aligned with i2t.py/v2t.py. 
- if task not in {"i2t", "v2t"}: - return - if max_new_tokens <= 0: - raise ValueError(f"{task} requires max_new_tokens > 0.") - if block_length <= 0: - raise ValueError(f"{task} requires block_length > 0.") - if steps <= 0: - raise ValueError(f"{task} requires steps > 0.") - if max_new_tokens % block_length != 0: - raise ValueError(f"{task} requires max_new_tokens % block_length == 0, got {max_new_tokens} % {block_length}") - num_blocks = max_new_tokens // block_length - if num_blocks <= 0: - raise ValueError(f"{task} has invalid num_blocks.") - if steps % num_blocks != 0: - raise ValueError( - f"{task} requires steps % (max_new_tokens // block_length) == 0, " - f"got steps={steps}, max_new_tokens={max_new_tokens}, block_length={block_length}" - ) - - -def make_prompt_payload( - *, - task: str, - text: str, - image_tokens: torch.Tensor | None, - audio_tokens: torch.Tensor | None, - video_tokens: torch.Tensor | None, - image_placeholder_tokens: int, - audio_placeholder_tokens: int, - image_token_offset: int, - speech_token_offset: int, - mask_token_id: int, - use_train_i2i_prompt: bool, -) -> tuple[Any, str]: - runtime_task, prompting_task, _, _ = TASK_DEFAULT_RUNTIME[task] - del runtime_task - - if task == "t2t": - payload = ([[]], [build_chat_prompt(text)]) - return payload, prompting_task - - if task == "i2t": - if image_tokens is None: - raise ValueError("i2t requires image tokens") - img = image_tokens.view(-1).long() + int(image_token_offset) - payload = ([[img]], [build_chat_prompt(text)]) - return payload, prompting_task - - if task == "s2t": - if audio_tokens is None: - raise ValueError("s2t requires audio tokens") - aud = audio_tokens.long() + int(speech_token_offset) - if aud.ndim == 1: - aud = aud.unsqueeze(0) - payload = ([aud], [build_chat_prompt(text)]) - return payload, prompting_task - - if task == "v2t": - if video_tokens is None: - raise ValueError("v2t requires video tokens") - vid = video_tokens.view(-1).long() + int(image_token_offset) - 
payload = (vid.unsqueeze(0), [build_chat_prompt(text)]) - return payload, prompting_task - - if task == "t2i": - image_placeholder = torch.full( - (1, int(image_placeholder_tokens)), - fill_value=int(mask_token_id), - dtype=torch.long, - ) - payload = ([text], image_placeholder) - return payload, prompting_task - - if task == "i2i": - if image_tokens is None: - raise ValueError("i2i requires image tokens") - src = image_tokens.view(1, -1).long() + int(image_token_offset) - target_len = int(image_placeholder_tokens) if image_placeholder_tokens > 0 else int(src.shape[1]) - image_placeholder = torch.full( - (1, target_len), - fill_value=int(mask_token_id), - dtype=torch.long, - ) - if use_train_i2i_prompt: - labels_placeholder = torch.full( - (1, target_len), - fill_value=-100, - dtype=torch.long, - ) - payload = ([text], src, image_placeholder, labels_placeholder) - return payload, "i2i" - payload = ([text], src, image_placeholder) - return payload, "i2i_gen" - - if task == "t2s": - audio_placeholder = torch.full( - (1, int(audio_placeholder_tokens)), - fill_value=int(mask_token_id), - dtype=torch.long, - ) - payload = ([text], audio_placeholder) - return payload, prompting_task - - raise ValueError(f"Unsupported task: {task}") - - -def _to_1d_int_list(value: Any) -> list[int]: - if value is None: - return [] - if isinstance(value, torch.Tensor): - tensor = value.detach().to(device="cpu", dtype=torch.long) - else: - tensor = torch.as_tensor(value, dtype=torch.long) - if tensor.ndim == 0: - tensor = tensor.view(1) - elif tensor.ndim >= 2: - tensor = tensor.view(tensor.shape[0], -1)[0] - return [int(v) for v in tensor.tolist()] - - -def _run_uni_prompting(uni_prompting: Any, payload: Any, prompting_task: str) -> tuple[list[int], list[int]]: - prepared = uni_prompting(payload, prompting_task) - if isinstance(prepared, tuple): - prepared_input_ids = prepared[0] if len(prepared) > 0 else None - prepared_attention_mask = prepared[1] if len(prepared) > 1 else None - else: - 
prepared_input_ids = prepared - prepared_attention_mask = None - - input_ids = _to_1d_int_list(prepared_input_ids) - attention_mask = _to_1d_int_list(prepared_attention_mask) - if not input_ids: - raise RuntimeError(f"UniversalPrompting returned empty input_ids for task={prompting_task}") - return input_ids, attention_mask - - -def _get_special_token_id(uni_prompting: Any, token: str) -> int: - sptids = getattr(uni_prompting, "sptids_dict", None) or {} - if token not in sptids: - raise KeyError(f"Special token not found in UniversalPrompting.sptids_dict: {token}") - token_ids = _to_1d_int_list(sptids[token]) - if not token_ids: - raise ValueError(f"Special token id is empty for token: {token}") - return int(token_ids[0]) - - -def _tokenize_chat_query(tokenizer: Any, text: str) -> list[int]: - encoded = tokenizer(build_chat_prompt(text), return_tensors="pt").input_ids[0] - token_ids = _to_1d_int_list(encoded) - if not token_ids: - raise RuntimeError("Failed to tokenize chat query text.") - return token_ids - - -def _flatten_media_token_ids_with_offset(token_ids: Any, token_offset: int) -> list[int]: - media_ids = token_ids - if isinstance(media_ids, torch.Tensor): - media_ids = media_ids.detach().cpu().reshape(-1).tolist() - else: - media_ids = np.asarray(media_ids).reshape(-1).tolist() - return [int(x) + int(token_offset) for x in media_ids] - - -def _scalar_token_id(value: Any) -> int: - if isinstance(value, torch.Tensor): - if value.numel() == 0: - raise ValueError("Empty special-token tensor.") - return int(value.view(-1)[0].item()) - if isinstance(value, (list, tuple)): - if not value: - raise ValueError("Empty special-token list.") - return int(value[0]) - return int(value) - - -def build_v2t_input_ids( - *, - video_token_ids: Any, - tokenizer: Any, - uni_prompting: Any, - question: str, - image_token_offset: int, -) -> tuple[list[int], str]: - media_ids = video_token_ids - if isinstance(media_ids, torch.Tensor): - media_ids = 
media_ids.detach().cpu().reshape(-1).tolist() - else: - media_ids = np.asarray(media_ids).reshape(-1).tolist() - media_ids = [int(x) + int(image_token_offset) for x in media_ids] - - sptids = uni_prompting.sptids_dict - task_id = _scalar_token_id(sptids["<|v2t|>"]) - soi_id = _scalar_token_id(sptids["<|soi|>"]) - eoi_id = _scalar_token_id(sptids["<|eoi|>"]) - sot_id = _scalar_token_id(sptids["<|sot|>"]) - - prompt_text = build_v2t_chat_prompt(question) - query_ids = tokenizer(prompt_text, return_tensors="pt").input_ids[0].detach().cpu().tolist() - input_ids = [task_id, soi_id] + media_ids + [eoi_id, sot_id] + [int(v) for v in query_ids] - return input_ids, prompt_text - - -def build_i2t_input_ids( - *, - image_token_ids: Any, - tokenizer: Any, - uni_prompting: Any, - question: str, - image_token_offset: int, -) -> tuple[list[int], str]: - image_ids = image_token_ids - if isinstance(image_ids, torch.Tensor): - image_ids = image_ids.detach().cpu().reshape(-1).tolist() - else: - image_ids = np.asarray(image_ids).reshape(-1).tolist() - image_ids = [int(x) + int(image_token_offset) for x in image_ids] - - sptids = uni_prompting.sptids_dict - task_id = _scalar_token_id(sptids["<|mmu|>"]) - soi_id = _scalar_token_id(sptids["<|soi|>"]) - eoi_id = _scalar_token_id(sptids["<|eoi|>"]) - sot_id = _scalar_token_id(sptids["<|sot|>"]) - - prompt_text = build_i2t_chat_prompt(question) - query_ids = tokenizer(prompt_text, return_tensors="pt").input_ids[0].detach().cpu().tolist() - input_ids = [task_id, soi_id] + image_ids + [eoi_id, sot_id] + [int(v) for v in query_ids] - return input_ids, prompt_text - - -def build_v2t_chat_prompt(question: str) -> str: - return ( - f"<|start_header_id|>user<|end_header_id|>\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" - ) - - -def build_i2t_chat_prompt(question: str) -> str: - return ( - f"<|start_header_id|>user<|end_header_id|>\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" - ) - - -def 
make_mmu_prompt( - *, - task: str, - text: str, - tokenizer: Any, - uni_prompting: Any, - image_tokens: torch.Tensor | None, - audio_tokens: torch.Tensor | None, - video_tokens: torch.Tensor | None, - image_token_offset: int, - speech_token_offset: int, -) -> tuple[list[int], list[int]]: - query_ids = _tokenize_chat_query(tokenizer, text) - - if task == "i2t": - token_ids, _ = build_i2t_input_ids( - image_token_ids=image_tokens, - tokenizer=tokenizer, - uni_prompting=uni_prompting, - question=text, - image_token_offset=int(image_token_offset), - ) - token_ids = [int(v) for v in token_ids] - return token_ids, [1] * len(token_ids) - - if task == "v2t": - token_ids, _ = build_v2t_input_ids( - video_token_ids=video_tokens, - tokenizer=tokenizer, - uni_prompting=uni_prompting, - question=text, - image_token_offset=int(image_token_offset), - ) - token_ids = [int(v) for v in token_ids] - return token_ids, [1] * len(token_ids) - - if task == "s2t": - if audio_tokens is None: - raise ValueError("s2t requires audio tokens") - audio_ids = _to_1d_int_list(audio_tokens.long() + int(speech_token_offset)) - token_ids = [ - _get_special_token_id(uni_prompting, "<|s2t|>"), - _get_special_token_id(uni_prompting, "<|soa|>"), - *audio_ids, - _get_special_token_id(uni_prompting, "<|eoa|>"), - *query_ids, - ] - return token_ids, [1] * len(token_ids) - - raise ValueError(f"Unsupported task for validation-style MMU prompt: {task}") - - -def iter_mm_outputs(outputs: list[Any]): - for omni_out in outputs: - req_out = getattr(omni_out, "request_output", None) - req_list = req_out if isinstance(req_out, list) else [req_out] - for item in req_list: - if item is None: - continue - mm_out = getattr(item, "multimodal_output", None) or {} - if mm_out: - yield mm_out - completions = getattr(item, "outputs", None) or [] - for completion in completions: - c_mm_out = getattr(completion, "multimodal_output", None) or {} - if c_mm_out: - yield c_mm_out - omni_mm = getattr(omni_out, "multimodal_output", 
None) or {} - if omni_mm: - yield omni_mm - - -def _to_token_list(value: Any) -> list[int]: - if value is None: - return [] - if hasattr(value, "detach"): - value = value.detach() - if hasattr(value, "cpu"): - value = value.cpu() - if hasattr(value, "flatten"): - value = value.flatten().tolist() - if isinstance(value, tuple): - value = list(value) - if not isinstance(value, list): - return [] - out: list[int] = [] - for token in value: - if isinstance(token, bool): - continue - try: - out.append(int(token)) - except Exception: - continue - return out - - -def extract_text_output(outputs: list[Any], tokenizer: Any) -> str: - for mm_out in iter_mm_outputs(outputs): - text = mm_out.get("text") - if isinstance(text, list) and text: - text = text[-1] - if isinstance(text, str) and text.strip(): - return text.strip() - for key in ("text_tokens", "token_ids"): - token_ids = _to_token_list(mm_out.get(key)) - if not token_ids: - continue - decoded = tokenizer.decode(token_ids, skip_special_tokens=True) - if isinstance(decoded, str) and decoded.strip(): - return decoded.strip() - return "" - - -def extract_image_output(outputs: list[Any]) -> torch.Tensor | None: - for mm_out in iter_mm_outputs(outputs): - image = mm_out.get("image") - if isinstance(image, list) and image: - image = image[-1] - if isinstance(image, torch.Tensor): - return image - return None - - -def tensor_to_pil_image(image: torch.Tensor) -> Image.Image: - arr = image.detach().cpu().numpy() - if arr.ndim == 4: - arr = arr[0] - if arr.ndim == 3 and arr.shape[0] in (1, 3, 4): - arr = np.transpose(arr, (1, 2, 0)) - if arr.dtype != np.uint8: - arr = arr.astype(np.float32) - if arr.max() <= 1.0: - arr = arr * 255.0 - arr = np.clip(arr, 0.0, 255.0).astype(np.uint8) - if arr.ndim == 3 and arr.shape[-1] == 1: - arr = arr[..., 0] - return Image.fromarray(arr) - - -def extract_audio_output(outputs: list[Any]) -> tuple[np.ndarray, int] | None: - for mm_out in iter_mm_outputs(outputs): - audio = mm_out.get("audio") - 
if audio is None: - audio = mm_out.get("speech") - if audio is None: - continue - - def _to_wav_array(value: Any) -> np.ndarray: - if isinstance(value, torch.Tensor): - return value.detach().cpu().numpy().reshape(-1).astype(np.float32) - return np.asarray(value).reshape(-1).astype(np.float32) - - if isinstance(audio, list): - chunks = [_to_wav_array(chunk) for chunk in audio] - wav = np.concatenate(chunks, axis=0) if chunks else np.zeros((0,), dtype=np.float32) - else: - wav = _to_wav_array(audio) - sr = mm_out.get("sr", 24000) - if hasattr(sr, "item"): - try: - sr = int(sr.item()) - except Exception: - sr = 24000 - elif isinstance(sr, list): - sr = int(sr[0]) if sr else 24000 - else: - sr = int(sr) - return wav, sr - return None - - -def save_audio_wav(path: Path, wav: np.ndarray, sr: int) -> None: - try: - import soundfile as sf - - sf.write(str(path), wav, int(sr), format="WAV") - except Exception: - from scipy.io import wavfile - - wav_i16 = np.clip(wav, -1.0, 1.0) - wav_i16 = (wav_i16 * 32767.0).astype(np.int16) - wavfile.write(str(path), int(sr), wav_i16) - - -def parse_args(repo_root: Path) -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Dynin-Omni unified offline end2end example.") - parser.add_argument("--task", type=str, required=True, choices=TASK_CHOICES) - parser.add_argument("--model", type=str, required=True, help="HF repo id or local model directory.") - parser.add_argument( - "--stage-config-path", - type=str, - default=str(repo_root / "vllm_omni/model_executor/stage_configs/dynin_omni.yaml"), - help="Path to stage config yaml.", - ) - parser.add_argument( - "--dynin-config-path", - type=str, - default="", - help="Path to DYNIN config yaml (passed through additional_information).", - ) - parser.add_argument( - "--model-cache-dir", - type=str, - default="/tmp/dynin_localized_models", - help="Cache directory used when --model is HF repo id.", - ) - parser.add_argument( - "--localize-model", - 
action=argparse.BooleanOptionalAction, - default=True, - help="If true and --model is HF repo id, snapshot it under --model-cache-dir.", - ) - parser.add_argument("--text", type=str, default="", help="Prompt/edit/question text.") - parser.add_argument("--instruction", type=str, default="", help="Optional extra instruction.") - parser.add_argument("--raw-prompt", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--image", type=str, default="", help="Input image path for i2i/i2t.") - parser.add_argument("--audio", type=str, default="", help="Input audio path for s2t.") - parser.add_argument("--video", type=str, default="", help="Input video path for v2t.") - parser.add_argument("--image-resolution", type=int, default=None) - parser.add_argument("--num-frames", type=int, default=None) - parser.add_argument( - "--output-dir", - type=str, - default="", - help="Directory for generated outputs.", - ) - parser.add_argument("--output-prefix", type=str, default="") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--dtype", type=str, default="auto") - parser.add_argument("--max-tokens-per-stage", type=int, default=1) - - parser.add_argument("--runtime-task", type=str, default="", help="Override runtime task key.") - parser.add_argument("--prompting-task", type=str, default="", help="Override prompting task key.") - parser.add_argument("--detok-id", type=int, default=None, help="Override detok id.") - - parser.add_argument("--prompt-max-text-len", type=int, default=None) - parser.add_argument("--cond-dropout-prob", type=float, default=0.0) - parser.add_argument("--max-new-tokens", type=int, default=None) - parser.add_argument("--steps", type=int, default=None) - parser.add_argument("--block-length", type=int, default=None) - parser.add_argument("--temperature", type=float, default=None) - parser.add_argument("--cfg-scale", type=float, default=None) - parser.add_argument("--remasking", type=str, default="low_confidence") - - 
parser.add_argument("--timesteps", type=int, default=None) - parser.add_argument("--guidance-scale", type=float, default=None) - parser.add_argument("--noise-type", type=str, default="mask") - parser.add_argument("--noise-schedule-name", type=str, default="cosine") - parser.add_argument("--noise-schedule-params", type=str, default="{}") - - parser.add_argument("--mask-token-id", type=int, default=None) - parser.add_argument("--codebook-size", type=int, default=None) - parser.add_argument("--audio-codebook-size", type=int, default=None) - parser.add_argument("--image-token-count", type=int, default=None) - parser.add_argument("--t2s-token-length", type=int, default=None) - parser.add_argument( - "--t2s-condition", - type=str, - default="", - ) - parser.add_argument( - "--use-train-i2i-prompt", - action="store_true", - help="Use i2i training prompt template (default behavior of i2i.py).", - ) - parser.add_argument( - "--no-use-train-i2i-prompt", - dest="use_train_i2i_prompt", - action="store_false", - help="Use i2i_gen prompt template.", - ) - parser.set_defaults(use_train_i2i_prompt=None) - - parser.add_argument("--tokenizer-path", type=str, default="") - parser.add_argument("--model-local-files-only", action=argparse.BooleanOptionalAction, default=None) - parser.add_argument("--tokenizer-local-files-only", action=argparse.BooleanOptionalAction, default=None) - - parser.add_argument("--vq-model-image-path", type=str, default="") - parser.add_argument("--vq-model-image-local-files-only", action=argparse.BooleanOptionalAction, default=None) - parser.add_argument("--vq-model-audio-path", type=str, default="") - parser.add_argument("--vq-model-audio-local-files-only", action=argparse.BooleanOptionalAction, default=None) - - parser.add_argument("--disable-hf-xet", action=argparse.BooleanOptionalAction, default=True) - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) - return parser.parse_args() - - -def main() 
-> None: - repo_root = bootstrap_repo_path() - ensure_safe_import_for_vllm() - from vllm_omni.model_executor.models.dynin_omni.dynin_omni_common import ( - DYNIN_PROMPT_SOURCE_KEY, - DYNIN_PROMPT_SOURCE_OFFLINE_PREBUILT, - ) - - args = parse_args(repo_root) - - if args.disable_hf_xet: - os.environ.setdefault("HF_HUB_DISABLE_XET", "1") - - np.random.seed(args.seed) - torch.manual_seed(args.seed) - - model_dir = ensure_local_model_dir( - model=args.model, - cache_dir=Path(args.model_cache_dir).expanduser(), - localize=bool(args.localize_model), - ) - model_source = str(model_dir) - - task_name = str(args.task) - dynin_config_path = str(Path(args.dynin_config_path).expanduser()) - os.environ["DYNIN_CONFIG_PATH"] = dynin_config_path - default_runtime_task, default_prompting_task, default_detok_id, final_modality = TASK_DEFAULT_RUNTIME[task_name] - runtime_task = args.runtime_task.strip() or str( - _runtime_fallback(task_name, "runtime_task", None) or default_runtime_task - ) - prompting_task = args.prompting_task.strip() or str( - _runtime_fallback(task_name, "prompting_task", None) or default_prompting_task - ) - detok_id_default = _runtime_fallback(task_name, "detok_id", None) - if detok_id_default is None: - detok_id_default = default_detok_id - detok_id = int(detok_id_default if args.detok_id is None else args.detok_id) - - output_dir_default = _runtime_fallback(task_name, "output_dir", args.output_dir) - resolved_output_dir = str(output_dir_default or "/tmp/dynin_end2end_outputs") - - image_resolution_value = _runtime_fallback( - task_name, - "image_resolution", - args.image_resolution, - ) - if image_resolution_value is None: - image_resolution_value = 336 - image_resolution = int(image_resolution_value) - - num_frames_value = _runtime_fallback( - task_name, - "num_frames", - args.num_frames, - ) - if num_frames_value is None: - num_frames_value = 8 - num_frames = int(num_frames_value) - - prompt_max_text_len_value = _runtime_fallback( - task_name, - 
"prompt_max_text_len", - args.prompt_max_text_len, - ) - if prompt_max_text_len_value is None: - prompt_max_text_len_value = 1024 - prompt_max_text_len = int(prompt_max_text_len_value) - - max_new_tokens_value = _runtime_fallback( - task_name, - "max_new_tokens", - args.max_new_tokens, - ) - if max_new_tokens_value is None: - max_new_tokens_value = 256 - max_new_tokens = int(max_new_tokens_value) - - steps_value = _runtime_fallback( - task_name, - "steps", - args.steps, - ) - if steps_value is None: - steps_value = 256 - steps = int(steps_value) - - block_length_value = _runtime_fallback( - task_name, - "block_length", - args.block_length, - ) - if block_length_value is None: - block_length_value = 2 - block_length = int(block_length_value) - - temperature_value = _runtime_fallback( - task_name, - "temperature", - args.temperature, - ) - if temperature_value is None: - temperature_value = 0.0 - temperature = float(temperature_value) - - cfg_scale_value = _runtime_fallback( - task_name, - "cfg_scale", - args.cfg_scale, - ) - if cfg_scale_value is None: - cfg_scale_value = 0.0 - cfg_scale = float(cfg_scale_value) - - remasking = str(_runtime_fallback(task_name, "remasking", args.remasking) or "low_confidence") - - timesteps_value = _runtime_fallback( - task_name, - "timesteps", - args.timesteps, - ) - if timesteps_value is None: - timesteps_value = 20 - timesteps = int(timesteps_value) - - guidance_scale_value = _runtime_fallback( - task_name, - "guidance_scale", - args.guidance_scale, - ) - if guidance_scale_value is None: - guidance_scale_value = 0.0 - guidance_scale = float(guidance_scale_value) - - mask_token_id_value = _runtime_fallback( - task_name, - "mask_token_id", - args.mask_token_id, - ) - if mask_token_id_value is None: - mask_token_id_value = 126336 - mask_token_id = int(mask_token_id_value) - - codebook_size_value = _runtime_fallback( - task_name, - "codebook_size", - args.codebook_size, - ) - if codebook_size_value is None: - codebook_size_value = 
8192 - codebook_size = int(codebook_size_value) - - audio_codebook_size_value = _runtime_fallback( - task_name, - "audio_codebook_size", - args.audio_codebook_size, - ) - if audio_codebook_size_value is None: - audio_codebook_size_value = 4096 - audio_codebook_size = int(audio_codebook_size_value) - - image_token_count_value = _runtime_fallback( - task_name, - "image_token_count", - args.image_token_count, - ) - image_token_count = int(image_token_count_value) if image_token_count_value is not None else 0 - - t2s_token_length_value = _runtime_fallback( - task_name, - "t2s_token_length", - args.t2s_token_length, - ) - if t2s_token_length_value is None: - t2s_token_length_value = 383 - t2s_token_length = int(t2s_token_length_value) - - t2s_condition = str( - _runtime_fallback(task_name, "t2s_condition", args.t2s_condition) - or "gender-female_emotion-neutral_speed-normal_pitch-normal" - ) - - _validate_generation_args( - task=task_name, - max_new_tokens=max_new_tokens, - steps=steps, - block_length=block_length, - ) - - use_train_i2i_prompt = _runtime_fallback(task_name, "use_train_i2i_prompt", args.use_train_i2i_prompt) - if use_train_i2i_prompt is None: - use_train_i2i_prompt = bool(task_name == "i2i") - use_train_i2i_prompt = bool(use_train_i2i_prompt) - - if task_name in {"i2i", "i2t"} and not args.image: - raise ValueError(f"--task {task_name} requires --image") - if task_name == "s2t" and not args.audio: - raise ValueError("--task s2t requires --audio") - if task_name == "v2t" and not args.video: - raise ValueError("--task v2t requires --video") - - text = resolve_task_text( - task_name=task_name, - text=args.text, - instruction=args.instruction, - raw_prompt=bool(args.raw_prompt), - ) - - tokenizer_source = args.tokenizer_path.strip() or model_source - model_local_only = resolve_local_only( - args.model_local_files_only, model_source, default=Path(model_source).is_dir() - ) - tokenizer_local_only = resolve_local_only( - args.tokenizer_local_files_only, - 
tokenizer_source, - default=model_local_only, - ) - tokenizer = load_text_tokenizer(tokenizer_source, local_files_only=tokenizer_local_only) - text_vocab_size = int(len(tokenizer)) - - image_tokens: torch.Tensor | None = None - audio_tokens: torch.Tensor | None = None - video_tokens: torch.Tensor | None = None - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - vq_image_source = args.vq_model_image_path.strip() or "snu-aidas/magvitv2" - vq_audio_source = args.vq_model_audio_path.strip() or "snu-aidas/emova_speech_tokenizer_vllm" - vq_image_local_only = resolve_local_only(args.vq_model_image_local_files_only, vq_image_source, default=False) - vq_audio_local_only = resolve_local_only(args.vq_model_audio_local_files_only, vq_audio_source, default=False) - - if task_name in {"i2i", "i2t", "v2t"}: - vq_image = load_vq_image_encoder(vq_image_source, vq_image_local_only, device) - if task_name in {"i2i", "i2t"}: - image_tokens = encode_image_tokens( - Path(args.image).expanduser().resolve(), - vq_model=vq_image, - device=device, - resolution=int(image_resolution), - ) - if task_name == "v2t": - video_tokens = encode_video_tokens( - Path(args.video).expanduser().resolve(), - vq_model=vq_image, - device=device, - resolution=int(image_resolution), - num_frames=int(num_frames), - ) - if hasattr(vq_image, "cpu"): - vq_image = vq_image.cpu() - - if task_name == "s2t": - vq_audio = load_vq_audio_encoder(vq_audio_source, vq_audio_local_only, device) - audio_tokens = encode_audio_tokens(Path(args.audio).expanduser().resolve(), vq_audio) - if hasattr(vq_audio, "cpu"): - vq_audio = vq_audio.cpu() - - noise_schedule_params: dict[str, Any] = {} - try: - parsed = json.loads(args.noise_schedule_params) - if isinstance(parsed, dict): - noise_schedule_params = {str(k): v for k, v in parsed.items()} - except Exception: - noise_schedule_params = {} - - image_token_count = int(image_token_count) - if image_token_count <= 0: - if image_tokens is not None: - 
image_token_count = int(image_tokens.numel()) - else: - base_res = int(image_resolution) - image_token_count = max(1, (base_res // 16) ** 2) - - uncond_input_ids: list[int] | None = None - uncond_attention_mask: list[int] | None = None - if task_name == "t2t": - messages = [{"role": "user", "content": text}] - if getattr(tokenizer, "chat_template", None): - prompt_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - encoded = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False) - else: - encoded = tokenizer(text, return_tensors="pt", add_special_tokens=True) - prompt_token_ids = _to_1d_int_list(encoded["input_ids"]) - prompt_attention_mask = _to_1d_int_list(encoded.get("attention_mask")) - if not prompt_attention_mask: - prompt_attention_mask = [1] * len(prompt_token_ids) - else: - max_audio_len_for_prompt = int(max(t2s_token_length, 512)) - if audio_tokens is not None: - max_audio_len_for_prompt = max(max_audio_len_for_prompt, int(audio_tokens.numel())) - max_audio_len_short_for_prompt = max(256, max_audio_len_for_prompt // 2) - - uni_prompting = load_universal_prompting( - tokenizer=tokenizer, - tokenizer_source=tokenizer_source, - max_text_len=int(prompt_max_text_len), - cond_dropout_prob=float(args.cond_dropout_prob), - local_files_only=bool(tokenizer_local_only), - max_audio_len=int(max_audio_len_for_prompt), - max_audio_len_short=int(max_audio_len_short_for_prompt), - ) - prompting_text_vocab_size = int(len(uni_prompting.text_tokenizer)) - - is_mmu_task = task_name in {"i2t", "s2t", "v2t"} and not args.prompting_task.strip() - if is_mmu_task: - prompt_token_ids, prompt_attention_mask = make_mmu_prompt( - task=task_name, - text=text, - tokenizer=uni_prompting.text_tokenizer, - uni_prompting=uni_prompting, - image_tokens=image_tokens, - audio_tokens=audio_tokens, - video_tokens=video_tokens, - image_token_offset=prompting_text_vocab_size, - speech_token_offset=prompting_text_vocab_size + 
int(codebook_size), - ) - else: - prompt_payload, prompting_task = make_prompt_payload( - task=task_name, - text=text, - image_tokens=image_tokens, - audio_tokens=audio_tokens, - video_tokens=video_tokens, - image_placeholder_tokens=image_token_count, - audio_placeholder_tokens=int(t2s_token_length), - image_token_offset=text_vocab_size, - speech_token_offset=text_vocab_size + int(codebook_size), - mask_token_id=int(mask_token_id), - use_train_i2i_prompt=use_train_i2i_prompt, - ) - if args.prompting_task.strip(): - prompting_task = args.prompting_task.strip() - - prompt_token_ids, prompt_attention_mask = _run_uni_prompting( - uni_prompting, - prompt_payload, - prompting_task, - ) - - if task_name in {"i2t", "s2t", "v2t"}: - prompt_attention_mask = [1] * len(prompt_token_ids) - if not prompt_attention_mask: - prompt_attention_mask = [1] * len(prompt_token_ids) - - if task_name in {"t2i", "i2i"} and guidance_scale > 0: - uncond_payload, uncond_prompting_task = make_prompt_payload( - task=task_name, - text="", - image_tokens=image_tokens, - audio_tokens=audio_tokens, - video_tokens=video_tokens, - image_placeholder_tokens=image_token_count, - audio_placeholder_tokens=int(t2s_token_length), - image_token_offset=text_vocab_size, - speech_token_offset=text_vocab_size + int(codebook_size), - mask_token_id=int(mask_token_id), - use_train_i2i_prompt=use_train_i2i_prompt, - ) - uncond_input_ids, uncond_attention_mask = _run_uni_prompting( - uni_prompting, - uncond_payload, - args.prompting_task.strip() or uncond_prompting_task, - ) - if not uncond_attention_mask: - uncond_attention_mask = [1] * len(uncond_input_ids) - - runtime_info: dict[str, Any] = { - "task": [runtime_task], - "detok_id": [int(detok_id)], - DYNIN_PROMPT_SOURCE_KEY: [DYNIN_PROMPT_SOURCE_OFFLINE_PREBUILT], - "dynin_config_path": [str(dynin_config_path)], - "attention_mask": [prompt_attention_mask], - "prompt_max_text_len": [int(prompt_max_text_len)], - "prompting_max_text_len": [int(prompt_max_text_len)], - 
"cond_dropout_prob": [float(args.cond_dropout_prob)], - "prompting_cond_dropout_prob": [float(args.cond_dropout_prob)], - "tokenizer_path": [str(tokenizer_source)], - "text_vocab_size": [int(text_vocab_size)], - "model_local_files_only": [bool(model_local_only)], - "max_new_tokens": [int(max_new_tokens)], - "steps": [int(steps)], - "block_length": [int(block_length)], - "temperature": [float(temperature)], - "cfg_scale": [float(cfg_scale)], - "remasking": [str(remasking)], - "mask_id": [int(mask_token_id)], - "mask_token_id": [int(mask_token_id)], - "codebook_size": [int(codebook_size)], - "audio_codebook_size": [int(audio_codebook_size)], - "timesteps": [int(timesteps)], - "guidance_scale": [float(guidance_scale)], - "noise_type": [str(args.noise_type)], - "noise_schedule_name": [str(args.noise_schedule_name)], - "noise_schedule_params": [noise_schedule_params], - "seq_len": [int(image_token_count)], - "condition": [str(t2s_condition)], - "vq_model_image_path": [str(vq_image_source)], - "vq_model_image_local_files_only": [bool(vq_image_local_only)], - "vq_model_audio_path": [str(vq_audio_source)], - "vq_model_audio_local_files_only": [bool(vq_audio_local_only)], - } - - if task_name in {"t2t", "i2t", "s2t", "v2t"}: - runtime_info["prompt_length"] = [int(len(prompt_token_ids))] - if uncond_input_ids is not None: - runtime_info["uncond_input_ids"] = [uncond_input_ids] - if uncond_attention_mask is not None: - runtime_info["uncond_attention_mask"] = [uncond_attention_mask] - - if task_name == "t2s": - runtime_info["max_new_tokens"] = [int(t2s_token_length)] - - prompt = { - "prompt_token_ids": [int(v) for v in prompt_token_ids], - "additional_information": runtime_info, - "modalities": [final_modality], - } - - from vllm import SamplingParams - - from vllm_omni.entrypoints.omni import Omni - - stage_config_path = str(Path(args.stage_config_path).expanduser()) - omni = Omni(model=model_source, stage_configs_path=stage_config_path, dtype=args.dtype) - 
sampling_params_list = [ - SamplingParams(max_tokens=int(args.max_tokens_per_stage), temperature=0.0, top_p=1.0, detokenize=False) - for _ in range(omni.num_stages) - ] - - try: - outputs = list(omni.generate(prompt, sampling_params_list)) - finally: - omni.close() - - out_dir = Path(resolved_output_dir).expanduser() - out_dir.mkdir(parents=True, exist_ok=True) - stamp = time.strftime("%Y%m%d_%H%M%S") - prefix = args.output_prefix.strip() or f"{task_name}_{stamp}" - - if final_modality == "text": - text_out = extract_text_output(outputs, tokenizer=tokenizer) - if not text_out: - raise RuntimeError("No text output found.") - out_path = out_dir / f"{prefix}.txt" - out_path.write_text(text_out + "\n", encoding="utf-8") - print(f"[end2end] text saved: {out_path}") - print(text_out) - return - - if final_modality == "image": - image_out = extract_image_output(outputs) - if image_out is None: - raise RuntimeError("No image output found.") - pil = tensor_to_pil_image(image_out) - out_path = out_dir / f"{prefix}.png" - pil.save(out_path) - print(f"[end2end] image saved: {out_path}") - return - - if final_modality == "audio": - audio_out = extract_audio_output(outputs) - if audio_out is None: - raise RuntimeError("No audio output found.") - wav, sr = audio_out - out_path = out_dir / f"{prefix}.wav" - save_audio_wav(out_path, wav, sr) - print(f"[end2end] audio saved: {out_path} (sr={sr}, samples={wav.shape[0]})") - return - - raise RuntimeError(f"Unsupported final modality: {final_modality}") - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/fish_speech/end2end.py b/examples/offline_inference/fish_speech/end2end.py index 60830d06b7f..31c24d3d5d6 100644 --- a/examples/offline_inference/fish_speech/end2end.py +++ b/examples/offline_inference/fish_speech/end2end.py @@ -18,6 +18,7 @@ import logging import math import os +import tempfile import time import numpy as np @@ -87,10 +88,17 @@ def build_prompt( semantic_len, ) + # The model-side 
structured clone prefill consumes a temporary .npy file and + # removes it after loading. Abnormal termination can still leave the file + # behind, which is acceptable for this offline example. + with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f: + np.save(f, np.asarray(ref_audio_wav, dtype=np.float32)) + ref_audio_npy_path = f.name + additional_information = { "text": normalized_text, "ref_text": normalized_ref_text, - "ref_audio_wav": torch.from_numpy(np.asarray(ref_audio_wav, dtype=np.float32)), + "ref_audio_path": ref_audio_npy_path, "ref_audio_sr": int(ref_audio_sr), "fish_structured_voice_clone": True, } diff --git a/examples/offline_inference/glm_image/README.md b/examples/offline_inference/glm_image/README.md new file mode 100644 index 00000000000..c3c7c291696 --- /dev/null +++ b/examples/offline_inference/glm_image/README.md @@ -0,0 +1,145 @@ +# GLM-Image Multistage End-to-End Inference + +This example demonstrates how to run GLM-Image with the vLLM-Omni multistage architecture. 
+ +## Architecture + +GLM-Image uses a 2-stage pipeline: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ GLM-Image Pipeline │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ Stage 0 (AR Model) Stage 1 (Diffusion) │ +│ ┌─────────────────┐ ┌─────────────────────┐ │ +│ │ vLLM-optimized │ │ GlmImagePipeline │ │ +│ │ GlmImageFor │ prior │ ┌───────────────┐ │ │ +│ │ Conditional │──tokens───►│ │ DiT Denoiser │ │ │ +│ │ Generation │ │ └───────────────┘ │ │ +│ │ (9B AR model) │ │ │ │ │ +│ └─────────────────┘ │ ▼ │ │ +│ ▲ │ ┌───────────────┐ │ │ +│ │ │ │ VAE Decode │──┼──► Image +│ Text/Image │ └───────────────┘ │ │ +│ Input └─────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Features + +- **vLLM-optimized AR**: Uses PagedAttention and tensor parallelism for faster prior token generation +- **Flexible deployment**: AR and Diffusion stages can run on different GPUs +- **Text-to-Image**: Generate images from text descriptions +- **Image-to-Image**: Edit existing images with text prompts + +## Usage + +### Text-to-Image + +```bash +python end2end.py \ + --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "A beautiful sunset over the ocean with sailing boats" \ + --height 1024 \ + --width 1024 \ + --output output_t2i.png +``` + +### Image-to-Image (Image Editing) + +```bash +python end2end.py \ + --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "Transform this scene into a winter wonderland" \ + --image input.png \ + --output output_i2i.png +``` + +### With Custom Parameters + +```bash +python end2end.py \ + --model-path /path/to/glm-image \ + --config-path ../../../vllm_omni/model_executor/stage_configs/glm_image.yaml \ + --prompt "A photorealistic cat sitting on a window sill" \ + --height 1024 \ + --width 1024 \ + --num-inference-steps 50 \ + --guidance-scale 1.5 \ + --seed 42 \ + --output 
output.png +``` + +## Shell Scripts + +### Run Text-to-Image + +```bash +./run_t2i.sh +``` + +### Run Image-to-Image + +```bash +./run_i2i.sh --image /path/to/input.png +``` + +## Stage Configuration + +The stage config (`glm_image.yaml`) defines: + +- **Stage 0 (AR)**: Uses `GPUARWorker` with vLLM engine + + - Model: `GlmImageForConditionalGeneration` + - Output: `token_ids` (prior tokens) + +- **Stage 1 (Diffusion)**: Uses diffusion engine + - Model: `GlmImagePipeline` + - Output: Generated image + +See `vllm_omni/model_executor/stage_configs/glm_image.yaml` for full configuration. + +## Comparison with Single-Stage + +| Aspect | Single-Stage (transformers) | Multistage (vLLM) | +| ----------- | --------------------------- | ------------------- | +| AR Model | transformers native | vLLM PagedAttention | +| Memory | Higher (no KV cache opt) | Lower (optimized) | +| Throughput | Lower | Higher | +| Flexibility | Single GPU | Multi-GPU support | + +## Troubleshooting + +### OOM Error + +Try reducing memory usage: + +```bash +# In glm_image.yaml, adjust: +gpu_memory_utilization: 0.5 # Reduce from 0.6 +``` + +### Slow Initialization + +The first run loads model weights. 
Subsequent runs are faster: + +```bash +--stage-init-timeout 900 # Increase timeout for slow storage +``` + +### `Transformers does not recognize this architecture` Error + +You have to upgrade the `transformers` package to `5.3.0` or above: + +``` +pip install --upgrade transformers +``` + +## Requirements + +- vLLM-Omni with GLM-Image support +- CUDA-capable GPU (recommended: H100/A100 with 80GB) +- GLM-Image model weights +- `transformers` v5.3.0 or above diff --git a/examples/offline_inference/glm_image/end2end.py b/examples/offline_inference/glm_image/end2end.py new file mode 100644 index 00000000000..13bcd23f55a --- /dev/null +++ b/examples/offline_inference/glm_image/end2end.py @@ -0,0 +1,511 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +End-to-end offline inference example for GLM-Image with multistage architecture. + +This script tests the multistage pipeline where: +- Stage 0 (AR): vLLM-optimized GlmImageForConditionalGeneration generates prior_token_ids +- Stage 1 (Diffusion): GlmImagePipeline performs DiT denoising + VAE decode + +Usage (text-to-image): + python end2end.py \ + --model-path /path/to/glm-image \ + --config-path /path/to/glm_image.yaml \ + --prompt "A beautiful sunset over the ocean" \ + --output output_t2i.png + +Usage (image-to-image / image edit): + python end2end.py \ + --model-path /path/to/glm-image \ + --config-path /path/to/glm_image.yaml \ + --prompt "Make it look like winter" \ + --image input.png \ + --output output_i2i.png + +Usage (with custom parameters): + python end2end.py \ + --model-path /path/to/glm-image \ + --config-path /path/to/glm_image.yaml \ + --prompt "A cat sitting on a window sill" \ + --height 1024 \ + --width 1024 \ + --num-inference-steps 50 \ + --guidance-scale 1.5 \ + --seed 42 + +For more options, run: + python end2end.py --help +""" + +import argparse +import os +import time +from pathlib import Path + +from PIL import Image + +from
vllm_omni.entrypoints.omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +# Default stage config path (relative to vllm_omni package) +DEFAULT_CONFIG_PATH = "vllm_omni/model_executor/stage_configs/glm_image.yaml" + +SEED = 42 + +# GLM-Image special tokens +GLM_IMAGE_EOS_TOKEN_ID = 16385 # eos_token_id from generation_config.json +GLM_IMAGE_VISION_VOCAB_SIZE = 16512 # top_k should be vision_vocab_size + + +def compute_max_tokens(height: int, width: int, factor: int = 32) -> int: + """ + Compute max_new_tokens for GLM-Image AR generation. + + GLM-Image generates tokens in this order for text-to-image: + 1. Small preview image (half resolution in each dimension) + 2. Large target image (full resolution) + 3. EOS token + + Args: + height: Target image height in pixels + width: Target image width in pixels + factor: Downsampling factor (32 for GLM-Image AR output) + + Returns: + Total number of tokens to generate (small + large + EOS) + """ + # Large image tokens (target resolution) + token_h = height // factor + token_w = width // factor + large_tokens = token_h * token_w + + # Small preview tokens (half resolution in each dimension) + small_h = token_h // 2 + small_w = token_w // 2 + small_tokens = small_h * small_w + + # Total: small + large + EOS + return small_tokens + large_tokens + 1 + + +def load_image(image_path: str) -> Image.Image: + """Load an image from file path.""" + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + return Image.open(image_path).convert("RGB") + + +def save_image(image: Image.Image, output_path: str) -> None: + """Save an image to file path.""" + output_dir = os.path.dirname(output_path) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + image.save(output_path) + print(f"Image saved to: {output_path}") + + +def build_prompt_for_t2i( + prompt: str, + height: int = 1024, + width: int = 1024, +) -> dict: + """ + Build prompt dict for text-to-image 
generation. + + Args: + prompt: Text description for image generation + height: Target image height + width: Target image width + + Returns: + Dict containing prompt and generation parameters + """ + return { + "prompt": prompt, + "height": height, + "width": width, + # Pass target dimensions to AR processor for proper grid token generation + "mm_processor_kwargs": { + "target_h": height, + "target_w": width, + }, + } + + +def build_prompt_for_i2i( + prompt: str, + image: Image.Image, + height: int | None = None, + width: int | None = None, +) -> dict: + """ + Build prompt dict for image-to-image generation. + + Args: + prompt: Text description for image editing + image: Source image for editing + height: Target image height (default: use source image size) + width: Target image width (default: use source image size) + + Returns: + Dict containing prompt, image, and generation parameters + """ + # Use source image dimensions if not specified + if height is None: + height = image.height + if width is None: + width = image.width + + return { + "prompt": prompt, + "multi_modal_data": { + "image": image, + }, + "height": height, + "width": width, + # Pass target dimensions to AR processor for proper grid token generation + "mm_processor_kwargs": { + "target_h": height, + "target_w": width, + }, + } + + +def main(args: argparse.Namespace) -> None: + """Main entry point for GLM-Image end-to-end inference.""" + print("=" * 60) + print("GLM-Image Multistage End-to-End Inference") + print("=" * 60) + + # Validate arguments + if not args.prompt: + raise ValueError("--prompt is required") + + # Determine config path + config_path = args.config_path + if config_path is None: + # Try to find default config + if os.path.exists(DEFAULT_CONFIG_PATH): + config_path = DEFAULT_CONFIG_PATH + else: + # Try relative to script location + script_dir = Path(__file__).parent.parent.parent.parent + config_path = script_dir / "vllm_omni/model_executor/stage_configs/glm_image.yaml" + if not 
config_path.exists(): + raise FileNotFoundError( + f"Stage config not found. Please specify --config-path. Tried: {DEFAULT_CONFIG_PATH}" + ) + config_path = str(config_path) + + print(f"Model path: {args.model_path}") + print(f"Config path: {config_path}") + print(f"Prompt: {args.prompt}") + + # Load source image for image-to-image mode + source_image = None + if args.image: + print(f"Source image: {args.image}") + source_image = load_image(args.image) + print(f" Image size: {source_image.width}x{source_image.height}") + + # Build prompt based on mode + if source_image is not None: + # Image-to-image mode + prompt_dict = build_prompt_for_i2i( + prompt=args.prompt, + image=source_image, + height=args.height, + width=args.width, + ) + mode = "image-to-image" + else: + # Text-to-image mode + prompt_dict = build_prompt_for_t2i( + prompt=args.prompt, + height=args.height or 1024, + width=args.width or 1024, + ) + mode = "text-to-image" + + print(f"Mode: {mode}") + print(f"Target size: {prompt_dict.get('height', 1024)}x{prompt_dict.get('width', 1024)}") + + # Add generation parameters to prompt + prompt_dict["seed"] = args.seed + prompt_dict["num_inference_steps"] = args.num_inference_steps + prompt_dict["guidance_scale"] = args.guidance_scale + + if args.negative_prompt: + prompt_dict["negative_prompt"] = args.negative_prompt + + # Build cache-dit config if requested + cache_config = None + if args.cache_backend == "cache_dit": + cache_config = { + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + "residual_diff_threshold": 0.24, + "max_continuous_cached_steps": 3, + "enable_taylorseer": False, + "taylorseer_order": 1, + "scm_steps_mask_policy": None, + "scm_steps_policy": "dynamic", + } + + # Initialize Omni with multistage config + print("\nInitializing Omni with multistage pipeline...") + print(f" Cache backend: {args.cache_backend or 'None (no acceleration)'}") + start_time = time.time() + + omni = Omni( + model=args.model_path, + 
stage_configs_path=config_path, + log_stats=args.enable_stats, + stage_init_timeout=args.stage_init_timeout, + cache_backend=args.cache_backend, + cache_config=cache_config, + enable_cache_dit_summary=getattr(args, "enable_cache_dit_summary", False), + enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, + ) + + init_time = time.time() - start_time + print(f"Initialization completed in {init_time:.2f}s") + + # Prepare prompts (support batch generation) + prompts = [prompt_dict for _ in range(args.num_prompts)] + + # No explicit sampling_params for diffusion - parameters are in prompt_dict + # For multistage, the AR stage may need sampling params + from vllm import SamplingParams + + # Compute max_tokens dynamically based on target image size + target_height = prompt_dict.get("height", 1024) + target_width = prompt_dict.get("width", 1024) + calculated_max_tokens = compute_max_tokens(target_height, target_width) + + # Use calculated value unless user explicitly specified a different value + # Default args.max_tokens is 16384 (very large), so prefer calculated value + effective_max_tokens = calculated_max_tokens if args.max_tokens == 16384 else args.max_tokens + + if args.verbose: + print(f"AR max_tokens: {effective_max_tokens} (calculated: {calculated_max_tokens}, arg: {args.max_tokens})") + + # IMPORTANT: GLM-Image AR model requires these exact sampling parameters + # from generation_config.json for proper image token generation. 
+ # - temperature=0.9, top_p=0.75, top_k=16512 (vision_vocab_size) + # - stop_token_ids=[16385] (eos_token_id) is CRITICAL to stop generation + ar_sampling_params = SamplingParams( + temperature=0.9, # From generation_config.json + top_p=0.75, # From generation_config.json + top_k=GLM_IMAGE_VISION_VOCAB_SIZE, # 16512, vision vocabulary size + max_tokens=effective_max_tokens, + stop_token_ids=[GLM_IMAGE_EOS_TOKEN_ID], # 16385, CRITICAL for stopping + seed=args.seed, + detokenize=False, + ) + + # For diffusion stage, sampling_params contains diffusion-specific parameters + # These are passed as kwargs to the diffusion engine + diffusion_sampling_params = OmniDiffusionSamplingParams( + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + height=prompt_dict.get("height", 1024), + width=prompt_dict.get("width", 1024), + seed=args.seed, + ) + + # For multistage, we need sampling_params for each stage + # Stage 0 (AR): SamplingParams for vLLM + # Stage 1 (Diffusion): dict with diffusion kwargs + sampling_params_list = [ar_sampling_params, diffusion_sampling_params] + + # Run generation + print(f"\nGenerating {args.num_prompts} image(s)...") + gen_start_time = time.time() + + output_dir = os.path.dirname(args.output) if args.output else "outputs" + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + output_count = 0 + for stage_outputs in omni.generate(prompts, sampling_params_list, py_generator=True): + output = stage_outputs.request_output + if stage_outputs.final_output_type == "image": + request_id = output.request_id + + # Get generated images + images = output.images if hasattr(output, "images") else [] + if not images and hasattr(output, "multimodal_output"): + images = output.multimodal_output.get("images", []) + + # Save each generated image + for idx, img in enumerate(images): + if args.num_prompts == 1 and len(images) == 1: + output_path = args.output + else: + base, ext = os.path.splitext(args.output) + output_path = 
f"{base}_{request_id}_{idx}{ext}" + + if isinstance(img, Image.Image): + save_image(img, output_path) + else: + print(f"Warning: Unexpected image type for request {request_id}: {type(img)}") + + output_count += 1 + + elif stage_outputs.final_output_type == "text": + # AR stage output (intermediate, for debugging) + if args.verbose: + print(f"AR output for request {output.request_id}:") + print(f" Token count: {len(output.outputs[0].token_ids)}") + + gen_time = time.time() - gen_start_time + print(f"\nGeneration completed in {gen_time:.2f}s") + print(f"Generated {output_count} image(s)") + + # Cleanup + omni.close() + print("\nDone!") + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="GLM-Image Multistage End-to-End Inference", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Required arguments + parser.add_argument( + "--model-path", + type=str, + default="zai-org/GLM-Image", + help="Path to GLM-Image model directory or HuggingFace model ID", + ) + parser.add_argument( + "--prompt", + type=str, + required=True, + help="Text prompt for image generation", + ) + + # Optional arguments + parser.add_argument( + "--config-path", + type=str, + default=None, + help="Path to stage config YAML file (default: auto-detect)", + ) + parser.add_argument( + "--image", + type=str, + default=None, + help="Path to source image for image-to-image mode", + ) + parser.add_argument( + "--output", + type=str, + default="output_glm_image.png", + help="Output image path (default: output_glm_image.png)", + ) + parser.add_argument( + "--negative-prompt", + type=str, + default=None, + help="Negative prompt for classifier-free guidance", + ) + + # Generation parameters + parser.add_argument( + "--height", + type=int, + default=None, + help="Output image height (default: 1024 for t2i, source size for i2i)", + ) + parser.add_argument( + "--width", + type=int, + default=None, + 
help="Output image width (default: 1024 for t2i, source size for i2i)", + ) + parser.add_argument( + "--num-inference-steps", + type=int, + default=50, + help="Number of diffusion denoising steps (default: 50)", + ) + parser.add_argument( + "--guidance-scale", + type=float, + default=1.5, + help="Classifier-free guidance scale (default: 1.5)", + ) + parser.add_argument( + "--seed", + type=int, + default=SEED, + help=f"Random seed for reproducibility (default: {SEED})", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=16384, + help="Maximum tokens for AR generation (default: 16384)", + ) + + # Batch processing + parser.add_argument( + "--num-prompts", + type=int, + default=1, + help="Number of images to generate (default: 1)", + ) + + # Cache acceleration + parser.add_argument( + "--cache-backend", + type=str, + default=None, + choices=["cache_dit"], + help="Cache backend for DiT acceleration. Default: None (no cache).", + ) + parser.add_argument( + "--enable-cache-dit-summary", + action="store_true", + help="Enable cache-dit summary logging after diffusion forward passes.", + ) + + # Runtime options + parser.add_argument( + "--enable-stats", + action="store_true", + default=False, + help="Enable statistics logging", + ) + parser.add_argument( + "--stage-init-timeout", + type=int, + default=600, + help="Timeout for stage initialization in seconds (default: 600)", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "--enable-diffusion-pipeline-profiler", + action="store_true", + help="Enable diffusion pipeline profiler to display stage durations.", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/glm_image/run_i2i.sh b/examples/offline_inference/glm_image/run_i2i.sh new file mode 100755 index 00000000000..f81b157b0c8 --- /dev/null +++ 
b/examples/offline_inference/glm_image/run_i2i.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# Run GLM-Image image-to-image (editing) with multistage pipeline + +set -e + +# Default values +MODEL_PATH="${MODEL_PATH:-/path/to/glm-image}" +CONFIG_PATH="${CONFIG_PATH:-vllm_omni/model_executor/stage_configs/glm_image.yaml}" +PROMPT="${PROMPT:-Transform this image into an oil painting style}" +INPUT_IMAGE="" +OUTPUT="${OUTPUT:-output_i2i.png}" +NUM_STEPS="${NUM_STEPS:-50}" +GUIDANCE="${GUIDANCE:-1.5}" +SEED="${SEED:-42}" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --model-path) + MODEL_PATH="$2" + shift 2 + ;; + --config-path) + CONFIG_PATH="$2" + shift 2 + ;; + --prompt) + PROMPT="$2" + shift 2 + ;; + --image) + INPUT_IMAGE="$2" + shift 2 + ;; + --output) + OUTPUT="$2" + shift 2 + ;; + --num-steps) + NUM_STEPS="$2" + shift 2 + ;; + --guidance) + GUIDANCE="$2" + shift 2 + ;; + --seed) + SEED="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Check if input image is provided +if [ -z "${INPUT_IMAGE}" ]; then + echo "Error: --image is required for image-to-image mode" + echo "Usage: ./run_i2i.sh --image /path/to/input.png [--prompt \"edit instruction\"]" + exit 1 +fi + +if [ ! 
-f "${INPUT_IMAGE}" ]; then + echo "Error: Input image not found: ${INPUT_IMAGE}" + exit 1 +fi + +echo "==============================================" +echo "GLM-Image Image-to-Image Generation" +echo "==============================================" +echo "Model: ${MODEL_PATH}" +echo "Config: ${CONFIG_PATH}" +echo "Input: ${INPUT_IMAGE}" +echo "Prompt: ${PROMPT}" +echo "Output: ${OUTPUT}" +echo "Steps: ${NUM_STEPS}" +echo "Guidance: ${GUIDANCE}" +echo "Seed: ${SEED}" +echo "==============================================" + +python end2end.py \ + --model-path "${MODEL_PATH}" \ + --config-path "${CONFIG_PATH}" \ + --prompt "${PROMPT}" \ + --image "${INPUT_IMAGE}" \ + --output "${OUTPUT}" \ + --num-inference-steps "${NUM_STEPS}" \ + --guidance-scale "${GUIDANCE}" \ + --seed "${SEED}" \ + --verbose diff --git a/examples/offline_inference/glm_image/run_t2i.sh b/examples/offline_inference/glm_image/run_t2i.sh new file mode 100755 index 00000000000..5d249960b8f --- /dev/null +++ b/examples/offline_inference/glm_image/run_t2i.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# Run GLM-Image text-to-image generation with multistage pipeline + +set -e + +# Default values +MODEL_PATH="${MODEL_PATH:-/path/to/glm-image}" +CONFIG_PATH="${CONFIG_PATH:-vllm_omni/model_executor/stage_configs/glm_image.yaml}" +PROMPT="${PROMPT:-A beautiful mountain landscape with snow-capped peaks and a clear blue lake}" +OUTPUT="${OUTPUT:-output_t2i.png}" +HEIGHT="${HEIGHT:-1024}" +WIDTH="${WIDTH:-1024}" +NUM_STEPS="${NUM_STEPS:-50}" +GUIDANCE="${GUIDANCE:-1.5}" +SEED="${SEED:-42}" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --model-path) + MODEL_PATH="$2" + shift 2 + ;; + --config-path) + CONFIG_PATH="$2" + shift 2 + ;; + --prompt) + PROMPT="$2" + shift 2 + ;; + --output) + OUTPUT="$2" + shift 2 + ;; + --height) + HEIGHT="$2" + shift 2 + ;; + --width) + WIDTH="$2" + shift 2 + ;; + --num-steps) + NUM_STEPS="$2" + shift 2 + ;; + --guidance) + 
GUIDANCE="$2" + shift 2 + ;; + --seed) + SEED="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +echo "==============================================" +echo "GLM-Image Text-to-Image Generation" +echo "==============================================" +echo "Model: ${MODEL_PATH}" +echo "Config: ${CONFIG_PATH}" +echo "Prompt: ${PROMPT}" +echo "Output: ${OUTPUT}" +echo "Size: ${WIDTH}x${HEIGHT}" +echo "Steps: ${NUM_STEPS}" +echo "Guidance: ${GUIDANCE}" +echo "Seed: ${SEED}" +echo "==============================================" + +python end2end.py \ + --model-path "${MODEL_PATH}" \ + --config-path "${CONFIG_PATH}" \ + --prompt "${PROMPT}" \ + --output "${OUTPUT}" \ + --height "${HEIGHT}" \ + --width "${WIDTH}" \ + --num-inference-steps "${NUM_STEPS}" \ + --guidance-scale "${GUIDANCE}" \ + --seed "${SEED}" \ + --verbose diff --git a/examples/offline_inference/helios/end2end.py b/examples/offline_inference/helios/end2end.py index 6cf7dfdcb36..88c3b865d42 100644 --- a/examples/offline_inference/helios/end2end.py +++ b/examples/offline_inference/helios/end2end.py @@ -196,9 +196,6 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--cfg-parallel-size", type=int, default=1, choices=[1, 2], help="CFG parallel size.") parser.add_argument("--tensor-parallel-size", type=int, default=1, help="Tensor parallelism size.") - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md index 3cd8fa01b2e..da28a44d9e6 100644 --- a/examples/offline_inference/hunyuan_image3/README.md +++ b/examples/offline_inference/hunyuan_image3/README.md @@ -1,161 +1,25 @@ -# HunyuanImage-3.0-Instruct +# HunyuanImage-3.0 Image-to-Text Inference -## Set up +This example demonstrates how to run HunyuanImage-3.0 Image-to-Text with the vLLM-Omni. 
-Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. +## Local CLI Usage -## Run examples - -**Note**: These examples work with the default configuration on **8x NVIDIA L40S (48GB)**. For different GPU setups, modify the stage configuration to adjust device allocation and memory utilization. - -Get into the hunyuan_image3 folder: - -```bash -cd examples/offline_inference/hunyuan_image3 -``` - -### Modality Control - -HunyuanImage-3.0-Instruct supports multiple modality modes. You can control the mode using the `--modality` argument: - -#### Text to Image (text2img) - -- **Pipeline**: Text → AR (CoT + latent tokens) → DiT (denoise) → VAE Decode → Image -- **Stages Used**: Stage 0 (AR) + Stage 1 (DiT) -- **KV Transfer**: AR sends KV cache to DiT for conditioned generation -- **Default Config**: `hunyuan_image3_t2i.yaml` - -```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality text2img \ - --prompts "A cute cat sitting on a windowsill watching the sunset" -``` - -#### Image to Image (img2img) - -- **Pipeline**: Image + Text → AR (CoT + recaption + latent) → DiT → Edited Image -- **Stages Used**: Stage 0 (AR) + Stage 1 (DiT) -- **KV Transfer**: AR sends KV cache to DiT -- **Default Config**: `hunyuan_image3_it2i.yaml` - -```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality img2img \ - --image-path /path/to/image.png \ - --prompts "Make the petals neon pink" -``` - -#### Image to Text (img2text) - -- **Pipeline**: Image + Question → AR → Text description -- **Stages Used**: Stage 0 (AR) only -- **Default Config**: `hunyuan_image3_i2t.yaml` +Download the example image: ```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality img2text \ - --image-path /path/to/image.jpg \ - --prompts "Describe the content of the picture." 
+wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg ``` -#### Text to Text (text2text) - -- **Pipeline**: Text → AR → Text -- **Stages Used**: Stage 0 (AR) only -- **Default Config**: `hunyuan_image3_t2t.yaml` +Run example: ```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality text2text \ - --prompts "What is the capital of France?" +python image_to_text.py \ + --image cherry_blossom.jpg \ + --prompt "<|startoftext|>You are an assistant that understands images and outputs text.Describe the content of the picture." ``` -### Inference Steps & Guidance - -Control generation quality for image modalities: - -```bash -python end2end.py --modality text2img \ - --steps 50 \ - --guidance-scale 5.0 \ - --height 1024 --width 1024 \ - --prompts "A photo-realistic sunset over the ocean" -``` - -### Key Arguments - -#### 📌 Command Line Arguments (end2end.py) - -| Argument | Type | Default | Description | -| :--------------------- | :----- | :----------------------------------- | :----------------------------------------------------------- | -| `--model` | string | `tencent/HunyuanImage-3.0-Instruct` | Model path or name | -| `--modality` | choice | `text2img` | Modality: `text2img`, `img2img`, `img2text`, `text2text` | -| `--prompts` | list | `None` | Input text prompts | -| `--image-path` | string | `None` | Input image path (for `img2img`/`img2text`) | -| `--output` | string | `.` | Output directory for saved images | -| `--steps` | int | `50` | Number of inference steps | -| `--guidance-scale` | float | `5.0` | Classifier-free guidance scale | -| `--seed` | int | `42` | Random seed | -| `--height` | int | `1024` | Output image height | -| `--width` | int | `1024` | Output image width | -| `--bot-task` | string | auto | Override prompt task (e.g. `it2i_think`, `t2i_recaption`) | -| `--sys-type` | string | auto | Override system prompt type (e.g. 
`en_unified`, `en_vanilla`) | -| `--stage-configs-path` | string | auto | Custom stage config YAML path | -| `--enforce-eager` | flag | `False` | Disable torch.compile | -| `--init-timeout` | int | `300` | Initialization timeout (seconds) | - ------- - -#### ⚙️ Stage Configurations - -| Config YAML | Modality | Stages | GPUs | Description | -| :---------------------------------- | :-------- | :----- | :----- | :------------------------------------ | -| `hunyuan_image3_t2i.yaml` | text2img | 2 | 8 | T2I with AR→DiT, 4 GPU each | -| `hunyuan_image3_it2i.yaml` | img2img | 2 | 8 | IT2I with AR→DiT, 4 GPU each | -| `hunyuan_image3_i2t.yaml` | img2text | 1 | 4 | I2T (AR only) | -| `hunyuan_image3_t2t.yaml` | text2text | 1 | 4 | T2T (AR only) | -| `hunyuan_image3_t2i_2gpu.yaml` | text2img | 2 | 2 | T2I for 2-GPU setups | -| `hunyuan_image3_moe.yaml` | text2img | 2 | 8 | T2I with MoE AR→DiT KV reuse | -| `hunyuan_image3_moe_dit_2gpu_fp8.yaml` | text2img | 2 | 2 | T2I with FP8 quantization | - ------- - -## Using MoE Config - -The `hunyuan_image3_moe.yaml` config enables AR→DiT KV cache reuse with 8 GPUs (4 for AR + 4 for DiT). - -```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality text2img \ - --stage-configs-path hunyuan_image3_moe.yaml \ - --prompts "A cute cat" -``` - ------- - -## Prompt Format - -HunyuanImage-3.0 uses a pretrain template format: - -``` -<|startoftext|>{system_prompt}{}{trigger_tag}{user_prompt} -``` - -- ``: Placeholder for each input image (auto-inserted by `prompt_utils.py`) -- Trigger tags: `` (CoT), `` (recaptioning) -- System prompt: Auto-selected based on task - -The `prompt_utils.build_prompt()` handles this formatting automatically. - ------- - -## FAQ - -- **OOM errors**: Decrease `gpu_memory_utilization` in the YAML stage config, or use a smaller `max_num_batched_tokens`. -- **Custom image sizes**: Use `--height` and `--width` flags (multiples of 16 recommended). 
+Key arguments: -| Stage | VRAM (approx) | -| :---------------- | :------------------- | -| Stage 0 (AR) | ~15 GiB + KV Cache | -| Stage 1 (DiT) | ~30 GiB | -| Total (8-GPU) | ~45 GiB + KV Cache | +- `--model`: Model used. Default is: tencent/HunyuanImage-3.0-Instruct (Optional). +- `--image`: Path to input image (required). +- `--prompt`: Text description used to guide image understanding (required). diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py deleted file mode 100644 index 2cea303888e..00000000000 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ /dev/null @@ -1,265 +0,0 @@ -""" -HunyuanImage-3.0-Instruct unified end-to-end inference script. - -Supports all modalities through a single entry point: - - text2img: Text → AR → DiT → Image - - img2img: Text+Image → AR → DiT → Edited Image (IT2I) - - img2text: Image+Text → AR → Text description (I2T) - - text2text: Text → AR → Text (comprehension, no image) - -Usage: - python end2end.py --modality text2img --prompts "A cute cat" - python end2end.py --modality img2img --image-path input.png --prompts "Make it snowy" - python end2end.py --modality img2text --image-path input.png --prompts "Describe this image" -""" - -import argparse -import os - -from vllm_omni.diffusion.models.hunyuan_image3.system_prompt import ( - get_system_prompt, -) -from vllm_omni.entrypoints.omni import Omni -from vllm_omni.inputs.data import OmniPromptType - -# task → (sys_type, bot_task, trigger_tag) -_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { - "t2t": ("en_unified", None, None), - "i2t": ("en_unified", None, None), - "it2i_think": ("en_unified", "think", ""), - "it2i_recaption": ("en_unified", "recaption", ""), - "t2i_think": ("en_unified", "think", ""), - "t2i_recaption": ("en_unified", "recaption", ""), - "t2i_vanilla": ("en_vanilla", "image", None), -} - -# Modality → prompt_utils task mapping -_MODALITY_TASK_MAP = { - 
"text2img": "t2i_think", - "img2img": "it2i_think", - "img2text": "i2t", - "text2text": "t2t", -} - - -def build_prompt( - user_prompt: str, - task: str = "it2i_think", - sys_type: str | None = None, - custom_system_prompt: str | None = None, -) -> str: - """Build a HunyuanImage-3.0 prompt using pretrain template format.""" - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. Choose from: {sorted(_TASK_PRESETS)}") - - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] - effective_sys_type = sys_type or preset_sys_type - - system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt) - sys_text = system_prompt.strip() if system_prompt else "" - - has_image_input = task.startswith("i2t") or task.startswith("it2i") - - parts = ["<|startoftext|>"] - if sys_text: - parts.append(sys_text) - if has_image_input: - parts.append("") - if trigger_tag: - parts.append(trigger_tag) - parts.append(user_prompt) - - return "".join(parts) - - -# Modality → default stage config -_MODALITY_DEFAULT_CONFIG = { - "text2img": "hunyuan_image3_t2i.yaml", - "img2img": "hunyuan_image3_it2i.yaml", - "img2text": "hunyuan_image3_i2t.yaml", - "text2text": "hunyuan_image3_t2t.yaml", -} - - -def parse_args(): - parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.") - parser.add_argument( - "--model", - default="tencent/HunyuanImage-3.0-Instruct", - help="Model name or local path.", - ) - parser.add_argument( - "--modality", - default="text2img", - choices=["text2img", "img2img", "img2text", "text2text"], - help="Modality mode to control stage execution.", - ) - parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.") - parser.add_argument( - "--image-path", - type=str, - default=None, - help="Path to input image (for img2img/img2text).", - ) - parser.add_argument( - "--output", - type=str, - default=".", - help="Output directory to save results.", - ) - - # 
Generation parameters - parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.") - parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.") - parser.add_argument("--seed", type=int, default=42, help="Random seed.") - parser.add_argument("--height", type=int, default=1024, help="Output image height.") - parser.add_argument("--width", type=int, default=1024, help="Output image width.") - - # Prompt configuration - parser.add_argument( - "--bot-task", - type=str, - default=None, - help="Override prompt task (e.g. it2i_think, t2i_recaption). Default: auto from modality.", - ) - parser.add_argument( - "--sys-type", - type=str, - default=None, - help="Override system prompt type (e.g. en_unified, en_vanilla).", - ) - - # Omni init args - parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom stage config YAML path.") - parser.add_argument("--log-stats", action="store_true", default=False) - parser.add_argument("--init-timeout", type=int, default=300, help="Initialization timeout in seconds.") - parser.add_argument("--enforce-eager", action="store_true", help="Disable torch.compile.") - - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) - return parser.parse_args() - - -def main(): - args = parse_args() - os.makedirs(args.output, exist_ok=True) - - # Determine task for prompt formatting - task = args.bot_task or _MODALITY_TASK_MAP[args.modality] - - # Determine stage config - stage_configs_path = args.stage_configs_path or _MODALITY_DEFAULT_CONFIG[args.modality] - - # Build Omni - omni_kwargs = { - "model": args.model, - "stage_configs_path": stage_configs_path, - "log_stats": args.log_stats, - "init_timeout": args.init_timeout, - "enforce_eager": args.enforce_eager, - } - if args.modality in ("text2img", "img2img"): - omni_kwargs["mode"] = "text-to-image" - - omni = Omni(**omni_kwargs) - - # Prepare prompts 
- prompts = args.prompts or ["A cute cat"] - if not prompts: - print("[Info] No prompts provided, using default.") - prompts = ["A cute cat"] - - # Load image if needed - input_image = None - if args.modality in ("img2img", "img2text"): - if not args.image_path or not os.path.exists(args.image_path): - raise ValueError(f"--image-path required for {args.modality}, got: {args.image_path}") - from PIL import Image - - input_image = Image.open(args.image_path).convert("RGB") - - # Format prompts - formatted_prompts: list[OmniPromptType] = [] - for p in prompts: - formatted_text = build_prompt(p, task=task, sys_type=args.sys_type) - - prompt_dict: dict = {"prompt": formatted_text} - - if args.modality == "text2img": - prompt_dict["modalities"] = ["image"] - elif args.modality == "img2img": - prompt_dict["modalities"] = ["image"] - prompt_dict["multi_modal_data"] = {"image": input_image} - prompt_dict["height"] = input_image.height - prompt_dict["width"] = input_image.width - elif args.modality == "img2text": - prompt_dict["modalities"] = ["text"] - prompt_dict["multi_modal_data"] = {"image": input_image} - elif args.modality == "text2text": - prompt_dict["modalities"] = ["text"] - - formatted_prompts.append(prompt_dict) - - # Build sampling params from defaults - params_list = list(omni.default_sampling_params_list) - - # Override diffusion params if applicable - from vllm_omni.inputs.data import OmniDiffusionSamplingParams - - for i, sp in enumerate(params_list): - if isinstance(sp, OmniDiffusionSamplingParams): - sp.num_inference_steps = args.steps - sp.guidance_scale = args.guidance_scale - if args.seed is not None: - sp.seed = args.seed - if args.modality in ("text2img",): - sp.height = args.height - sp.width = args.width - - # Print configuration - print(f"\n{'=' * 60}") - print("HunyuanImage-3.0 Generation Configuration:") - print(f" Model: {args.model}") - print(f" Modality: {args.modality}") - print(f" Stage config: {stage_configs_path}") - print(f" Num stages: 
{omni.num_stages}") - if args.modality in ("text2img", "img2img"): - print(f" Inference steps: {args.steps}") - print(f" Guidance scale: {args.guidance_scale}") - print(f" Seed: {args.seed}") - if args.modality == "text2img": - print(f" Output size: {args.width}x{args.height}") - if args.image_path: - print(f" Input image: {args.image_path}") - print(f" Prompts: {prompts}") - print(f"{'=' * 60}\n") - - # Generate - omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list)) - - # Process outputs - img_idx = 0 - for req_output in omni_outputs: - # Text output (AR stage or text-only) - ro = getattr(req_output, "request_output", None) - if ro and getattr(ro, "outputs", None): - txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs) - if txt: - print(f"[Output] Text:\n{txt}") - - # Image output (DiT stage) - images = getattr(req_output, "images", None) - if not images and ro and hasattr(ro, "images"): - images = ro.images - - if images: - for j, img in enumerate(images): - save_path = os.path.join(args.output, f"output_{img_idx}_{j}.png") - img.save(save_path) - print(f"[Output] Saved image to {save_path}") - img_idx += 1 - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/hunyuan_image3/image_to_text.py b/examples/offline_inference/hunyuan_image3/image_to_text.py new file mode 100644 index 00000000000..d40134ac0a0 --- /dev/null +++ b/examples/offline_inference/hunyuan_image3/image_to_text.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import os + +from PIL import Image + +from vllm_omni.entrypoints.omni import Omni + +""" +The tencent/HunyuanImage-3.0-Instruct base model uses the tencent/Hunyuan-A13B-Instruct backbone. 
It utilizes two tokenizer delimiter templates: + +1) Pretrained template (default for gen_text mode), which concatenates system, image + tokens, and user question WITHOUT role delimiters: +"<|startoftext|>{system_prompt}{image_tokens}{user_question}" + + Example (before image token expansion): +"<|startoftext|>You are an assistant that understands images and outputs text.Describe the content of the picture." + +2) Instruct template, which uses explicit role prefixes and separators. +""" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate text from image using HunyuanImage-3.0-Instruct.") + parser.add_argument( + "--model", + default="tencent/HunyuanImage-3.0-Instruct", + help="Model name or local path.", + ) + parser.add_argument( + "--image", + type=str, + required=True, + help="Path to input image file (PNG, JPG, etc.).", + ) + parser.add_argument( + "--prompt", + type=str, + required=True, + help="Pretrain template prompt: <|startoftext|>{system}{question}", + ) + parser.add_argument( + "--enable-diffusion-pipeline-profiler", + action="store_true", + help="Enable diffusion pipeline profiler to display stage durations.", + ) + return parser.parse_args() + + +def load_image(image_path: str) -> Image.Image: + """Load an image from file path.""" + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + return Image.open(image_path).convert("RGB") + + +def main(args: argparse.Namespace) -> None: + omni = Omni( + model=args.model, + enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, + mode="image-to-text", + ) + + prompt = "<|startoftext|>You are an assistant that understands images and outputs text." 
+ args.prompt + + prompt_dict = { + "prompt": prompt, + "modalities": ["text"], + } + + # Add image input if provided + if args.image: + if not os.path.exists(args.image): + raise FileNotFoundError(f"Input image not found: {args.image}") + + input_image = load_image(args.image) + prompt_dict["multi_modal_data"] = {"image": input_image} + + prompts = [prompt_dict] + omni_outputs = omni.generate(prompts=prompts) + + prompt_text = omni_outputs[0].request_output.prompt + generated_text = omni_outputs[0].request_output.outputs[0].text + print(f"Prompt: {prompt_text}") + print(f"Text: {generated_text}") + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/image_to_image/image_edit.py b/examples/offline_inference/image_to_image/image_edit.py index b857bb22d1b..a8035a3fdcb 100644 --- a/examples/offline_inference/image_to_image/image_edit.py +++ b/examples/offline_inference/image_to_image/image_edit.py @@ -87,11 +87,9 @@ """ import argparse -import json import os import time from pathlib import Path -from typing import Any import torch from PIL import Image @@ -103,16 +101,6 @@ from vllm_omni.platforms import current_omni_platform -def parse_profiler_config(value: str) -> dict[str, Any]: - try: - config = json.loads(value) - except json.JSONDecodeError as e: - raise argparse.ArgumentTypeError(f"--profiler-config must be valid JSON: {e}") from e - if not isinstance(config, dict): - raise argparse.ArgumentTypeError("--profiler-config must be a JSON object") - return config - - def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Edit an image with Qwen-Image-Edit.") parser.add_argument( @@ -208,13 +196,6 @@ def parse_args() -> argparse.Namespace: default=1, help="Number of GPUs used for ulysses sequence parallelism.", ) - parser.add_argument( - "--ulysses-mode", - type=str, - default="strict", - choices=["strict", "advanced_uaa"], - help="Ulysses sequence-parallel mode: 'strict' 
(divisibility required) or 'advanced_uaa' (UAA).", - ) parser.add_argument( "--ring-degree", type=int, @@ -316,8 +297,8 @@ def parse_args() -> argparse.Namespace: "--cfg-parallel-size", type=int, default=1, - choices=[1, 2, 3], - help="Number of GPUs used for classifier free guidance parallel size (max 3 branches).", + choices=[1, 2], + help="Number of GPUs used for classifier free guidance parallel size.", ) parser.add_argument( "--enforce-eager", @@ -344,43 +325,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable layerwise (blockwise) offloading on DiT modules.", ) - parser.add_argument( - "--vae-patch-parallel-size", - type=int, - default=1, - help="Number of GPUs used for VAE patch/tile parallelism (decode).", - ) - parser.add_argument( - "--use-hsdp", - action="store_true", - help="Enable HSDP (Hybrid Sharded Data Parallel) for diffusion models.", - ) - parser.add_argument( - "--hsdp-shard-size", - type=int, - default=1, - help="Number of GPUs to shard weights across for HSDP.", - ) - parser.add_argument( - "--hsdp-replicate-size", - type=int, - default=1, - help="Number of HSDP replica groups.", - ) parser.add_argument( "--enable-diffusion-pipeline-profiler", action="store_true", help="Enable diffusion pipeline profiler to display stage durations.", ) - parser.add_argument( - "--profiler-config", - type=parse_profiler_config, - default=None, - help='JSON profiler config for torch/cuda profiling, e.g. 
\'{"profiler":"torch","torch_profiler_dir":"./perf"}\'.', - ) - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) return parser.parse_args() @@ -446,11 +395,11 @@ def main(): enforce_eager=args.enforce_eager, enable_cpu_offload=args.enable_cpu_offload, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, - profiler_config=args.profiler_config, ) print("Pipeline loaded") - profiler_enabled = args.profiler_config is not None + # Check if profiling is requested via environment variable + profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) # Time profiling for generation print(f"\n{'=' * 60}") diff --git a/examples/offline_inference/image_to_image/image_to_image.md b/examples/offline_inference/image_to_image/image_to_image.md index 1c1a5ff3a79..2df248e034f 100644 --- a/examples/offline_inference/image_to_image/image_to_image.md +++ b/examples/offline_inference/image_to_image/image_to_image.md @@ -51,6 +51,5 @@ Key arguments: - `--vae-use-tiling`: enable VAE tiling for memory optimization. - `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](../../../docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel). - `--enable-cpu-offload`: enable CPU offloading for diffusion models. -- `--strength`: **Z-Image only** - controls the denoising start timestep for I2I (default: 0.6). Range: [0.0, 1.0]. Lower values preserve more of the original image; higher values allow more creative changes. > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. 
diff --git a/examples/offline_inference/image_to_video/README.md b/examples/offline_inference/image_to_video/README.md index a458850a02b..2692c76df26 100644 --- a/examples/offline_inference/image_to_video/README.md +++ b/examples/offline_inference/image_to_video/README.md @@ -59,13 +59,12 @@ Key arguments: - `--negative-prompt`: Optional list of artifacts to suppress. - `--boundary-ratio`: Boundary split ratio for two-stage MoE models. - `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p). -- `--sample-solver`: Wan2.2 sampling solver. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. - `--num-inference-steps`: Number of denoising steps (default 50). - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video). - `--output`: Path to save the generated video. - `--vae-use-slicing`: Enable VAE slicing for memory optimization. - `--vae-use-tiling`: Enable VAE tiling for memory optimization. -- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism/cfg_parallel.md). +- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](../../../docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel). - `--tensor-parallel-size`: tensor parallel size (effective for models that support TP, e.g. LTX2). - `--enable-cpu-offload`: enable CPU offloading for diffusion models. - `--use-hsdp`: Enable Hybrid Sharded Data Parallel to shard model weights across GPUs. @@ -75,6 +74,3 @@ Key arguments: > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. - -For Wan2.2 LightX2V-converted local Diffusers directories and related LoRA -assets, see the [LoRA guide](../../../docs/user_guide/diffusion/lora.md#wan22-lightx2v-offline-assembly). 
diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py index fea5178d89b..c8c55c485ad 100644 --- a/examples/offline_inference/image_to_video/image_to_video.py +++ b/examples/offline_inference/image_to_video/image_to_video.py @@ -33,10 +33,9 @@ """ import argparse -import json +import os import time from pathlib import Path -from typing import Any import numpy as np import PIL.Image @@ -49,16 +48,6 @@ from vllm_omni.platforms import current_omni_platform -def parse_profiler_config(value: str) -> dict[str, Any]: - try: - config = json.loads(value) - except json.JSONDecodeError as e: - raise argparse.ArgumentTypeError(f"--profiler-config must be valid JSON: {e}") from e - if not isinstance(config, dict): - raise argparse.ArgumentTypeError("--profiler-config must be a JSON object") - return config - - def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Generate a video from an image (Wan2.2, LTX2, HunyuanVideo-1.5).") parser.add_argument( @@ -95,13 +84,6 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--flow-shift", type=float, default=5.0, help="Scheduler flow_shift (5.0 for 720p, 12.0 for 480p)." ) - parser.add_argument( - "--sample-solver", - type=str, - default="unipc", - choices=["unipc", "euler"], - help="Sampling solver for Wan2.2 pipelines. 
Use 'euler' for Lightning/Distill setups.", - ) parser.add_argument("--output", type=str, default="i2v_output.mp4", help="Path to save the video (mp4).") parser.add_argument("--fps", type=int, default=None, help="Frames per second for the output video.") parser.add_argument( @@ -164,7 +146,7 @@ def parse_args() -> argparse.Namespace: "--audio-sample-rate", type=int, default=24000, - help="Sample rate for audio output when saved (default: 24000).", + help="Sample rate for audio output when saved (default: 24000 for LTX2).", ) parser.add_argument( "--cache-backend", @@ -205,15 +187,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable diffusion pipeline profiler to display stage durations.", ) - parser.add_argument( - "--profiler-config", - type=parse_profiler_config, - default=None, - help='JSON profiler config for torch/cuda profiling, e.g. \'{"profiler":"torch","torch_profiler_dir":"./perf"}\'.', - ) - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) return parser.parse_args() @@ -294,7 +267,8 @@ def main(): "rel_l1_thresh": 0.2, } - profiler_enabled = args.profiler_config is not None + # Check if profiling is requested via environment variable + profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) parallel_config = DiffusionParallelConfig( ulysses_degree=args.ulysses_degree, ring_degree=args.ring_degree, @@ -319,7 +293,6 @@ def main(): cache_backend=args.cache_backend, cache_config=cache_config, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, - profiler_config=args.profiler_config, ) if profiler_enabled: @@ -332,7 +305,6 @@ def main(): print(f" Model: {args.model}") print(f" Inference steps: {args.num_inference_steps}") print(f" Frames: {args.num_frames}") - print(f" Solver: {args.sample_solver}") print( f" Parallel configuration: cfg_parallel_size={args.cfg_parallel_size}," f" tensor_parallel_size={args.tensor_parallel_size}, 
vae_patch_parallel_size={args.vae_patch_parallel_size}" @@ -354,14 +326,9 @@ def main(): generator=generator, guidance_scale=guidance_scale, guidance_scale_2=args.guidance_scale_high, - boundary_ratio=args.boundary_ratio, num_inference_steps=num_inference_steps, num_frames=num_frames, frame_rate=frame_rate, - extra_args={ - "sample_solver": args.sample_solver, - "flow_shift": args.flow_shift, - }, ), ) generation_end = time.perf_counter() @@ -504,9 +471,15 @@ def _ensure_frame_list(video_array): video_array = _ensure_frame_list(video_array) - if audio is not None: - from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes + use_ltx2_export = is_ltx2 + encode_video = None + if use_ltx2_export: + try: + from diffusers.pipelines.ltx2.export_utils import encode_video + except ImportError: + encode_video = None + if use_ltx2_export and encode_video is not None: if isinstance(video_array, list): frames_np = np.stack(video_array, axis=0) elif isinstance(video_array, np.ndarray): @@ -517,24 +490,25 @@ def _ensure_frame_list(video_array): if frames_np.ndim == 4 and frames_np.shape[-1] == 4: frames_np = frames_np[..., :3] - frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype("uint8") - - audio_np = audio - if isinstance(audio_np, list): - audio_np = audio_np[0] if audio_np else None - if isinstance(audio_np, torch.Tensor): - audio_np = audio_np.detach().cpu().float().numpy() - if isinstance(audio_np, np.ndarray): - audio_np = np.squeeze(audio_np).astype(np.float32) - - video_bytes = mux_video_audio_bytes( - frames_u8, - audio_np, - fps=float(fps), - audio_sample_rate=args.audio_sample_rate, + audio_out = None + if audio is not None: + if isinstance(audio, list): + audio = audio[0] if audio else None + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if isinstance(audio, torch.Tensor): + audio_out = audio + if audio_out.dim() > 1: + audio_out = audio_out[0] + audio_out = audio_out.float().cpu() + + encode_video( + 
frames_np, + fps=fps, + audio=audio_out, + audio_sample_rate=args.audio_sample_rate if audio_out is not None else None, + output_path=str(output_path), ) - with open(str(output_path), "wb") as f: - f.write(video_bytes) else: export_to_video(video_array, str(output_path), fps=fps) print(f"Saved generated video to {output_path}") diff --git a/examples/offline_inference/magi_human/README.md b/examples/offline_inference/magi_human/README.md deleted file mode 100644 index 2b89093d941..00000000000 --- a/examples/offline_inference/magi_human/README.md +++ /dev/null @@ -1,72 +0,0 @@ -# MagiHuman Generation - -MagiHuman is an advanced, omni-modality model that generates both high-quality video and lip-synced audio from a text prompt. - -Because MagiHuman is a very large model featuring a powerful DiT MoE backbone and a ~9B parameter T5Gemma text encoder, it natively supports **Tensor Parallelism (TP)** in vLLM-Omni to run efficiently across multi-GPU setups, reducing device memory bottlenecks. - -## Setup - -### Install MagiCompiler (recommended) - -MagiHuman relies on [MagiCompiler](https://github.com/SandAI-org/MagiCompiler) for custom-op registration used by the DiT attention kernels. While the pipeline can fall back to stub implementations, installing MagiCompiler is **strongly recommended** for correct behaviour. - -```bash -# Clone the repo -git clone https://github.com/SandAI-org/MagiCompiler.git -cd MagiCompiler - -# System dependencies (optional, for FX graph visualization; Debian/Ubuntu) -sudo apt update && sudo apt install -y graphviz - -# Python dependencies -pip install -r requirements.txt - -# Install MagiCompiler -pip install . # end users (recommended) -# pip install -e . # developers (editable install) -``` - -### Hardware requirements - -Ensure your hardware has enough VRAM. 
For a standard node with 80GB GPUs, running with `--tensor-parallel-size 4` is recommended to shard both the MoE weights and the T5Gemma text encoder across 4 GPUs, reducing the per-GPU peak VRAM overhead significantly (by roughly ~13.5GB per GPU compared to single-device inference). - -Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) for further details on allocating memory. - -## Run Examples - -Get into the example folder: -```bash -cd examples/offline_inference/magi_human -``` - -### End-to-End Generation (Text to Video+Audio) - -Generate a video with synchronized speech natively generated by the model. - -```bash -python end2end.py \ - --model /proj-tango-pvc/users/zhipeng.wang/workspace/models/daVinci-MagiHuman \ - --prompt "A young woman with long, wavy golden blonde hair..." \ - --tensor-parallel-size 4 \ - --output output_magihuman.mp4 -``` - -## Common Parameters - -| Parameter | Default | Description | -|-----------|---------|-------------| -| `--model` | *(Required)* | Local model path or HuggingFace ID | -| `--prompt` | *(built-in demo prompt)* | Highly detailed text prompt dictating visual look and dialogue text | -| `--tensor-parallel-size` | `4` | Tensor parallelism size (Number of GPUs) | -| `--height` | `256` | Initial resolution height | -| `--width` | `448` | Initial resolution width | -| `--num-inference-steps` | `8` | Denoising steps | -| `--seed` | `52` | Random seed | -| `--output` | `output_magihuman.mp4` | Output video with audio path | - -## Example materials - -??? 
abstract "end2end.py" - ``````py - --8<-- "examples/offline_inference/magi_human/end2end.py" - `````` diff --git a/examples/offline_inference/magi_human/end2end.py b/examples/offline_inference/magi_human/end2end.py deleted file mode 100644 index 7ea8161385f..00000000000 --- a/examples/offline_inference/magi_human/end2end.py +++ /dev/null @@ -1,125 +0,0 @@ -import argparse - -from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes -from vllm_omni.entrypoints.omni import Omni -from vllm_omni.inputs.data import OmniDiffusionSamplingParams - - -def parse_args(): - parser = argparse.ArgumentParser(description="End-to-end inference script for MagiHuman.") - parser.add_argument("--model", type=str, required=True, help="Path or ID of the MagiHuman model.") - parser.add_argument( - "--prompt", - type=str, - default="", - help="Text prompt containing visual description, dialogue, and background sound.", - ) - parser.add_argument( - "--tensor-parallel-size", "-tp", type=int, default=4, help="Tensor parallel size (number of GPUs)." - ) - parser.add_argument( - "--output", type=str, default="output_magihuman.mp4", help="Path to save the generated mp4 file." 
- ) - parser.add_argument("--height", type=int, default=256, help="Video height.") - parser.add_argument("--width", type=int, default=448, help="Video width.") - parser.add_argument("--num-inference-steps", type=int, default=8, help="Number of denoising steps.") - parser.add_argument("--seed", type=int, default=52, help="Random seed for generation.") - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) - return parser.parse_args() - - -def main(): - args = parse_args() - - print(f"Initializing MagiHuman pipeline with TP={args.tensor_parallel_size}...") - omni = Omni( - model=args.model, - init_timeout=1200, - tensor_parallel_size=args.tensor_parallel_size, - devices=list(range(args.tensor_parallel_size)), - ) - - prompt = args.prompt - if not prompt: - prompt = ( - "A young woman with long, wavy golden blonde hair and bright blue eyes, " - "wearing a fitted ivory silk blouse with a delicate lace collar, sits " - "stationary in front of a softly lit, blurred warm-toned interior. Her " - "overall disposition is warm, composed, and gently confident. The camera " - "holds a static medium close-up, framing her from the shoulders up, " - "with shallow depth of field keeping her face in sharp focus. Soft " - "directional key light falls from the upper left, casting a gentle " - "highlight along her cheekbone and nose bridge. She draws a quiet breath, " - "the levator labii superiors relaxing as her lips part. She speaks in " - "clear, warm, unhurried American English: " - "\"The most beautiful things in life aren't things at all — " - "they're moments, feelings, and the people who make you feel truly alive.\" " - "Her jaw descends smoothly on each stressed syllable; the orbicularis oris " - "shapes each vowel with precision. A faint, genuine smile engages the " - "zygomaticus major, lifting her lip corners fractionally. Her brows rest " - "in a soft, neutral arch throughout. 
She maintains steady, forward-facing " - "eye contact. Head position remains level; no torso displacement occurs.\n\n" - "Dialogue:\n" - ": " - "\"The most beautiful things in life aren't things at all — " - "they're moments, feelings, and the people who make you feel truly alive.\"\n\n" - "Background Sound:\n" - "" - ) - - sampling_params = OmniDiffusionSamplingParams( - height=args.height, - width=args.width, - num_inference_steps=args.num_inference_steps, - seed=args.seed, - extra_args={ - "seconds": 5, - "sr_height": 1080, - "sr_width": 1920, - "sr_num_inference_steps": 5, - }, - ) - - print(f"Generating with prompt: {prompt[:80]}...") - outputs = omni.generate( - prompts=[prompt], - sampling_params_list=[sampling_params], - ) - - print(f"Generation complete. Output type: {type(outputs)}") - if outputs: - first = outputs[0] - - if hasattr(first, "images") and first.images: - video_frames = first.images[0] - print(f"Video frames: shape={video_frames.shape}, dtype={video_frames.dtype}") - - audio_waveform = None - mm = first.multimodal_output or {} - if mm: - audio_waveform = mm.get("audio") - if audio_waveform is not None: - print(f"Audio waveform: shape={audio_waveform.shape}, dtype={audio_waveform.dtype}") - - output_fps = float(mm.get("fps", 25)) - output_sr = int(mm.get("audio_sample_rate", 24000)) - print(f"Using fps={output_fps}, audio_sample_rate={output_sr} from model output") - - video_bytes = mux_video_audio_bytes( - video_frames, - audio_waveform, - fps=output_fps, - audio_sample_rate=output_sr, - ) - with open(args.output, "wb") as f: - f.write(video_bytes) - print(f"Saved MP4 ({len(video_bytes)} bytes) to {args.output}") - print("SUCCESS: MagiHuman pipeline generation completed.") - else: - print("WARNING: No outputs returned.") - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_image_summarize.py 
b/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_image_summarize.py index a742b535f69..ca87b9e9a94 100644 --- a/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_image_summarize.py +++ b/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_image_summarize.py @@ -19,7 +19,6 @@ from vllm.multimodal.image import convert_image_mode from vllm_omni import Omni -from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults DEFAULT_SYSTEM = "You are a helpful assistant." DEFAULT_QUESTION = "Please summarize the content of this image." @@ -49,7 +48,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable diffusion pipeline profiler to display stage durations.", ) - nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_t2i.py b/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_t2i.py index bd1282a2117..a4c41fee1f8 100644 --- a/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_t2i.py +++ b/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_t2i.py @@ -29,7 +29,6 @@ from vllm.sampling_params import SamplingParams from vllm_omni import Omni -from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) @@ -118,7 +117,6 @@ def parse_args() -> argparse.Namespace: ) p.add_argument("--out", type=str, default="output.png", help="Path to save the generated image.") p.add_argument("--trust-remote-code", action="store_true", help="Trust remote code when loading the model.") - nullify_stage_engine_defaults(p) args = p.parse_args() if not args.prompt: args.prompt = ["A stylish woman with sunglasses riding a motorcycle in NYC."] diff --git a/examples/offline_inference/mimo_audio/README.md b/examples/offline_inference/mimo_audio/README.md index 
5615dea5176..747e734cc24 100644 --- a/examples/offline_inference/mimo_audio/README.md +++ b/examples/offline_inference/mimo_audio/README.md @@ -39,6 +39,7 @@ Run a single sample for basic TTS: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft ``` @@ -47,6 +48,7 @@ Run batch samples for basic TTS: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft \ --num-prompts {batch_size} @@ -64,6 +66,7 @@ Generate speech from text input: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft \ --text "The weather is so nice today." @@ -75,6 +78,7 @@ Generate speech with explicit voice style instructions: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft_with_instruct \ --text "The weather is so nice today." \ @@ -87,6 +91,7 @@ Generate speech using an audio reference for voice cloning: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft_with_audio \ --text "The weather is so nice today." \ @@ -99,6 +104,7 @@ Generate speech from text containing natural voice descriptions: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type tts_sft_with_natural_instruction \ --text "In a panting young male voice, he said: I can't run anymore, wait for me!" 
@@ -110,6 +116,7 @@ Transcribe audio to text: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type audio_trancribing_sft \ --audio-path "./spoken_dialogue_assistant_turn_1.wav" @@ -121,6 +128,7 @@ Understand and analyze audio content with text queries: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type audio_understanding_sft \ --text "Summarize the audio." \ @@ -133,6 +141,7 @@ Audio understanding with reasoning chain: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type audio_understanding_sft_with_thinking \ --text "Summarize the audio." \ @@ -145,6 +154,7 @@ Multi-turn dialogue with audio input and output: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type spoken_dialogue_sft_multiturn \ --audio-path "./prompt_speech_zh_m.wav" @@ -158,6 +168,7 @@ Multi-turn dialogue converting speech to text: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type speech2text_dialogue_sft_multiturn ``` @@ -170,6 +181,7 @@ Multi-turn text-only dialogue: ```bash python3 -u end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ --model-name XiaomiMiMo/MiMo-Audio-7B-Instruct \ --query-type text_dialogue_sft_multiturn ``` @@ -178,6 +190,29 @@ Note: This task uses hardcoded message lists in the script. 
## Troubleshooting +### Audio dependencies (soundfile, librosa) + +This example depends on **soundfile** (read/write WAV) and **librosa** (load audio including MP3). Install the project requirements first: + +```bash +pip install -r requirements/common.txt +# or at least: pip install soundfile>=0.13.1 librosa>=0.11.0 +``` + +- **`soundfile` / libsndfile not found** + `soundfile` uses the C library **libsndfile**. On Linux, install the system package before pip: + - Debian/Ubuntu: `sudo apt-get install libsndfile1` + - For development builds: `sudo apt-get install libsndfile1-dev` + - Then: `pip install soundfile` + +- **`librosa` fails to load MP3 or reports "No backend available"** + Loading MP3 (e.g. in `spoken_dialogue_sft_multiturn` with `.mp3` files) uses **ffmpeg** as the backend. Install ffmpeg: + - Debian/Ubuntu: `sudo apt-get install ffmpeg` + - macOS: `brew install ffmpeg` + +- **`ImportError: No module named 'soundfile'` or `ModuleNotFoundError: ... librosa`** + Ensure you are in the same Python environment where vLLM Omni and the example dependencies are installed, and that `requirements/common.txt` (or the packages above) are installed. 
+ ### Tokenizer path - **`MIMO_AUDIO_TOKENIZER_PATH` not set or model fails to find tokenizer** diff --git a/examples/offline_inference/mimo_audio/end2end.py b/examples/offline_inference/mimo_audio/end2end.py index 9c652fe2b05..ae044d2e8a1 100644 --- a/examples/offline_inference/mimo_audio/end2end.py +++ b/examples/offline_inference/mimo_audio/end2end.py @@ -182,7 +182,7 @@ def main(args): omni = Omni( model=model_name, - deploy_config=args.deploy_config, + stage_configs_path=args.stage_configs_path, log_stats=args.enable_stats, log_file=("omni_pipeline.log" if args.enable_stats else None), init_sleep_seconds=args.init_sleep_seconds, @@ -309,10 +309,7 @@ def main(args): lines.append("Prompt:\n") lines.append(str(prompt_text) + "\n") lines.append("vllm_text_output:\n") - output_text = str(text_output) - if "" in output_text or "" in output_text: - output_text = output_text.replace("", "").replace("", "").strip() - lines.append(output_text + "\n") + lines.append(str(text_output).strip() + "\n") try: with open(out_txt, "w", encoding="utf-8") as f: print("lines", lines) @@ -354,7 +351,7 @@ def parse_args(): "--text", "-t", type=str, - default="", + default="The weather is so nice today.", help="input text", ) parser.add_argument( @@ -431,11 +428,10 @@ def parse_args(): help="Sampling rate for audio.", ) parser.add_argument( - "--deploy-config", + "--stage-configs-path", type=str, - default=None, - help="Override the deploy config path. 
If unset, auto-loads " - "vllm_omni/deploy/mimo_audio.yaml based on the HF model_type.", + default="../../../model_executor/stage_configs/mimo_audio.yaml", + help="Path to a stage configs file.", ) return parser.parse_args() diff --git a/examples/offline_inference/mimo_audio/message_convert.py b/examples/offline_inference/mimo_audio/message_convert.py index 416f21ccfaf..ebcc59c6b43 100644 --- a/examples/offline_inference/mimo_audio/message_convert.py +++ b/examples/offline_inference/mimo_audio/message_convert.py @@ -5,12 +5,12 @@ import re from collections.abc import Callable +import librosa import numpy as np import torch import torchaudio from process_speechdata import InputSegment, StreamingInputSegment from torchaudio.transforms import MelSpectrogram -from vllm.multimodal.media.audio import load_audio speech_zeroemb_idx = 151667 empty_token = "<|empty|>" @@ -685,7 +685,7 @@ def get_audio_data(audio_url): # File path audio_file = audio_url - audio_signal, sr = load_audio(audio_file, sr=24000) + audio_signal, sr = librosa.load(audio_file, sr=24000) audio_data = (audio_signal.astype(np.float32), sr) return audio_data diff --git a/examples/offline_inference/ming_flash_omni/README.md b/examples/offline_inference/ming_flash_omni/README.md deleted file mode 100644 index be90b408d14..00000000000 --- a/examples/offline_inference/ming_flash_omni/README.md +++ /dev/null @@ -1,92 +0,0 @@ -# Ming-flash-omni 2.0 - -[Ming-flash-omni-2.0](https://github.com/inclusionAI/Ming) is an omni-modal model supporting text, image, video, and audio understanding, with text and speech outputs. 
- -vLLM-Omni supports two deployment modes: - -| Mode | Stage config | Output | -|------|-------------|--------| -| Thinker only (multimodal understanding) | `ming_flash_omni_thinker.yaml` (default `--omni`) | Text | -| Thinker + Talker (omni-speech) | `ming_flash_omni.yaml` | Text + Audio | - -For standalone TTS (talker only), see [`examples/offline_inference/ming_flash_omni_tts/`](../ming_flash_omni_tts/). - -## Setup - -Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. - -The default `--omni` flag runs thinker only. For omni-speech, pass the two-stage config explicitly: - -```bash ---stage-configs-path vllm_omni/model_executor/stage_configs/ming_flash_omni.yaml -``` - -## Run examples - -The end-to-end script defaults to built-in assets; pass `--image-path`, -`--audio-path`, or `--video-path` to override. - -```bash -# Text-only -python examples/offline_inference/ming_flash_omni/end2end.py --query-type text - -# Image / audio / video / mixed understanding -python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image -python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio -python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_video --num-frames 16 -python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_mixed_modalities \ - --image-path /path/to/image.jpg --audio-path /path/to/audio.wav -``` - -#### Reasoning (Thinking Mode) - -Reasoning ("detailed thinking on") is applied by the script when -`--query-type reasoning` is set. The default prompt matches Ming's cookbook -and expects the reference figure from the upstream repo — see -`get_reasoning_query` in `end2end.py`. 
- -```bash -python examples/offline_inference/ming_flash_omni/end2end.py -q reasoning --image-path ./3_0.png -``` - -### Omni-speech (thinker + talker) - -To enable spoken output, use the two-stage config and request `audio` (or `text,audio`) modalities. -The thinker processes your multimodal input, generates text, then the talker synthesises the response as speech. - -**Audio-only output** (speech response, no text): -```bash -python examples/offline_inference/ming_flash_omni/end2end.py \ - --query-type text \ - --stage-configs-path vllm_omni/model_executor/stage_configs/ming_flash_omni.yaml \ - --modalities audio \ - --output-dir output_ming_omni_speech -``` - -**Both text and audio output**: -```bash -python examples/offline_inference/ming_flash_omni/end2end.py \ - --query-type use_audio \ - --stage-configs-path vllm_omni/model_executor/stage_configs/ming_flash_omni.yaml \ - --modalities text,audio \ - --output-dir output_ming_omni_speech -``` - -Generated `.wav` files are saved to `--output-dir` (default `output_ming`), one per request. - -The stage config allocates thinker on GPUs 0–3 and talker on GPU 3 by default. Adjust `devices` in the YAML to match your hardware. - -### Modality control - -| `--modalities` | Thinker output | Talker | Saved files | -|---------------|----------------|--------|-------------| -| `text` (default) | Text | Not run | `.txt` | -| `audio` | Text (internal) | Runs | `.wav` | -| `text,audio` | Text | Runs | `.txt` + `.wav` | - -Pass `--stage-configs-path /path/to/your_config.yaml` to any of the commands -above to override the stage config. - -## Online serving - -For online serving via the OpenAI-compatible API, see [examples/online_serving/ming_flash_omni/README.md](../../online_serving/ming_flash_omni/README.md). 
diff --git a/examples/offline_inference/ming_flash_omni/end2end.py b/examples/offline_inference/ming_flash_omni/end2end.py deleted file mode 100644 index e00dcea7bb3..00000000000 --- a/examples/offline_inference/ming_flash_omni/end2end.py +++ /dev/null @@ -1,507 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# Partial example cases are referred from -# https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/cookbook.ipynb -import os -import time -from typing import NamedTuple - -import numpy as np -import soundfile as sf -import vllm -from PIL import Image -from transformers import AutoProcessor -from vllm import SamplingParams -from vllm.assets.audio import AudioAsset -from vllm.assets.image import ImageAsset -from vllm.assets.video import VideoAsset, video_to_ndarrays -from vllm.multimodal.image import convert_image_mode -from vllm.multimodal.media.audio import load_audio -from vllm.utils.argparse_utils import FlexibleArgumentParser - -import vllm_omni -from vllm_omni.entrypoints.omni import Omni - -# Imports the processor also registers itself -from vllm_omni.transformers_utils.processors.ming import MingFlashOmniProcessor # noqa: F401 - -SEED = 42 -MODEL_NAME = "Jonathan1909/Ming-flash-omni-2.0" - - -class QueryResult(NamedTuple): - inputs: dict - limit_mm_per_prompt: dict[str, int] - - -def get_text_query(processor: MingFlashOmniProcessor, question: str | None = None) -> QueryResult: - if question is None: - question = "请详细介绍鹦鹉的生活习性。" - conversation = [{"role": "HUMAN", "content": question}] - prompt = processor.apply_chat_template(conversation, tokenize=False) - return QueryResult( - inputs={"prompt": prompt}, - limit_mm_per_prompt={}, - ) - - -def get_image_query( - processor: MingFlashOmniProcessor, - question: str | None = None, - image_path: str | None = None, -) -> QueryResult: - if question is None: - question = "Describe this image in detail." 
- - if image_path: - if not os.path.exists(image_path): - raise FileNotFoundError(f"Image file not found: {image_path}") - image_data = convert_image_mode(Image.open(image_path), "RGB") - else: - image_data = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") - - conversation = [ - { - "role": "HUMAN", - "content": [ - {"type": "image", "image": image_data}, - {"type": "text", "text": question}, - ], - } - ] - prompt = processor.apply_chat_template(conversation, tokenize=False) - - return QueryResult( - inputs={ - "prompt": prompt, - "multi_modal_data": {"image": image_data}, - }, - limit_mm_per_prompt={"image": 1}, - ) - - -def get_audio_query( - processor: MingFlashOmniProcessor, - question: str | None = None, - audio_path: str | None = None, - sampling_rate: int = 16000, -) -> QueryResult: - if question is None: - question = "Please recognize the language of this speech and transcribe it. Format: oral." - - if audio_path: - if not os.path.exists(audio_path): - raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = load_audio(audio_path, sr=sampling_rate) - audio_data = (audio_signal.astype(np.float32), sr) - else: - audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate - - # Use a string for "audio" so the processor counts it as 1 audio input - conversation = [ - { - "role": "HUMAN", - "content": [ - {"type": "audio", "audio": "input"}, - {"type": "text", "text": question}, - ], - } - ] - prompt = processor.apply_chat_template(conversation, tokenize=False) - - return QueryResult( - inputs={ - "prompt": prompt, - "multi_modal_data": {"audio": audio_data}, - }, - limit_mm_per_prompt={"audio": 1}, - ) - - -def get_video_query( - processor: MingFlashOmniProcessor, - question: str | None = None, - video_path: str | None = None, - num_frames: int = 16, -) -> QueryResult: - if question is None: - question = "Describe what is happening in this video." 
- - if video_path: - if not os.path.exists(video_path): - raise FileNotFoundError(f"Video file not found: {video_path}") - video_frames = video_to_ndarrays(video_path, num_frames=num_frames) - else: - video_frames = VideoAsset(name="baby_reading", num_frames=num_frames).np_ndarrays - - conversation = [ - { - "role": "HUMAN", - "content": [ - {"type": "video"}, - {"type": "text", "text": question}, - ], - } - ] - prompt = processor.apply_chat_template(conversation, tokenize=False) - - return QueryResult( - inputs={ - "prompt": prompt, - "multi_modal_data": {"video": video_frames}, - }, - limit_mm_per_prompt={"video": 1}, - ) - - -def get_mixed_modalities_query( - processor: MingFlashOmniProcessor, - image_path: str | None = None, - audio_path: str | None = None, - sampling_rate: int = 16000, -) -> QueryResult: - """Mixed image + audio understanding.""" - question = "Describe the image, and recognize the language of this speech and transcribe it. Format: oral" - - if image_path: - if not os.path.exists(image_path): - raise FileNotFoundError(f"Image file not found: {image_path}") - image_data = convert_image_mode(Image.open(image_path), "RGB") - else: - image_data = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") - - if audio_path: - if not os.path.exists(audio_path): - raise FileNotFoundError(f"Audio file not found: {audio_path}") - sig, sr = load_audio(audio_path, sr=sampling_rate) - audio_data = (sig.astype(np.float32), sr) - else: - audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate - - conversation = [ - { - "role": "HUMAN", - "content": [ - {"type": "image", "image": image_data}, - {"type": "audio", "audio": "input"}, - {"type": "text", "text": question}, - ], - } - ] - prompt = processor.apply_chat_template(conversation, tokenize=False) - - return QueryResult( - inputs={ - "prompt": prompt, - "multi_modal_data": {"image": image_data, "audio": audio_data}, - }, - limit_mm_per_prompt={"image": 1, "audio": 1}, - ) - - -def 
get_reasoning_query( - processor: MingFlashOmniProcessor, - question: str | None = None, - image_path: str | None = None, -) -> QueryResult: - if question is None: - # NOTE: To use the following default question, input with example figure provided by Ming - # https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/figures/cases/3_0.png - # E.g., - # python examples/offline_inference/ming_flash_omni/end2end.py -q reasoning --image-path ./3_0.png - # Otherwise, the problem solving might be false. - question = ( - "Based on the following rules:\n•\tYou control the smiley face character\n" - "•\tYou can move up, down, left, and right, and only a single square at a time\n" - "•\tWalls are dark grey and cannot be moved into\n•\tThe brown square is a box\n•" - "\tThe box can be pushed by moving into it (i.e., if you are in the square " - "adjacent to the box to the left, and move onto the square with the box, " - "the box will move one square to the right).\n" - "•\tThe box cannot be pushed into walls\n" - "•\tThe blue door at the bottom is locked and cannot be passed through, " - "unless the box is placed on the blue square\n" - "•\tThe square beneath the blue door is the exit\n" - "•\tMoving from one square to another\n\n" - "Let's assume a coordinate system where the smiley face is " - "on the top left at (1,1) and the square below it is (1,2). " - "The smiley face performs the following moves: {down, right, right, right}, " - "such that the smiley face is at square (4,2) and the box is in square (5,2). " - "What are the next sequence of moves that must be done to move the box down to (5,3)? " - "Give your answer as a comma separated list." 
- ) - - if image_path: - if not os.path.exists(image_path): - raise FileNotFoundError(f"Image file not found: {image_path}") - image_data = convert_image_mode(Image.open(image_path), "RGB") - conversation = [ - { - "role": "HUMAN", - "content": [ - {"type": "image", "image": image_data}, - {"type": "text", "text": question}, - ], - } - ] - prompt = processor.apply_chat_template(conversation, tokenize=False, use_cot_system_prompt=True) - return QueryResult( - inputs={ - "prompt": prompt, - "multi_modal_data": {"image": image_data}, - }, - limit_mm_per_prompt={"image": 1}, - ) - - conversation = [{"role": "HUMAN", "content": question}] - prompt = processor.apply_chat_template(conversation, tokenize=False, use_cot_system_prompt=True) - return QueryResult( - inputs={"prompt": prompt}, - limit_mm_per_prompt={}, - ) - - -query_map = { - "text": get_text_query, - "use_audio": get_audio_query, - "use_image": get_image_query, - "use_video": get_video_query, - "use_mixed_modalities": get_mixed_modalities_query, - "reasoning": get_reasoning_query, -} - - -def main(args): - print( - "=" * 20, - "\n", - f"vllm version: {vllm.__version__}\n", - f"vllm-omni version: {vllm_omni.__version__}\n", - "=" * 20, - sep="", - ) - - processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True) - assert isinstance(processor, MingFlashOmniProcessor), f"Wrong processor type being used: {type(processor)}" - - query_func = query_map[args.query_type] - if args.query_type == "use_image": - query_result = query_func(processor, image_path=args.image_path) - elif args.query_type == "use_audio": - query_result = query_func(processor, audio_path=args.audio_path, sampling_rate=args.sampling_rate) - elif args.query_type == "use_video": - query_result = query_func(processor, video_path=args.video_path, num_frames=args.num_frames) - elif args.query_type == "use_mixed_modalities": - query_result = query_func( - processor, - image_path=args.image_path, - audio_path=args.audio_path, - 
sampling_rate=args.sampling_rate, - ) - elif args.query_type == "reasoning": - query_result = query_func(processor, image_path=args.image_path) - else: - query_result = query_func(processor) - - # Initialize Omni (with thinker-only stage config) - omni = Omni( - model=MODEL_NAME, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - init_timeout=args.init_timeout, - stage_init_timeout=args.stage_init_timeout, - ) - - # Thinker sampling params - thinker_sampling_params = SamplingParams( - temperature=0.4, - top_p=0.9, - max_tokens=args.max_tokens, - repetition_penalty=1.05, - seed=SEED, - detokenize=True, - ) - # Talker (ming_tts) uses a custom generation loop (CFM + AudioVAE); - # vLLM sampling is a no-op here — max_tokens=1 just satisfies the scheduler. - talker_sampling_params = SamplingParams( - temperature=0.0, - max_tokens=1, - ) - all_sampling_params = [thinker_sampling_params, talker_sampling_params] - # Match sampling params to the number of configured stages - # (thinker-only yaml → 1, thinker+talker yaml → 2). 
- sampling_params_list = all_sampling_params[: omni.num_stages] - - prompts = [query_result.inputs for _ in range(args.num_prompts)] - - if args.modalities is not None: - output_modalities = args.modalities.split(",") - for prompt in prompts: - prompt["modalities"] = output_modalities - - total_requests = len(prompts) - processed_count = 0 - print(f"Query type: {args.query_type}") - print(f"Number of prompts: {total_requests}") - - output_dir = args.output_dir - os.makedirs(output_dir, exist_ok=True) - - profiler_enabled = args.enable_profiler - if profiler_enabled: - omni.start_profile(stages=args.profiler_stages) - - for stage_outputs in omni.generate(prompts, sampling_params_list): - output = stage_outputs.request_output - if stage_outputs.final_output_type == "text": - request_id = output.request_id - text_output = output.outputs[0].text - lines = [] - lines.append("Prompt:\n") - lines.append(str(output.prompt) + "\n") - lines.append("Text Output:\n") - lines.append(str(text_output).strip() + "\n") - print(*lines, sep="") - - # Save to file - out_txt = os.path.join(output_dir, f"{request_id}.txt") - try: - with open(out_txt, "w", encoding="utf-8") as f: - f.writelines(lines) - print(f"Request ID: {request_id}, text saved to {out_txt}") - except Exception as e: - print(f"Failed to write output file {out_txt}: {e}") - - elif stage_outputs.final_output_type == "audio": - request_id = output.request_id - mm = output.outputs[0].multimodal_output - if mm and "audio" in mm: - audio = mm["audio"] - sr_raw = mm.get("sr", 44100) - sample_rate = int(sr_raw.item() if hasattr(sr_raw, "item") else sr_raw) - audio_numpy = audio.float().squeeze().cpu().numpy() - output_wav = os.path.join(output_dir, f"{request_id}.wav") - sf.write(output_wav, audio_numpy, samplerate=sample_rate, format="WAV") - print( - f"Request ID: {request_id}, audio saved to {output_wav} " - f"({len(audio_numpy) / sample_rate:.2f}s, {sample_rate}Hz)" - ) - - processed_count += 1 - if profiler_enabled and 
processed_count >= total_requests: - print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") - # Stop the profiler while workers are still alive - omni.stop_profile(stages=args.profiler_stages) - - print("[Info] Waiting 30s for workers to write trace files to disk...") - time.sleep(30) - print("[Info] Trace export wait time finished.") - - omni.close() - - -def parse_args(): - parser = FlexibleArgumentParser(description="Ming-flash-omni 2.0 offline inference example") - parser.add_argument( - "--query-type", - "-q", - type=str, - default="text", - choices=query_map.keys(), - help="Query type.", - ) - parser.add_argument( - "--stage-configs-path", - type=str, - default=None, - help="Path to a stage configs YAML file.", - ) - parser.add_argument( - "--log-stats", - action="store_true", - default=False, - help="Enable detailed statistics logging.", - ) - parser.add_argument("--init-timeout", type=int, default=2000, help="Timeout for initializing in seconds.") - parser.add_argument( - "--stage-init-timeout", - type=int, - default=2000, - help="Timeout for initializing a single stage in seconds.", - ) - parser.add_argument( - "--enable-profiler", - action="store_true", - default=False, - help="Enables profiling when set.", - ) - parser.add_argument( - "--profiler-stages", - type=int, - nargs="*", - default=[0], - help="List of stage IDs to profile. If not set, profiles all stages.", - ) - parser.add_argument( - "--image-path", - "-i", - type=str, - default=None, - help="Path to local image file. Uses default asset if not provided.", - ) - parser.add_argument( - "--audio-path", - "-a", - type=str, - default=None, - help="Path to local audio file. Uses default asset if not provided.", - ) - parser.add_argument( - "--video-path", - "-v", - type=str, - default=None, - help="Path to local video file. 
Uses default asset if not provided.", - ) - parser.add_argument( - "--num-frames", - type=int, - default=16, - help="Number of frames to extract from video.", - ) - parser.add_argument( - "--sampling-rate", - type=int, - default=16000, - help="Sampling rate for audio loading.", - ) - parser.add_argument( - "--max-tokens", - type=int, - default=16384, - help="Maximum tokens to generate.", - ) - parser.add_argument( - "--num-prompts", - type=int, - default=1, - help="Number of prompts to generate.", - ) - parser.add_argument( - "--modalities", - type=str, - default=None, - help="Output modalities (comma-separated).", - ) - parser.add_argument( - "--output-dir", - type=str, - default="output_ming", - help="Output directory for results.", - ) - - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/offline_inference/ming_flash_omni_tts/README.md b/examples/offline_inference/ming_flash_omni_tts/README.md deleted file mode 100644 index 15b84041df2..00000000000 --- a/examples/offline_inference/ming_flash_omni_tts/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# Ming-flash-omni Standalone TTS (Offline) - -This example runs **Ming-flash-omni-2.0 talker-only** offline inference with: - -- `model`: `Jonathan1909/Ming-flash-omni-2.0` -- `stage config`: `vllm_omni/model_executor/stage_configs/ming_flash_omni_tts.yaml` - -It follows the Ming cookbook parameter style: - -- `prompt`: `"Please generate speech based on the following description.\n"` -- `max_decode_steps`: `200` -- `cfg`: `2.0` -- `sigma`: `0.25` -- `temperature`: `0.0` - -## Quick Start - -```bash -python examples/offline_inference/ming_flash_omni_tts/end2end.py --case style -``` - -## Cases - -```bash -# Style -python examples/offline_inference/ming_flash_omni_tts/end2end.py --case style - -# IP -python examples/offline_inference/ming_flash_omni_tts/end2end.py --case ip - -# Basic (speed/pitch/volume control) -python 
examples/offline_inference/ming_flash_omni_tts/end2end.py --case basic -``` - -## Useful Arguments - -- `--text`: override default text in the selected case -- `--output`: custom output wav path -- `--model`: local model path or HF repo id -- `--stage-configs-path`: custom talker stage config path -- `--log-stats`: enable runtime stats logs - -## Notes - -- This directory is for **standalone talker deployment (TTS)**. -- For Ming thinker multimodal understanding examples, see: - `examples/offline_inference/ming_flash_omni/`. diff --git a/examples/offline_inference/ming_flash_omni_tts/end2end.py b/examples/offline_inference/ming_flash_omni_tts/end2end.py deleted file mode 100644 index 928994510a6..00000000000 --- a/examples/offline_inference/ming_flash_omni_tts/end2end.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Offline e2e example for Ming-flash-omni-2.0 standalone talker (TTS).""" - -import os -from typing import Any - -import soundfile as sf -import torch -from vllm.utils.argparse_utils import FlexibleArgumentParser - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - -from vllm_omni.entrypoints.omni import Omni -from vllm_omni.inputs.data import OmniTokensPrompt -from vllm_omni.model_executor.models.ming_flash_omni.prompt_utils import ( - DEFAULT_PROMPT, - create_instruction, -) - -MODEL_NAME = "Jonathan1909/Ming-flash-omni-2.0" -DEFAULT_STAGE_CONFIG = "vllm_omni/model_executor/stage_configs/ming_flash_omni_tts.yaml" - - -def get_messages(case: str, text_override: str | None) -> dict[str, Any]: - if case == "style": - text = text_override or "我会一直在这里陪着你,直到你慢慢、慢慢地沉入那个最温柔的梦里……好吗?" 
- instruction = create_instruction( - { - "风格": "这是一种ASMR耳语,属于一种旨在引发特殊感官体验的创意风格。这个女性使用轻柔的普通话进行耳语,声音气音成分重。音量极低,紧贴麦克风,语速极慢,旨在制造触发听者颅内快感的声学刺激。", - } - ) - return { - "prompt": DEFAULT_PROMPT, - "text": text, - "instruction": instruction, - "use_zero_spk_emb": True, - } - if case == "ip": - text = text_override or "这款产品的名字,叫变态坑爹牛肉丸。" - return { - "prompt": DEFAULT_PROMPT, - "text": text, - "instruction": create_instruction({"IP": "灵小甄"}), - "use_zero_spk_emb": True, - } - if case == "basic": - text = text_override or "我们当迎着阳光辛勤耕作,去摘取,去制作,去品尝,去馈赠。" - return { - "prompt": DEFAULT_PROMPT, - "text": text, - "instruction": create_instruction({"语速": "快速", "基频": "中", "音量": "中"}), - "use_zero_spk_emb": True, - } - raise ValueError(f"Unknown case: {case}") - - -def save_audio(mm: dict[str, Any], output_path: str) -> None: - if not mm or "audio" not in mm: - raise RuntimeError("No audio found in model output") - audio = mm["audio"] - sr_raw = mm.get("sr", 44100) - if isinstance(sr_raw, torch.Tensor): - sample_rate = int(sr_raw.item()) - else: - sample_rate = int(sr_raw) - waveform = audio.squeeze().float().cpu().numpy() - sf.write(output_path, waveform, sample_rate) - print(f"Saved {output_path} ({len(waveform) / sample_rate:.2f}s, {sample_rate}Hz)") - - -def parse_args(): - parser = FlexibleArgumentParser(description="Ming-flash-omni standalone talker offline e2e example") - parser.add_argument("--model", type=str, default=MODEL_NAME, help="Model name or local path.") - parser.add_argument( - "--stage-configs-path", - type=str, - default=DEFAULT_STAGE_CONFIG, - help="Path to stage configs yaml for standalone talker deployment.", - ) - parser.add_argument( - "--case", - type=str, - default="style", - choices=["style", "ip", "basic"], - help="Example case.", - ) - parser.add_argument("--text", type=str, default=None, help="Override default text for the selected case.") - parser.add_argument("--output", type=str, default=None, help="Output wav path.") - 
parser.add_argument("--log-stats", action="store_true", default=False, help="Enable stats logging.") - parser.add_argument("--init-timeout", type=int, default=600, help="Engine init timeout in seconds.") - parser.add_argument("--stage-init-timeout", type=int, default=300, help="Single stage init timeout in seconds.") - return parser.parse_args() - - -def main(): - args = parse_args() - - omni = Omni( - model=args.model, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - init_timeout=args.init_timeout, - stage_init_timeout=args.stage_init_timeout, - ) - - messages = get_messages(args.case, args.text) - decode_args = { - # Standalone TTS deployment - "ming_task": "instruct", - "max_decode_steps": 200, - "cfg": 2.0, - "sigma": 0.25, - "temperature": 0.0, - } - req = OmniTokensPrompt( - prompt_token_ids=[0], - additional_information={**messages, **decode_args}, - ) - - outputs = omni.generate(req) - mm = outputs[0].outputs[0].multimodal_output - - output_path = args.output or f"output_{args.case}.wav" - save_audio(mm, output_path) - omni.close() - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/omnivoice/end2end.py b/examples/offline_inference/omnivoice/end2end.py index cc6f585c50e..b41379b011a 100644 --- a/examples/offline_inference/omnivoice/end2end.py +++ b/examples/offline_inference/omnivoice/end2end.py @@ -89,6 +89,7 @@ def run_e2e(): omni = Omni( model=args.model, stage_configs_path=args.stage_config, + trust_remote_code=True, log_stats=True, ) @@ -102,9 +103,9 @@ def run_e2e(): if not os.path.exists(args.ref_audio): raise FileNotFoundError(f"Reference audio not found: {args.ref_audio}") - from vllm.multimodal.media.audio import load_audio + import librosa - audio_signal, sr = load_audio(args.ref_audio, sr=None) + audio_signal, sr = librosa.load(args.ref_audio, sr=None) multi_modal_data["audio"] = (audio_signal.astype(np.float32), sr) mm_processor_kwargs["ref_text"] = args.ref_text or "" 
mm_processor_kwargs["sample_rate"] = sr diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md index e2eae8a96b5..20740a0da02 100644 --- a/examples/offline_inference/qwen2_5_omni/README.md +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -60,3 +60,11 @@ If media file paths are not provided, the script will use default assets. Suppor - `mixed_modalities`: Audio + image + video - `use_audio_in_video`: Extract audio from video - `text`: Text-only query + +### FAQ + +If you encounter error about backend of librosa, try to install ffmpeg with command below. +``` +sudo apt update +sudo apt install ffmpeg +``` diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index a65c554a9b0..7bba5998308 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -5,11 +5,11 @@ with the correct prompt format on Qwen2.5-Omni """ -import json import os import time from typing import NamedTuple +import librosa import numpy as np import soundfile as sf from PIL import Image @@ -17,7 +17,6 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode -from vllm.multimodal.media.audio import load_audio from vllm.sampling_params import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -97,7 +96,7 @@ def get_mixed_modalities_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = load_audio(audio_path, sr=sampling_rate) + audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -131,7 +130,7 @@ def get_use_audio_in_video_query( raise FileNotFoundError(f"Video file 
not found: {video_path}") video_frames = video_to_ndarrays(video_path, num_frames=num_frames) # Extract audio from video file - audio_signal, sr = load_audio(video_path, sr=sampling_rate) + audio_signal, sr = librosa.load(video_path, sr=sampling_rate) audio = (audio_signal.astype(np.float32), sr) else: asset = VideoAsset(name="baby_reading", num_frames=num_frames) @@ -166,7 +165,7 @@ def get_multi_audios_query(audio_path: str | None = None, sampling_rate: int = 1 if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = load_audio(audio_path, sr=sampling_rate) + audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) # Use the provided audio as the first audio, default as second audio_list = [ @@ -262,7 +261,7 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = load_audio(audio_path, sr=sampling_rate) + audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -290,10 +289,7 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin def main(args): - model_name = args.model - quantization_config = None - if args.quantization_config is not None: - quantization_config = json.loads(args.quantization_config) + model_name = "Qwen/Qwen2.5-Omni-7B" # Get paths from args video_path = getattr(args, "video_path", None) @@ -324,8 +320,14 @@ def main(args): query_result = query_func(audio_path=audio_path, sampling_rate=sampling_rate) else: query_result = query_func() - args.quantization_config = quantization_config - omni = Omni.from_cli_args(args, model=model_name) + omni = Omni( + model=model_name, + log_stats=args.log_stats, + 
stage_init_timeout=args.stage_init_timeout, + batch_timeout=args.batch_timeout, + init_timeout=args.init_timeout, + shm_threshold_bytes=args.shm_threshold_bytes, + ) thinker_sampling_params = SamplingParams( temperature=0.0, # Deterministic - no randomness top_p=1.0, # Disable nucleus sampling @@ -429,18 +431,6 @@ def main(args): def parse_args(): parser = FlexibleArgumentParser(description="Demo on using vLLM for offline inference with audio language models") - parser.add_argument( - "--model", - type=str, - default="Qwen/Qwen2.5-Omni-7B", - help="Model name or local path.", - ) - parser.add_argument( - "--quantization-config", - type=str, - default=None, - help="Optional JSON string forwarded to Omni(quantization_config=...).", - ) parser.add_argument( "--query-type", "-q", diff --git a/examples/offline_inference/qwen3_omni/README.md b/examples/offline_inference/qwen3_omni/README.md index 0710faa133c..b3e8592532e 100644 --- a/examples/offline_inference/qwen3_omni/README.md +++ b/examples/offline_inference/qwen3_omni/README.md @@ -70,8 +70,8 @@ For true stage-level concurrency -- where downstream stages (Talker, Code2Wav) start **before** the upstream stage (Thinker) finishes -- use the async_chunk example. This requires: -1. A deploy config YAML with ``async_chunk: true`` (e.g. - ``qwen3_omni_moe.yaml``). +1. A stage config YAML with ``async_chunk: true`` (e.g. + ``qwen3_omni_moe_async_chunk.yaml``). 2. Hardware that matches the config (e.g. 2x H100 for the default 3-stage config). @@ -101,10 +101,18 @@ python end2end_async_chunk.py --query-type text --modalities text ```bash python end2end_async_chunk.py \ --query-type use_audio \ - --deploy-config /path/to/your_deploy_config.yaml + --stage-configs-path /path/to/your_async_chunk.yaml ``` > **Note**: The synchronous ``end2end.py`` (using ``Omni``) is still the > recommended entry point for non-async-chunk workflows. 
Only use the > async_chunk example when you need the stage-level concurrency semantics > described in PR #962 / #1151. + +### FAQ + +If you encounter error about backend of librosa, try to install ffmpeg with command below. +``` +sudo apt update +sudo apt install ffmpeg +``` diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 04aa7914db1..155eca4ed9f 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -9,6 +9,7 @@ import time from typing import NamedTuple +import librosa import numpy as np import soundfile as sf import vllm @@ -18,10 +19,8 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode -from vllm.multimodal.media.audio import load_audio from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni SEED = 42 @@ -130,7 +129,7 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = load_audio(audio_path, sr=sampling_rate) + audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -184,7 +183,7 @@ def get_mixed_modalities_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = load_audio(audio_path, sr=sampling_rate) + audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -295,7 +294,14 @@ def main(args): else: 
query_result = query_func() - omni = Omni.from_cli_args(args, model=model_name) + omni = Omni( + model=model_name, + dtype=args.dtype, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + init_timeout=args.init_timeout, + ) thinker_sampling_params = SamplingParams( temperature=0.9, @@ -551,7 +557,6 @@ def parse_args(): help="Model dtype (auto, half, float16, bfloat16, float, float32).", ) - nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py index 85c2da20b04..8adbae9eb66 100644 --- a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py +++ b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py @@ -14,7 +14,7 @@ Usage ----- python end2end_async_chunk.py --query-type use_audio \ - --deploy-config + --stage-configs-path See ``--help`` for all options. """ @@ -32,13 +32,13 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +import librosa from PIL import Image from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode -from vllm.multimodal.media.audio import load_audio from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni.entrypoints.async_omni import AsyncOmni @@ -89,7 +89,7 @@ def get_audio_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = load_audio(audio_path, sr=sampling_rate) + audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -179,26 +179,20 @@ def clone_prompt_for_request(template: dict) -> dict: return cloned -def 
_default_deploy_config_path() -> str | None: - """Best-effort default deploy config for running Qwen3-Omni with async_chunk. +def _default_async_chunk_stage_configs_path() -> str | None: + """Best-effort default stage config for running Qwen3-Omni with async_chunk. - The default ``vllm_omni/deploy/qwen3_omni_moe.yaml`` ships with - ``async_chunk: true`` at the top level, so loading it is enough to - enable async-chunk semantics. To disable it, copy the YAML and set - ``async_chunk: false`` (or pass ``--deploy-config`` to a YAML that - overrides the flag). - - When this example is executed from within the repository, we resolve - the default YAML path relative to this file. When installed elsewhere, - the file may not exist and callers should pass ``--deploy-config`` - explicitly. + When this example is executed from within the repository, we resolve the + default YAML path relative to this file. When installed elsewhere, the + file may not exist and callers should pass --stage-configs-path explicitly. 
""" repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) candidate = os.path.join( repo_root, "vllm_omni", - "deploy", - "qwen3_omni_moe.yaml", + "model_executor", + "stage_configs", + "qwen3_omni_moe_async_chunk.yaml", ) return candidate if os.path.exists(candidate) else None @@ -242,7 +236,8 @@ async def run_single_request( if stage_0_first_output_ts is None: stage_0_first_output_ts = time.perf_counter() text_output = output.outputs[0].text - text_parts.append(text_output) + if output.finished: + text_parts.append(text_output) elif omni_output.final_output_type == "audio": mm_out = output.outputs[0].multimodal_output if mm_out and "audio" in mm_out: @@ -292,7 +287,7 @@ async def run_single_request( if text_parts: text_file = os.path.join(output_dir, f"{request_id}.txt") with open(text_file, "w", encoding="utf-8") as f: - f.write("".join(text_parts)) + f.write("\n".join(text_parts)) result["saved_files"].append(text_file) print( f"[Request {request_id}] Text saved to {text_file} " @@ -379,23 +374,18 @@ async def run_all(args): prompt["modalities"] = output_modalities # Create AsyncOmni - print(f"[Info] Creating AsyncOmni with deploy_config={args.deploy_config}") + print(f"[Info] Creating AsyncOmni with stage_configs_path={args.stage_configs_path}") async_omni = None try: - # ``from_cli_args`` expands vars(args) into kwargs and auto-captures - # ``_cli_explicit_keys`` from ``sys.argv[1:]`` so argparse defaults - # do not silently override deploy YAML values. Mirrors the - # ``EngineArgs.from_cli_args`` pattern used throughout vllm / - # vllm-omni. ``deploy_config=None`` (the default) falls through to - # the bundled ``vllm_omni/deploy/qwen3_omni_moe.yaml``. 
- async_omni = AsyncOmni.from_cli_args(args) + async_omni = AsyncOmni( + model=args.model, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) # Use default sampling params from stage config (they are pre-configured # in the YAML for each stage). - # - # NOTE: Since we do not set the sampling params directly, .generate in - # will automatically set the output kind to delta, since this is what - # makes sense for most multimodal use-cases. sampling_params_list = None output_dir = args.output_dir @@ -480,11 +470,11 @@ def parse_args(): help="Query type.", ) parser.add_argument( - "--deploy-config", + "--stage-configs-path", type=str, - default=_default_deploy_config_path(), + default=_default_async_chunk_stage_configs_path(), help=( - "Path to a deploy config YAML. " + "Path to an async_chunk stage config YAML. " "If not set, uses the model's default config " "(make sure it has async_chunk: true)." ), diff --git a/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh b/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh index 2f2be20915a..809054867c3 100755 --- a/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh +++ b/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh @@ -17,7 +17,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" python "${SCRIPT_DIR}/end2end_async_chunk.py" \ --query-type text \ --txt-prompts "${SCRIPT_DIR}/text_prompts_10.txt" \ - --deploy-config "${REPO_ROOT}/vllm_omni/deploy/qwen3_omni_moe.yaml" \ + --stage-configs-path "${REPO_ROOT}/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml" \ --output-dir output_audio_async_chunk \ --max-in-flight 2 \ "$@" diff --git a/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh b/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh index 9ef69293cb5..918c7ee4fd9 100755 --- a/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh +++ b/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh @@ -6,13 +6,13 @@ # achieving true stage-level concurrency via chunk-level streaming. # # Prerequisites: -# - A deploy config YAML (e.g. qwen3_omni_moe.yaml) +# - An async_chunk stage config YAML (e.g. qwen3_omni_moe_async_chunk.yaml) # - Hardware matching the config (e.g. 2x H100 for the default 3-stage config) # # Usage: # bash run_single_prompt_async_chunk.sh # bash run_single_prompt_async_chunk.sh --query-type text --modalities text -# bash run_single_prompt_async_chunk.sh --deploy-config /path/to/custom.yaml +# bash run_single_prompt_async_chunk.sh --stage-configs-path /path/to/custom.yaml set -euo pipefail @@ -21,6 +21,6 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" python "${SCRIPT_DIR}/end2end_async_chunk.py" \ --query-type use_audio \ - --deploy-config "${REPO_ROOT}/vllm_omni/deploy/qwen3_omni_moe.yaml" \ + --stage-configs-path "${REPO_ROOT}/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml" \ --output-dir output_audio_async_chunk \ "$@" diff --git a/examples/offline_inference/qwen3_tts/README.md b/examples/offline_inference/qwen3_tts/README.md index 2971ad716a2..bf59dc9ba49 100644 --- a/examples/offline_inference/qwen3_tts/README.md +++ b/examples/offline_inference/qwen3_tts/README.md @@ -15,11 +15,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro ### ROCm Dependencies -You will need to install the dependency `onnxruntime-rocm`. +You will need to install these two dependencies `onnxruntime-rocm` and `sox`. ``` pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm -pip install onnxruntime-rocm +pip install onnxruntime-rocm sox ``` ## Quick Start @@ -104,13 +104,13 @@ completes. This demonstrates that audio data is available progressively rather t ## Batched Decoding -The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, set `max_num_seqs > 1` on both stages via `--stage-overrides` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. +The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, provide a stage config with `max_num_seqs > 1` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. 
``` python end2end.py --query-type CustomVoice \ --txt-prompts benchmark_prompts.txt \ --batch-size 4 \ - --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2},"1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml ``` **Important:** `--batch-size` must match a CUDA graph capture size (1, 2, 4, 8, 16...) because the Talker's code predictor KV cache is sized to `max_num_seqs`, and CUDA graphs pad the batch to the next capture size. Both stages need `max_num_seqs >= batch_size` in the stage config for batching to take effect. If only stage 1 has a higher `max_num_seqs`, it won't help — stage 1 can only batch chunks from requests that are in-flight simultaneously, which requires stage 0 to also process multiple requests concurrently. diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py index 77da356b4f8..901418c39b8 100644 --- a/examples/offline_inference/qwen3_tts/end2end.py +++ b/examples/offline_inference/qwen3_tts/end2end.py @@ -366,7 +366,12 @@ def main(args): output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) - omni = Omni.from_cli_args(args, model=model_name) + omni = Omni( + model=model_name, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) batch_size = args.batch_size for batch_start in range(0, len(inputs), batch_size): @@ -382,7 +387,12 @@ async def main_streaming(args): output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) - omni = AsyncOmni.from_cli_args(args, model=model_name) + omni = AsyncOmni( + model=model_name, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) for i, prompt in enumerate(inputs): request_id = str(i) diff --git a/examples/offline_inference/text_to_audio/README.md 
b/examples/offline_inference/text_to_audio/README.md index 10a8ae37ed1..7edc38092ad 100644 --- a/examples/offline_inference/text_to_audio/README.md +++ b/examples/offline_inference/text_to_audio/README.md @@ -23,23 +23,6 @@ python text_to_audio.py \ --guidance-scale 7.0 \ --audio-length 10.0 \ --num-inference-steps 100 \ - --cache-backend tea_cache \ - --output stable_audio_output.wav -``` - -To reduce per-GPU memory for multi-GPU inference, launch with HSDP: - -```bash -python text_to_audio.py \ - --model stabilityai/stable-audio-open-1.0 \ - --prompt "The sound of a hammer hitting a wooden surface" \ - --negative-prompt "Low quality" \ - --seed 42 \ - --guidance-scale 7.0 \ - --audio-length 10.0 \ - --num-inference-steps 100 \ - --use-hsdp \ - --hsdp-shard-size 2 \ --output stable_audio_output.wav ``` @@ -51,8 +34,4 @@ Key arguments: - `--guidance-scale`: classifier-free guidance scale. - `--audio-length`: audio duration in seconds. - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower). -- `--use-hsdp`: enable HSDP weight sharding for the Stable Audio DiT. -- `--hsdp-shard-size`: number of GPUs used for HSDP sharding. -- `--hsdp-replicate-size`: number of HSDP replica groups. -- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`. - `--output`: path to save the generated WAV file. 
diff --git a/examples/offline_inference/text_to_audio/text_to_audio.py b/examples/offline_inference/text_to_audio/text_to_audio.py index 2a1613e5e91..a6968c419f6 100644 --- a/examples/offline_inference/text_to_audio/text_to_audio.py +++ b/examples/offline_inference/text_to_audio/text_to_audio.py @@ -11,7 +11,6 @@ python text_to_audio.py --prompt "The sound of a dog barking" python text_to_audio.py --prompt "A piano playing a gentle melody" --audio-length 10.0 python text_to_audio.py --prompt "Thunder and rain sounds" --negative-prompt "Low quality" - python text_to_audio.py --prompt "A soft synth pad" --cache-backend tea_cache """ import argparse @@ -21,7 +20,6 @@ import numpy as np import torch -from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -92,83 +90,11 @@ def parse_args() -> argparse.Namespace: default=44100, help="Sample rate for output audio (Stable Audio uses 44100 Hz).", ) - parser.add_argument( - "--cache-backend", - type=str, - default=None, - choices=["tea_cache"], - help=( - "Cache backend to use for acceleration. " - "Stable Audio currently supports 'tea_cache'. " - "Default: None (no cache acceleration)." 
- ), - ) - parser.add_argument( - "--tea-cache-rel-l1-thresh", - type=float, - default=0.2, - help="[tea_cache] Threshold for accumulated relative L1 distance.", - ) parser.add_argument( "--enable-diffusion-pipeline-profiler", action="store_true", help="Enable diffusion pipeline profiler to display stage durations.", ) - parser.add_argument( - "--use-hsdp", - action="store_true", - help="Enable HSDP for Stable Audio DiT weight sharding.", - ) - parser.add_argument( - "--hsdp-shard-size", - type=int, - default=1, - help="Number of GPUs to shard Stable Audio DiT weights across when HSDP is enabled.", - ) - parser.add_argument( - "--hsdp-replicate-size", - type=int, - default=1, - help="Number of HSDP replica groups. Default 1 means pure sharding.", - ) - parser.add_argument( - "--tensor-parallel-size", - type=int, - default=1, - help="Number of GPUs used for tensor parallelism (TP) inside the DiT.", - ) - parser.add_argument( - "--ulysses-degree", - type=int, - default=1, - help="Number of GPUs used for ulysses sequence parallelism.", - ) - parser.add_argument( - "--ulysses-mode", - type=str, - default="strict", - choices=["strict", "advanced_uaa"], - help="Ulysses sequence-parallel mode: 'strict' (divisibility required) or 'advanced_uaa' (UAA).", - ) - parser.add_argument( - "--ring-degree", - type=int, - default=1, - help="Number of GPUs used for ring sequence parallelism.", - ) - parser.add_argument( - "--cfg-parallel-size", - type=int, - default=1, - choices=[1, 2], - help="Number of GPUs used for classifier free guidance parallel size.", - ) - parser.add_argument( - "--vae-patch-parallel-size", - type=int, - default=1, - help="Number of GPUs used for VAE patch/tile parallelism (decode).", - ) return parser.parse_args() @@ -198,11 +124,6 @@ def save_audio(audio_data: np.ndarray, output_path: str, sample_rate: int = 4410 def main(): args = parse_args() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed) - cache_config = 
None - if args.cache_backend == "tea_cache": - cache_config = { - "rel_l1_thresh": args.tea_cache_rel_l1_thresh, - } print(f"\n{'=' * 60}") print("Stable Audio Open - Text-to-Audio Generation") @@ -213,26 +134,12 @@ def main(): print(f" Audio length: {args.audio_length}s") print(f" Inference steps: {args.num_inference_steps}") print(f" Guidance scale: {args.guidance_scale}") - print(f" Cache backend: {args.cache_backend if args.cache_backend else 'None (no acceleration)'}") - if args.use_hsdp: - print(f" HSDP: enabled (shard_size={args.hsdp_shard_size}, replicate_size={args.hsdp_replicate_size})") - else: - print(" HSDP: disabled") print(f" Seed: {args.seed}") print(f"{'=' * 60}\n") - parallel_config = DiffusionParallelConfig( - use_hsdp=args.use_hsdp, - hsdp_shard_size=args.hsdp_shard_size, - hsdp_replicate_size=args.hsdp_replicate_size, - ) - # Initialize Omni with Stable Audio model omni = Omni( model=args.model, - parallel_config=parallel_config, - cache_backend=args.cache_backend, - cache_config=cache_config, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, ) diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md index c71773972b3..235b710a68e 100644 --- a/examples/offline_inference/text_to_image/README.md +++ b/examples/offline_inference/text_to_image/README.md @@ -29,12 +29,10 @@ This folder provides several entrypoints for experimenting with text-to-image di | `AIDC-AI/Ovis-Image-7B` | 1024 x 1024 | 71.8 | 17.1 | | `OmniGen2/OmniGen2` | 1024 x 1024 | 20.1 | 14.7 | | `stabilityai/stable-diffusion-3.5-medium` | 1024 x 1024 | 20.1 | 15.6 | -| `black-forest-labs/FLUX.1-dev` | 1024 x 1024 | 33.9 | 31.4 | -| `black-forest-labs/FLUX.1-schnell` | 1024 x 1024 | 33.9 | 31.4 | +| `black-forest-labs/FLUX.1-dev` | 1024 x 1024 | 77.6 | 31.4 | | `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 | | `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 | | 
`black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) | -| `HunyuanImage-3.0` | 1024 x 1024 | 80.0 (TP≥3) | 160 | !!! info *Peak VRAM: based on basic single-card usage, batch size =1, without any acceleration/optimization features. FLUX.2-dev requires `--enable-cpu-offload` on a single 80 GiB GPU. @@ -92,8 +90,6 @@ python text_to_image.py \ | `--enable-cpu-offload` | flag | off | Enable CPU offloading for diffusion models | | `--lora-path` | str | — | Path to PEFT LoRA adapter folder | | `--lora-scale` | float | `1.0` | Scale factor for LoRA weights | -| `--use-system-prompt` | str | `None` | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or custom text. Recommended: `en_unified`. Only for HunyuanImage-3.0.| -| `--system-prompt` | str | `None` | Custom system prompt text. Only used when `--use-system-prompt` is set to `custom`. Only for HunyuanImage-3.0.| **NextStep-1.1 specific arguments:** @@ -248,7 +244,7 @@ python examples/offline_inference/text_to_image/text_to_image.py \ #### CFG Parallel Set `--cfg-parallel-size 2` to enable CFG Parallel for faster inference on multi-GPU setups. -See more examples in the [cfg_parallel user guide](../../../docs/user_guide/parallelism/cfg_parallel.md#using-cfg-parallel). +See more examples in the [diffusion acceleration user guide](../../../docs/user_guide/diffusion_acceleration.md#using-cfg-parallel). 
#### LoRA diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index bc18c685912..927b0f0b087 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse -import json +import os import time from pathlib import Path from typing import Any @@ -30,16 +30,6 @@ def is_nextstep_model(model_name: str) -> bool: return False -def parse_profiler_config(value: str) -> dict[str, Any]: - try: - config = json.loads(value) - except json.JSONDecodeError as e: - raise argparse.ArgumentTypeError(f"--profiler-config must be valid JSON: {e}") from e - if not isinstance(config, dict): - raise argparse.ArgumentTypeError("--profiler-config must be a JSON object") - return config - - def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Generate an image with supported diffusion models.") parser.add_argument( @@ -154,23 +144,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable layerwise (blockwise) offloading on DiT modules.", ) - parser.add_argument( - "--use-hsdp", - action="store_true", - help="Enable HSDP (Hybrid Sharded Data Parallel) for diffusion models.", - ) - parser.add_argument( - "--hsdp-shard-size", - type=int, - default=1, - help="Number of GPUs to shard weights across for HSDP.", - ) - parser.add_argument( - "--hsdp-replicate-size", - type=int, - default=1, - help="Number of HSDP replica groups.", - ) parser.add_argument( "--quantization", type=str, @@ -264,46 +237,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable diffusion pipeline profiler to display stage durations.", ) - parser.add_argument( - "--profiler-config", - type=parse_profiler_config, - default=None, - help='JSON profiler config for torch/cuda profiling, e.g. 
\'{"profiler":"torch","torch_profiler_dir":"./perf"}\'.', - ) parser.add_argument( "--log-stats", action="store_true", help="Enable logging of diffusion pipeline stats.", ) - parser.add_argument( - "--init-timeout", - type=int, - default=600, - help="Timeout for initializing a single stage in seconds (default: 600s)", - ) - parser.add_argument( - "--stage-init-timeout", - type=int, - default=600, - help="Timeout for initializing a single stage in seconds (default: 600s)", - ) - parser.add_argument( - "--use-system-prompt", - type=str, - default=None, - choices=["None", "dynamic", "en_vanilla", "en_recaption", "en_think_recaption", "en_unified", "custom"], - help="System prompt preset for generation. Recommended: en_unified.", - ) - parser.add_argument( - "--system-prompt", - type=str, - default=None, - help=("Custom system prompt. Used when --use-system-prompt is custom. "), - ) - current_omni_platform.pre_register_and_update(parser) - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) return parser.parse_args() @@ -353,7 +291,8 @@ def main(): enable_expert_parallel=args.enable_expert_parallel, ) - profiler_enabled = args.profiler_config is not None + # Check if profiling is requested via environment variable + profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) # Prepare LoRA kwargs for Omni initialization lora_args: dict[str, Any] = {} @@ -394,9 +333,6 @@ def main(): "mode": "text-to-image", "log_stats": args.log_stats, "enable_diffusion_pipeline_profiler": args.enable_diffusion_pipeline_profiler, - "profiler_config": args.profiler_config, - "init_timeout": args.init_timeout, - "stage_init_timeout": args.stage_init_timeout, **lora_args, **quant_kwargs, } @@ -427,7 +363,7 @@ def main(): f"vae_patch_parallel_size={args.vae_patch_parallel_size}, " f"enable_expert_parallel={args.enable_expert_parallel}." 
) - print(f" CPU offload: {args.enable_cpu_offload}; CPU Layerwise Offload: {args.enable_layerwise_offload}") + print(f" CPU offload: {args.enable_cpu_offload}") print(f" Image size: {args.width}x{args.height}") if args.lora_path: print(f" LoRA: scale={args.lora_scale}") @@ -446,13 +382,13 @@ def main(): ) generation_start = time.perf_counter() + extra_args = { "timesteps_shift": args.timesteps_shift, "cfg_schedule": args.cfg_schedule, "use_norm": args.use_norm, - "use_system_prompt": args.use_system_prompt, - "system_prompt": args.system_prompt, } + if lora_request: extra_args["lora_request"] = lora_request extra_args["lora_scale"] = args.lora_scale diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py index d1bbf27cb45..cf779210977 100644 --- a/examples/offline_inference/text_to_video/text_to_video.py +++ b/examples/offline_inference/text_to_video/text_to_video.py @@ -2,10 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse -import json +import os import time from pathlib import Path -from typing import Any import numpy as np import torch @@ -45,16 +44,6 @@ def _detect_preset(model: str) -> dict: return _MODEL_PRESETS["wan"] -def parse_profiler_config(value: str) -> dict[str, Any]: - try: - config = json.loads(value) - except json.JSONDecodeError as e: - raise argparse.ArgumentTypeError(f"--profiler-config must be valid JSON: {e}") from e - if not isinstance(config, dict): - raise argparse.ArgumentTypeError("--profiler-config must be a JSON object") - return config - - def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Generate a video from a text prompt. 
" @@ -142,13 +131,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable layerwise (blockwise) offloading on DiT modules.", ) - parser.add_argument( - "--ulysses-mode", - type=str, - default="strict", - choices=["strict", "advanced_uaa"], - help="Ulysses sequence-parallel mode: 'strict' (divisibility required) or 'advanced_uaa' (UAA).", - ) parser.add_argument( "--ulysses-degree", type=int, @@ -178,7 +160,7 @@ def parse_args() -> argparse.Namespace: "--audio-sample-rate", type=int, default=24000, - help="Sample rate for audio output when saved (default: 24000).", + help="Sample rate for audio output when saved (default: 24000 for LTX2).", ) parser.add_argument( "--vae-patch-parallel-size", @@ -196,12 +178,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable diffusion pipeline profiler to display stage durations.", ) - parser.add_argument( - "--profiler-config", - type=parse_profiler_config, - default=None, - help='JSON profiler config for torch/cuda profiling, e.g. 
\'{"profiler":"torch","torch_profiler_dir":"./perf"}\'.', - ) parser.add_argument( "--quantization", type=str, @@ -209,26 +185,6 @@ def parse_args() -> argparse.Namespace: choices=["fp8", "gguf"], help="Quantization method for the transformer (fp8 for online FP8 quantization).", ) - parser.add_argument( - "--use-hsdp", - action="store_true", - help="Enable HSDP (Hybrid Sharded Data Parallel) for diffusion models.", - ) - parser.add_argument( - "--hsdp-shard-size", - type=int, - default=1, - help="Number of GPUs to shard weights across for HSDP.", - ) - parser.add_argument( - "--hsdp-replicate-size", - type=int, - default=1, - help="Number of HSDP replica groups.", - ) - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) return parser.parse_args() @@ -268,7 +224,8 @@ def main(): enable_expert_parallel=args.enable_expert_parallel, ) - profiler_enabled = args.profiler_config is not None + # Check if profiling is requested via environment variable + profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) omni_kwargs = dict( model=args.model, @@ -282,7 +239,6 @@ def main(): cache_backend=args.cache_backend, cache_config=cache_config, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, - profiler_config=args.profiler_config, ) if args.boundary_ratio is not None: omni_kwargs["boundary_ratio"] = args.boundary_ratio @@ -482,8 +438,17 @@ def _ensure_frame_list(video_array): video_array = _ensure_frame_list(video_array) + use_ltx2_export = False + if args.model and "ltx" in str(args.model).lower(): + use_ltx2_export = True if audio is not None: - from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes + use_ltx2_export = True + + if use_ltx2_export: + try: + from diffusers.pipelines.ltx2.export_utils import encode_video + except ImportError: + raise ImportError("diffusers is required for LTX2 encode_video.") if isinstance(video_array, list): frames_np = 
np.stack(video_array, axis=0) @@ -492,24 +457,28 @@ def _ensure_frame_list(video_array): else: frames_np = np.asarray(video_array) - frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype("uint8") - - audio_np = audio - if isinstance(audio_np, list): - audio_np = audio_np[0] if audio_np else None - if isinstance(audio_np, torch.Tensor): - audio_np = audio_np.detach().cpu().float().numpy() - if isinstance(audio_np, np.ndarray): - audio_np = np.squeeze(audio_np).astype(np.float32) - - video_bytes = mux_video_audio_bytes( - frames_u8, - audio_np, - fps=float(args.fps), - audio_sample_rate=args.audio_sample_rate, + frames_u8 = (frames_np * 255).round().clip(0, 255).astype("uint8") + video_tensor = torch.from_numpy(frames_u8) + + audio_out = None + if audio is not None: + if isinstance(audio, list): + audio = audio[0] if audio else None + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if isinstance(audio, torch.Tensor): + audio_out = audio + if audio_out.dim() > 1: + audio_out = audio_out[0] + audio_out = audio_out.float().cpu() + + encode_video( + video_tensor, + fps=args.fps, + audio=audio_out, + audio_sample_rate=args.audio_sample_rate if audio_out is not None else None, + output_path=str(output_path), ) - with open(str(output_path), "wb") as f: - f.write(video_bytes) else: export_to_video(video_array, str(output_path), fps=args.fps) print(f"Saved generated video to {output_path}") diff --git a/examples/offline_inference/vace/vace_video_generation.py b/examples/offline_inference/vace/vace_video_generation.py index 5fad3736635..6ca0d74c52e 100644 --- a/examples/offline_inference/vace/vace_video_generation.py +++ b/examples/offline_inference/vace/vace_video_generation.py @@ -71,9 +71,6 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--ulysses-degree", type=int, default=1, help="Ulysses SP degree.") parser.add_argument("--ring-degree", type=int, default=1, help="Ring attention degree.") 
parser.add_argument("--cfg-parallel-size", type=int, default=1, choices=[1, 2], help="CFG parallel size.") - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults - - nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/voxcpm/README.md b/examples/offline_inference/voxcpm/README.md deleted file mode 100644 index 1eaea9b0dba..00000000000 --- a/examples/offline_inference/voxcpm/README.md +++ /dev/null @@ -1,123 +0,0 @@ -# VoxCPM Offline Example - -This directory contains the minimal offline VoxCPM example for vLLM Omni. - -`end2end.py` is intentionally small and only covers: - -- single text-to-speech -- single voice cloning with `ref_audio` + `ref_text` -- non-streaming with `vllm_omni/model_executor/stage_configs/voxcpm.yaml` -- streaming with `vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml` - -Advanced workflows were moved out of the getting-started example: - -- `benchmarks/voxcpm/vllm_omni/bench_tts_offline.py`: warmup, batch prompts, profiler, offline TTFP / RTF -- `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py`: fixed offline smoke matrix -- `benchmarks/voxcpm/`: benchmark scripts and benchmark docs - -## Prerequisites - -Install VoxCPM in one of these ways: - -```bash -pip install voxcpm -``` - -or point vLLM Omni to the local VoxCPM source tree: - -```bash -export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/VoxCPM/src -``` - -The example writes WAV files with `soundfile`: - -```bash -pip install soundfile -``` - -## Model Path - -Pass the native VoxCPM model directory directly: - -```bash -export VOXCPM_MODEL=/path/to/voxcpm-model -``` - -If the native VoxCPM `config.json` does not contain HuggingFace metadata such as -`model_type`, prepare a persistent HF-compatible config directory and point the -stage configs to it with `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH`: - -```bash -export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config -mkdir -p "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH" -cp 
"$VOXCPM_MODEL/config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/config.json" -cp "$VOXCPM_MODEL/generation_config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/generation_config.json" 2>/dev/null || true -python3 -c 'import json, os; p=os.path.join(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"], "config.json"); cfg=json.load(open(p, "r", encoding="utf-8")); cfg["model_type"]="voxcpm"; cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"]); json.dump(cfg, open(p, "w", encoding="utf-8"), indent=2, ensure_ascii=False)' -``` - -If the model directory itself already has `model_type`, this extra directory is -not required. - -## Quick Start - -Single text-to-speech, non-streaming: - -```bash -python examples/offline_inference/voxcpm/end2end.py \ - --model "$VOXCPM_MODEL" \ - --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni." -``` - -Single voice cloning, non-streaming: - -```bash -python examples/offline_inference/voxcpm/end2end.py \ - --model "$VOXCPM_MODEL" \ - --text "This sentence is synthesized with a cloned voice." \ - --ref-audio /path/to/reference.wav \ - --ref-text "The exact transcript spoken in reference.wav." -``` - -Streaming: - -```bash -python examples/offline_inference/voxcpm/end2end.py \ - --model "$VOXCPM_MODEL" \ - --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ - --text "This is a split-stage VoxCPM streaming example running on vLLM Omni." -``` - -By default, `end2end.py` writes to `output_audio/` for non-streaming and -`output_audio_streaming/` for streaming. - -## Advanced Workflows - -Use `benchmarks/voxcpm/vllm_omni/bench_tts_offline.py` when you need: - -- warmup runs -- prompt files -- batch JSONL inputs -- profiler injection -- offline TTFP / RTF emission - -Use `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py` when you need the fixed offline smoke matrix that previously lived in `test.py`. 
- -Full matrix benchmark example: - -```bash -python benchmarks/voxcpm/vllm_omni/run_offline_matrix.py \ - --model "$VOXCPM_MODEL" \ - --ref-audio /path/to/reference.wav \ - --ref-text "The exact transcript spoken in reference.wav." -``` - -For online serving examples, see [examples/online_serving/voxcpm](../../online_serving/voxcpm/README.md). - -For benchmark reporting, see [benchmarks/voxcpm](../../../benchmarks/voxcpm/README.md). - -## Notes - -- `voxcpm.yaml` is the default non-streaming stage config. -- `voxcpm_async_chunk.yaml` is the streaming stage config. -- Streaming is currently single-request oriented; the fixed smoke matrix now lives in `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py`. -- `ref_text` must be the real transcript of the reference audio. Mismatched text usually causes obvious quality degradation. diff --git a/examples/offline_inference/voxcpm/end2end.py b/examples/offline_inference/voxcpm/end2end.py deleted file mode 100644 index 980410feaeb..00000000000 --- a/examples/offline_inference/voxcpm/end2end.py +++ /dev/null @@ -1,206 +0,0 @@ -"""Minimal offline VoxCPM example for vLLM Omni.""" - -from __future__ import annotations - -import asyncio -import time -from pathlib import Path -from typing import Any - -import soundfile as sf -import torch -from vllm.utils.argparse_utils import FlexibleArgumentParser - -from vllm_omni import AsyncOmni, Omni - -REPO_ROOT = Path(__file__).resolve().parents[3] -DEFAULT_SYNC_STAGE_CONFIG = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" - - -def _build_prompt(args) -> dict[str, Any]: - additional_information: dict[str, list[Any]] = { - "text": [args.text], - "cfg_value": [args.cfg_value], - "inference_timesteps": [args.inference_timesteps], - "min_len": [args.min_len], - "max_new_tokens": [args.max_new_tokens], - } - if args.streaming_prefix_len is not None: - additional_information["streaming_prefix_len"] = [args.streaming_prefix_len] - if args.ref_audio is not None: - 
additional_information["ref_audio"] = [args.ref_audio] - if args.ref_text is not None: - additional_information["ref_text"] = [args.ref_text] - return { - "prompt_token_ids": [1], - "additional_information": additional_information, - } - - -def _extract_audio_tensor(mm: dict[str, Any]) -> torch.Tensor: - audio = mm.get("audio", mm.get("model_outputs")) - if audio is None: - raise ValueError("No audio output found in multimodal output.") - if isinstance(audio, list): - parts = [torch.as_tensor(item).float().cpu().reshape(-1) for item in audio] - audio = torch.cat(parts, dim=-1) if parts else torch.zeros(0) - if not isinstance(audio, torch.Tensor): - audio = torch.as_tensor(audio) - return audio.float().cpu().reshape(-1) - - -def _extract_sample_rate(mm: dict[str, Any]) -> int: - sr_raw = mm.get("sr", 24000) - if isinstance(sr_raw, list) and sr_raw: - sr_raw = sr_raw[-1] - if hasattr(sr_raw, "item"): - return int(sr_raw.item()) - return int(sr_raw) - - -def _is_streaming_stage_config(stage_config_path: str) -> bool: - return "async_chunk" in Path(stage_config_path).stem - - -def _save_audio(audio: torch.Tensor, sample_rate: int, output_dir: Path, request_id: str) -> Path: - output_dir.mkdir(parents=True, exist_ok=True) - output_path = output_dir / f"output_{request_id}.wav" - sf.write( - output_path, - audio.float().cpu().clamp(-1.0, 1.0).numpy(), - sample_rate, - format="WAV", - subtype="PCM_16", - ) - return output_path - - -async def _run_streaming(args) -> Path: - prompt = _build_prompt(args) - output_dir = Path(args.output_dir) if args.output_dir is not None else Path("output_audio_streaming") - request_id = "streaming_example" - sample_rate = 24000 - buffered_samples = 0 - chunks: list[torch.Tensor] = [] - started = time.perf_counter() - omni = AsyncOmni( - model=args.model, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - ) - try: - async for stage_output in omni.generate(prompt, 
request_id=request_id): - mm = getattr(stage_output, "multimodal_output", None) - if not isinstance(mm, dict): - request_output = getattr(stage_output, "request_output", None) - if request_output is None: - continue - mm = getattr(request_output, "multimodal_output", None) - if not isinstance(mm, dict) and getattr(request_output, "outputs", None): - mm = getattr(request_output.outputs[0], "multimodal_output", None) - if not isinstance(mm, dict): - continue - audio = _extract_audio_tensor(mm) - if audio.numel() == 0: - continue - sample_rate = _extract_sample_rate(mm) - if audio.numel() > buffered_samples: - delta = audio[buffered_samples:] - buffered_samples = int(audio.numel()) - else: - delta = audio - buffered_samples += int(delta.numel()) - if delta.numel() > 0: - chunks.append(delta) - if not chunks: - raise RuntimeError("No streaming audio chunks received from VoxCPM.") - output_audio = torch.cat(chunks, dim=0) - output_path = _save_audio(output_audio, sample_rate, output_dir, request_id) - print(f"Saved streaming audio to: {output_path} ({time.perf_counter() - started:.2f}s)") - return output_path - finally: - omni.shutdown() - - -def _run_sync(args) -> Path: - prompt = _build_prompt(args) - output_dir = Path(args.output_dir) if args.output_dir is not None else Path("output_audio") - request_id = "sync_example" - started = time.perf_counter() - last_mm: dict[str, Any] | None = None - omni = Omni( - model=args.model, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - ) - for stage_outputs in omni.generate(prompt): - request_output = getattr(stage_outputs, "request_output", None) - if request_output is None: - continue - outputs = getattr(request_output, "outputs", None) - if outputs: - for output in outputs: - mm = getattr(output, "multimodal_output", None) - if isinstance(mm, dict): - last_mm = mm - mm = getattr(request_output, "multimodal_output", None) - if isinstance(mm, dict): - 
last_mm = mm - if last_mm is None: - raise RuntimeError("No audio output received from VoxCPM.") - output_path = _save_audio( - _extract_audio_tensor(last_mm), - _extract_sample_rate(last_mm), - output_dir, - request_id, - ) - print(f"Saved audio to: {output_path} ({time.perf_counter() - started:.2f}s)") - return output_path - - -def parse_args(): - parser = FlexibleArgumentParser(description="Minimal offline VoxCPM example for vLLM Omni.") - parser.add_argument("--model", type=str, required=True, help="Local VoxCPM model directory.") - parser.add_argument( - "--stage-configs-path", - type=str, - default=str(DEFAULT_SYNC_STAGE_CONFIG), - help=("Stage config path. Use voxcpm.yaml for non-streaming or voxcpm_async_chunk.yaml for streaming."), - ) - parser.add_argument("--text", type=str, required=True, help="Input text for synthesis.") - parser.add_argument("--ref-audio", type=str, default=None, help="Reference audio path for voice cloning.") - parser.add_argument("--ref-text", type=str, default=None, help="Transcript of the reference audio.") - parser.add_argument("--output-dir", type=str, default=None, help="Output directory for generated wav files.") - parser.add_argument("--cfg-value", type=float, default=2.0, help="Guidance value passed to VoxCPM.") - parser.add_argument("--inference-timesteps", type=int, default=10, help="Number of diffusion timesteps.") - parser.add_argument("--min-len", type=int, default=2, help="Minimum latent length.") - parser.add_argument("--max-new-tokens", type=int, default=4096, help="Maximum latent length.") - parser.add_argument( - "--streaming-prefix-len", - type=int, - default=3, - help="Streaming prefix length used by voxcpm_async_chunk.yaml.", - ) - parser.add_argument("--stage-init-timeout", type=int, default=600, help="Stage initialization timeout in seconds.") - parser.add_argument("--log-stats", action="store_true", help="Enable vLLM Omni stats logging.") - args = parser.parse_args() - if (args.ref_audio is None) != 
(args.ref_text is None): - raise ValueError("Voice cloning requires --ref-audio and --ref-text together.") - return args - - -def main(args) -> None: - route = "streaming" if _is_streaming_stage_config(args.stage_configs_path) else "sync" - print(f"Model: {args.model}") - print(f"Stage config: {args.stage_configs_path}") - print(f"Route: {route}") - if route == "streaming": - asyncio.run(_run_streaming(args)) - else: - _run_sync(args) - - -if __name__ == "__main__": - main(parse_args()) diff --git a/examples/offline_inference/voxcpm2/README.md b/examples/offline_inference/voxcpm2/README.md deleted file mode 100644 index e9827307997..00000000000 --- a/examples/offline_inference/voxcpm2/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# VoxCPM2 Offline Inference (Native AR) - -VoxCPM2 is a 2B-parameter tokenizer-free diffusion AR TTS model. It produces 48kHz audio and supports 30+ languages with a single-stage native AR pipeline backed by MiniCPM4. - -## Prerequisites - -Install the `voxcpm` package, or set the environment variable pointing to the source tree: - -```bash -# Option A: install package -pip install voxcpm - -# Option B: use source checkout -export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/voxcpm -``` - -## Quick Start - -Zero-shot synthesis: - -```bash -python examples/offline_inference/voxcpm2/end2end.py \ - --model openbmb/VoxCPM2 \ - --text "Hello, this is a VoxCPM2 demo." \ - --output-dir output_audio -``` - -Voice cloning with a reference audio: - -```bash -python examples/offline_inference/voxcpm2/end2end.py \ - --text "Hello, this is a voice clone demo." \ - --reference-audio /path/to/reference.wav \ - --output-dir output_clone -``` - -Prompt continuation (matched audio + text prefix): - -```bash -python examples/offline_inference/voxcpm2/end2end.py \ - --text "Continuation target sentence." \ - --prompt-audio /path/to/prompt.wav \ - --prompt-text "Transcript of the prompt audio." 
\ - --output-dir output_cont -``` - -The script accepts the following arguments: - -| Argument | Default | Description | -|---|---|---| -| `--model` | `openbmb/VoxCPM2` | HuggingFace repo ID or local path | -| `--text` | (example sentence) | Text to synthesize | -| `--output-dir` | `output_audio` | Directory for output WAV files | -| `--stage-configs-path` | `voxcpm2.yaml` | Stage config YAML path | -| `--reference-audio` | `None` | Reference audio for voice cloning (isolated) | -| `--prompt-audio` | `None` | Prompt audio for continuation mode | -| `--prompt-text` | `None` | Transcript matching `--prompt-audio` | - -## Performance - -Measured on a single H20 GPU (80 GB): - -| Input length | RTF | Sample rate | -|---|---|---| -| Short (~10 tokens) | ~0.28 | 48 kHz | -| Long (~100 tokens) | ~0.34 | 48 kHz | - -RTF < 1.0 means faster than real time. - -## Architecture - -VoxCPM2 uses a single-stage native AR pipeline: - -``` -feat_encoder -└─► MiniCPM4 (base LM) - └─► FSQ (finite scalar quantization) - └─► residual_lm (residual AR) - └─► LocDiT (local diffusion transformer) - └─► AudioVAE → 48 kHz waveform -``` - -All stages are fused into one vllm-native execution graph via `voxcpm2.yaml`, eliminating inter-stage coordination overhead and enabling true end-to-end batching. diff --git a/examples/offline_inference/voxcpm2/end2end.py b/examples/offline_inference/voxcpm2/end2end.py deleted file mode 100644 index 6b6bf78ddf1..00000000000 --- a/examples/offline_inference/voxcpm2/end2end.py +++ /dev/null @@ -1,171 +0,0 @@ -"""Offline VoxCPM2 inference example (native AR pipeline). - -Uses the single-stage native AR config (voxcpm2.yaml). -Requires the `voxcpm` package or VLLM_OMNI_VOXCPM_CODE_PATH env var. 
-""" - -from __future__ import annotations - -import os -import time -from pathlib import Path - -import soundfile as sf -import torch -from vllm.utils.argparse_utils import FlexibleArgumentParser - -from vllm_omni import Omni - -REPO_ROOT = Path(__file__).resolve().parents[3] -DEFAULT_STAGE_CONFIGS_PATH = str(REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm2.yaml") -SAMPLE_RATE = 48_000 - - -def parse_args(): - parser = FlexibleArgumentParser(description="Offline VoxCPM2 native AR inference") - parser.add_argument( - "--model", - type=str, - default="openbmb/VoxCPM2", - help="VoxCPM2 model path or HuggingFace repo ID.", - ) - parser.add_argument( - "--text", - type=str, - default="This is a VoxCPM2 native AR synthesis example running on vLLM Omni.", - help="Text to synthesize.", - ) - parser.add_argument( - "--output-dir", - type=str, - default="output_audio", - help="Directory for output WAV files.", - ) - parser.add_argument( - "--stage-configs-path", - type=str, - default=DEFAULT_STAGE_CONFIGS_PATH, - help="Path to the stage config YAML file.", - ) - parser.add_argument( - "--reference-audio", - type=str, - default=None, - help="Path to reference audio for voice cloning (isolated ref mode).", - ) - parser.add_argument( - "--prompt-audio", - type=str, - default=None, - help="Path to prompt audio for continuation mode (requires --prompt-text).", - ) - parser.add_argument( - "--prompt-text", - type=str, - default=None, - help="Text matching --prompt-audio for continuation mode.", - ) - parser.add_argument( - "--ref-text", - type=str, - default=None, - help="Optional transcript of --reference-audio (enables ref_continuation mode).", - ) - return parser.parse_args() - - -def extract_audio(multimodal_output: dict) -> torch.Tensor: - """Extract the final complete audio tensor from multimodal output. - - The output processor concatenates per-step delta tensors under - ``model_outputs``. Falls back to ``audio`` for backwards compat. 
- """ - audio = multimodal_output.get("model_outputs") - if audio is None: - audio = multimodal_output.get("audio") - if audio is None: - raise ValueError(f"No audio key in multimodal_output: {list(multimodal_output.keys())}") - - if isinstance(audio, list): - # Defensive: usually the output processor consolidates into a single - # tensor at request completion, but concatenate here too in case the - # caller consumes intermediate (pre-consolidation) outputs. - valid = [torch.as_tensor(a).float().cpu().reshape(-1) for a in audio if a is not None] - if not valid: - raise ValueError("Audio list is empty or all elements are None.") - return torch.cat(valid, dim=0) if len(valid) > 1 else valid[0] - - return torch.as_tensor(audio).float().cpu().reshape(-1) - - -def main(): - args = parse_args() - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - engine = Omni( - model=args.model, - stage_configs_path=args.stage_configs_path, - ) - - from transformers import AutoTokenizer - - from vllm_omni.model_executor.models.voxcpm2.voxcpm2_talker import ( - build_cjk_split_map, - build_voxcpm2_prompt, - ) - - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - split_map = build_cjk_split_map(tokenizer) - hf_config = engine.engine.stage_vllm_configs[0].model_config.hf_config - - ref_audio_arg = args.reference_audio or args.prompt_audio - ref_text_arg = args.ref_text or args.prompt_text - ref_wav, ref_sr = (None, None) - if ref_audio_arg: - ref_wav_arr, ref_sr = sf.read(ref_audio_arg) - ref_wav = ref_wav_arr.mean(axis=-1).tolist() if ref_wav_arr.ndim > 1 else ref_wav_arr.tolist() - - prompt = build_voxcpm2_prompt( - hf_config=hf_config, - tokenizer=tokenizer, - split_map=split_map, - text=args.text, - ref_audio=ref_wav, - ref_sr=ref_sr, - ref_text=ref_text_arg, - ) - - print(f"Model : {args.model}") - print(f"Text : {args.text}") - if ref_audio_arg: - print(f"Ref audio : {ref_audio_arg}") - if ref_text_arg: - 
print(f"Ref text : {ref_text_arg}") - print(f"Output dir : {output_dir}") - - t_start = time.perf_counter() - outputs = engine.generate([prompt]) - elapsed = time.perf_counter() - t_start - - # outputs[0].outputs[0].multimodal_output["audio"] is a list of tensors - request_output = outputs[0] - mm = request_output.outputs[0].multimodal_output - audio = extract_audio(mm) - - duration = audio.numel() / SAMPLE_RATE - rtf = elapsed / duration if duration > 0 else float("inf") - - output_path = output_dir / "output.wav" - sf.write(str(output_path), audio.numpy(), SAMPLE_RATE, format="WAV") - - print(f"Saved : {output_path}") - print(f"Duration : {duration:.2f}s") - print(f"Inference : {elapsed:.2f}s") - print(f"RTF : {rtf:.3f}") - - -if __name__ == "__main__": - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - main() diff --git a/examples/offline_inference/voxtral_tts/README.md b/examples/offline_inference/voxtral_tts/README.md index a55ce8830ee..5f3d5413be7 100644 --- a/examples/offline_inference/voxtral_tts/README.md +++ b/examples/offline_inference/voxtral_tts/README.md @@ -10,24 +10,28 @@ When `mistral_common` has `SpeechRequest` support, prompt token IDs are built vi ```bash # Basic single-prompt with cheerful_female voice preset python3 examples/offline_inference/voxtral_tts/end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --write-audio --voice cheerful_female \ --model mistralai/Voxtral-4B-TTS-2603 \ --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?" # 32 replicate prompts with cheerful_female voice preset python3 examples/offline_inference/voxtral_tts/end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --num-prompts 32 --write-audio --voice cheerful_female \ --model mistralai/Voxtral-4B-TTS-2603 \ --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?" 
# Streaming with neutral_female voice preset python3 examples/offline_inference/voxtral_tts/end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --streaming --write-audio --voice neutral_female \ --model mistralai/Voxtral-4B-TTS-2603 \ --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?" # 32 prompts, 8 concurrent requests per wave, streaming with neutral_female voice python3 examples/offline_inference/voxtral_tts/end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --num-prompts 32 --concurrency 8 --streaming --write-audio --voice neutral_female \ --model mistralai/Voxtral-4B-TTS-2603 \ --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?" @@ -35,6 +39,7 @@ python3 examples/offline_inference/voxtral_tts/end2end.py \ # Short debug prompt with reference audio # Note: Reference audio capability is not yet released. python3 examples/offline_inference/voxtral_tts/end2end.py \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --write-audio \ --model mistralai/Voxtral-4B-TTS-2603 \ --text "This is a test message." \ @@ -49,7 +54,7 @@ python3 examples/offline_inference/voxtral_tts/end2end.py \ | `--text TEXT` | Text to synthesize (default: `"This is a test message."`) | | `--audio-path PATH` | Path to reference audio file for voice cloning | | `--output-dir DIR` | Directory to write output WAV files (default: `output_audio`) | -| `--deploy-config PATH` | Override the deploy config path. If unset, auto-loads `vllm_omni/deploy/voxtral_tts.yaml` from the HF `model_type`. 
| +| `--stage-configs-path PATH` | Path to stage configs YAML (currently it must be set for VoxtralTTS) | | `--num-prompts N` | Number of replicate prompts to run for measuring performance (default: 1) | | `--streaming` | Use streaming generation via `AsyncOmni` (default: blocking `Omni`) | | `--concurrency N` | Max concurrent requests per wave (must be used with `--streaming`, must evenly divide `--num-prompts`) | diff --git a/examples/offline_inference/voxtral_tts/end2end.py b/examples/offline_inference/voxtral_tts/end2end.py index 0a6f88715a9..0750246450a 100644 --- a/examples/offline_inference/voxtral_tts/end2end.py +++ b/examples/offline_inference/voxtral_tts/end2end.py @@ -39,7 +39,7 @@ async def run_streaming(inputs, sampling_params_list, model_name, args, output_dir): async_omni = AsyncOmni( model=model_name, - deploy_config=args.deploy_config, + stage_configs_path=args.stage_configs_path, log_stats=args.log_stats, ) @@ -192,7 +192,7 @@ def run_non_streaming(inputs, sampling_params_list, model_name, args, output_dir llm = Omni( model=model_name, log_stats=args.log_stats, - deploy_config=args.deploy_config, + stage_configs_path=args.stage_configs_path, ) if args.profiling_mode: @@ -253,11 +253,10 @@ def parse_args() -> Namespace: help="Directory to write output wav files.", ) parser.add_argument( - "--deploy-config", + "--stage-configs-path", type=str, default=None, - help="Override the deploy config path. If unset, auto-loads " - "vllm_omni/deploy/voxtral_tts.yaml based on the HF model_type.", + help="Path to stage configs YAML. 
Auto-resolved from model if not set.", ) parser.add_argument( "--num-prompts", type=int, default=1, help="Number of replicate prompts to run for measuring performance" @@ -298,12 +297,6 @@ def parse_args() -> Namespace: default=None, help="Voice to use instead of audio file.", ) - parser.add_argument( - "--cfg-alpha", - type=float, - default=None, - help="CFG alpha for flow-matching guidance (default: use value from stage config, typically 1.2).", - ) return parser.parse_args() @@ -355,13 +348,8 @@ def main(args: Any) -> None: inputs = compose_request(model_name, text_chunk, audio_prompt_file, args) - extra_args = {} - if args.cfg_alpha is not None: - extra_args["cfg_alpha"] = args.cfg_alpha - sampling_params = SamplingParams( max_tokens=max_num_tokens, - extra_args=extra_args if extra_args else None, ) sampling_params_list = [ sampling_params, diff --git a/examples/offline_inference/x_to_video_audio/download_dreamid_omni.py b/examples/offline_inference/x_to_video_audio/download_dreamid_omni.py index 2f66d5f7789..0dbf402e9e3 100644 --- a/examples/offline_inference/x_to_video_audio/download_dreamid_omni.py +++ b/examples/offline_inference/x_to_video_audio/download_dreamid_omni.py @@ -82,6 +82,7 @@ def main(output_dir: str): data = { "_class_name": "DreamIDOmniPipeline", + "fusion": "DreamID-Omni/dreamid_omni.safetensors", } with open(os.path.join(output_dir, "model_index.json"), "w", encoding="utf-8") as f: @@ -89,12 +90,6 @@ def main(output_dir: str): print(f"model_index.json created at {os.path.join(output_dir, 'model_index.json')}") - transformer_dir = os.path.join(output_dir, "transformer") - os.makedirs(transformer_dir, exist_ok=True) - with open(os.path.join(transformer_dir, "config.json"), "w", encoding="utf-8") as f: - json.dump({"fusion": "DreamID-Omni/dreamid_omni.safetensors"}, f) - print(f"transformer/config.json created at {os.path.join(transformer_dir, 'config.json')}") - # now we download the dependency code download_dependency() diff --git 
a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md index 13f2cfe7c0a..59b993a728d 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md @@ -24,15 +24,13 @@ dreamid_omni/ │ ├── models_t5_umt5-xxl-enc-bf16.pth │ ├── Wan2.2_VAE.pth │ -├── model_index.json -└── transformer/ - └── config.json # create by download_dreamid_omni.py +├── model_index.json # created by download_dreamid_omni.py ``` ### Run the Inference -```python +``` python x_to_video_audio.py \ - --model /path/to/dreamid_omni \ + --model /xx/dreamid_omni \ --prompt "Two people walking together and singing happily" \ --image-path ./example0.png ./example1.png \ --audio-path ./example0.wav ./example1.wav \ @@ -42,33 +40,11 @@ python x_to_video_audio.py \ --num-inference-steps 45 \ --height 704 \ --width 1280 \ - --output out_dreamid_omni_twoip.mp4 + --output dreamid_omni.mp4 ``` In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled. The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload. - -You could take reference images/audios from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni - -For example, single IP ref resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip, you could download them correspondingly to your local and use them for testing. - -```python -# Example usage for oneip, ref media from the official repo DreamID-Omni -python x_to_video_audio.py \ - --model /path/to/dreamid_omni \ - --prompt ": In the frame, a woman with black long hair is identified as .\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. 
The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n maintains eye contact, swallows as if choosing her words, and says, I keep telling myself I’m fine,but some nights it feels like I’m just performing calm." \ - --image-path 9.png \ - --audio-path 9.wav \ - --video-negative-prompt "jitter, bad hands, blur, distortion" \ - --audio-negative-prompt "robotic, muffled, echo, distorted" \ - --cfg-parallel-size 2 \ - --num-inference-steps 45 \ - --height 704 \ - --width 1280 \ - --output out_dreamid_omni_oneip.mp4 -``` - - Key arguments: - `--prompt`: text description (string). - `--model`: path to the model local directory. 
diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index 497284ceb96..17d0f06c3c5 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -5,12 +5,10 @@ import re import time -import numpy as np +import librosa from PIL import Image -from vllm.multimodal.media.audio import load_audio from vllm_omni.diffusion.data import DiffusionParallelConfig -from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -38,8 +36,8 @@ def parse_args() -> argparse.Namespace: "--cfg-parallel-size", type=int, default=1, - choices=[1, 2, 3, 4], - help="Number of GPUs used for classifier free guidance parallel size (max 4 branches).", + choices=[1, 2], + help="Number of GPUs used for classifier free guidance parallel size.", ) parser.add_argument( "--video-negative-prompt", @@ -58,11 +56,6 @@ def parse_args() -> argparse.Namespace: default=False, help="Enable CPU offloading for diffusion models.", ) - parser.add_argument( - "--enable-layerwise-offload", - action="store_true", - help="Enable layerwise (blockwise) offloading on DiT modules.", - ) return parser.parse_args() @@ -76,7 +69,7 @@ def load_image_and_audio(image_paths, audio_paths): image.append(img) for path in audio_paths: - audio_array, sr = load_audio(path, sr=16000) + audio_array, sr = librosa.load(path, sr=16000) audio_array = audio_array[int(sr * 1) : int(sr * 3)] audio.append(audio_array) return image, audio @@ -131,7 +124,6 @@ def main() -> None: parallel_config=parallel_config, model_type=args.model_type, enable_cpu_offload=args.enable_cpu_offload, - enable_layerwise_offload=args.enable_layerwise_offload, ) start = time.perf_counter() outputs = omni.generate(prompt, sampling_params) @@ -139,35 +131,15 @@ def main() -> 
None: if not outputs: raise RuntimeError("No output returned from DreamID-Omni.") - result = outputs[0] - if not result.images: - raise RuntimeError("No video frames found in DreamID-Omni output.") - generated_video = result.images[0] - mm = result.multimodal_output or {} - generated_audio = mm.get("audio") - fps = int(mm.get("fps", 24)) - sample_rate = int(mm.get("audio_sample_rate", 16000)) - - # DreamID-Omni returns video as (C, F, H, W) float32 in [-1, 1]. - # mux_video_audio_bytes expects (F, H, W, C) uint8. - if not isinstance(generated_video, np.ndarray) or generated_video.ndim != 4: - raise RuntimeError(f"Unexpected video shape: {getattr(generated_video, 'shape', None)}") - frames = generated_video.transpose(1, 2, 3, 0) - frames = (np.clip((frames + 1.0) / 2.0, 0.0, 1.0) * 255.0).round().astype(np.uint8) - - audio_np = None - if generated_audio is not None: - audio_np = np.squeeze(np.asarray(generated_audio)).astype(np.float32) - + output = outputs[0].request_output + generated_video = output[0].images[0][0] + generated_audio = output[0].images[0][1] + try: + from dreamid_omni.utils.io_utils import save_video + except Exception as e: + raise RuntimeError(f"Failed to extract video and audio from DreamID-Omni output. 
Error: {e}") output_path = args.output - video_bytes = mux_video_audio_bytes( - frames, - audio_np, - fps=float(fps), - audio_sample_rate=sample_rate, - ) - with open(output_path, "wb") as f: - f.write(video_bytes) + save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000) print(f"Saved generated video to {output_path}") print(f"Total time: {elapsed:.2f}s") diff --git a/examples/online_serving/bagel/README.md b/examples/online_serving/bagel/README.md index 763927222cf..9b74acae10e 100644 --- a/examples/online_serving/bagel/README.md +++ b/examples/online_serving/bagel/README.md @@ -1,111 +1,145 @@ # BAGEL-7B-MoT -## Installation +## 🛠️ Installation Please refer to [README.md](../../../README.md) -## Architecture +## Run examples (BAGEL-7B-MoT) -BAGEL-7B-MoT is a Mixture-of-Transformers (MoT) model supporting both image generation and understanding. It offers two deployment topologies: +**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices. -| Topology | Stages | Description | -| :------- | :----- | :---------- | -| **Two-stage** (default) | Stage 0 (Thinker, AR) + Stage 1 (DiT, Diffusion) | Thinker handles text/understanding via vLLM AR engine; DiT handles image generation. KV cache is transferred between stages. | -| **Single-stage** | Stage 0 (DiT, Diffusion) only | The DiT stage contains a full LLM, ViT, VAE, and tokenizer internally. All modalities are handled within a single diffusion process. | - -Both topologies support all four modalities: `text2img`, `img2img`, `img2text`, `text2text`. - -> **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. 
- -## Launch the Server - -### Two-Stage (Default) - -The default pipeline is auto-detected from the model. No extra flags needed: +### Launch the Server ```bash +# Use default configuration vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 ``` Or use the convenience script: ```bash -cd examples/online_serving/bagel +cd /workspace/vllm-omni/examples/online_serving/bagel bash run_server.sh +``` -# Initialize each stage in a discrete isolated process terminal -bash run_server_stage_cli.sh --stage 0 -bash run_server_stage_cli.sh --stage 1 +```bash +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/stage_configs_file ``` -To use a custom deploy YAML, pass it via `--deploy-config`: +#### 🚀 Tensor Parallelism (TP) + +For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) for the server. +1. **Modify Stage Config**: Create or modify a stage configuration yaml (e.g., [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml)). Set `tensor_parallel_size` to `2` (or more) and update `devices` to include multiple GPU IDs (e.g., `"0,1"`). + +```yaml + engine_args: + tensor_parallel_size: 2 + ... + runtime: + devices: "0,1" +``` + +2. **Launch Server**: ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ - --deploy-config /path/to/deploy_config.yaml +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/your/custom_bagel.yaml ``` -See [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml) for the default two-stage deploy configuration. +#### Using Mooncake Connector -### Single-Stage +By default, BAGEL uses `SharedMemoryConnector` for inter-stage communication. You can use the [Mooncake](https://github.com/kvcache-ai/Mooncake) connector to transfer KV cache between stages, which also enables multi-node deployment. 
-The DiT stage contains a full LLM, ViT, VAE, and tokenizer, so it can handle all modalities (text2img, img2img, img2text, text2text, think) without a separate Thinker stage: +**1. Install Mooncake** ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ - --deploy-config vllm_omni/deploy/bagel_single_stage.yaml +# For CUDA-enabled systems (recommended) +pip install mooncake-transfer-engine + +# For non-CUDA systems +pip install mooncake-transfer-engine-non-cuda ``` -See [`bagel_single_stage.yaml`](../../../vllm_omni/deploy/bagel_single_stage.yaml) for configuration. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. +**2. Start Mooncake Master** on the primary node: -### Tensor Parallelism (TP) +```bash +# Optional: enable disk-backed storage by creating a directory and passing --root_fs_dir. +# Without it, Mooncake runs in memory-only mode, which is sufficient for KV cache transfer. +mkdir -p ./mc_storage + +mooncake_master \ + --rpc_port=50051 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=8080 \ + --metrics_port=9003 \ + --root_fs_dir=./mc_storage/ \ + --cluster_id=mc-local-1 & +``` -For larger models or multi-GPU environments, enable TP via CLI: +**3. Launch the server** with the Mooncake stage config: ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --tensor-parallel-size 2 +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml ``` -Or set `tensor_parallel_size` per stage in a custom deploy YAML. +> **Note**: Before launching, edit [`bagel_multiconnector.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml) and replace the `metadata_server` and `master` addresses with your Mooncake master node's actual IP. For single-node testing, `127.0.0.1` works. 
+ +The client-side usage is identical to the default setup -- the Mooncake connector is transparent to the API. See the requests section below. + +For more details on the Mooncake connector configuration, see the [Mooncake Store Connector documentation](../../../docs/design/feature/omni_connectors/mooncake_store_connector.md). + +#### Multi-Node Deployment -### Multi-Node Deployment +You can deploy each stage on a **separate node** for better resource utilization. In this example, the orchestrator (Stage 0 / Thinker) and Stage 1 (DiT) run on different machines, connected via Mooncake. -Deploy each stage on a **separate node** for better resource utilization. Replace `` with the actual IP address of your orchestrator node. +Replace `` below with the actual IP address of your orchestrator node (e.g., `10.244.227.244`). -**1. Launch Stage 0 (Thinker / Orchestrator)** on the orchestrator node: +> [!WARNING] +> **Before launching**, edit [`bagel_multiconnector.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml) and replace the `metadata_server` and `master` addresses with your Mooncake master node's actual IP. Mismatched addresses will cause silent connection failures. + +**1. Start Mooncake Master** (on the orchestrator node): + +```bash +mooncake_master \ + --rpc_port=50051 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host= \ + --http_metadata_server_port=8080 \ + --metrics_port=9003 +``` + +**2. Launch Stage 0 (Thinker / Orchestrator)** on the orchestrator node (`--port 8000` is the API server port for client requests): ```bash -# API server port for client requests: 8000 vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ - --port 8000 \ + --port 8000 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml \ --stage-id 0 \ - --omni-master-address \ - --omni-master-port 8091 + -oma \ + -omp 8091 ``` -**2. Launch Stage 1 (DiT)** on the remote node in headless mode: +**3. 
Launch Stage 1 (DiT)** on the remote node in headless mode: ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ + --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml \ --stage-id 1 \ --headless \ - --omni-master-address \ - --omni-master-port 8091 + -oma \ + -omp 8091 ``` -Or use the convenience script: - -```bash -# Terminal 1: Stage 0 -bash run_server_stage_cli.sh --stage 0 +**Mooncake Master arguments:** -# Terminal 2: Stage 1 -bash run_server_stage_cli.sh --stage 1 - -# With extra args -bash run_server_stage_cli.sh --stage 0 -- --tensor-parallel-size 2 -bash run_server_stage_cli.sh --stage 1 -- --gpu-memory-utilization 0.9 -``` +| Argument | Description | +| :------- | :---------- | +| `--rpc_port` | Mooncake RPC port for control-plane coordination between stages | +| `--enable_http_metadata_server` | Enable the HTTP metadata server for service discovery | +| `--http_metadata_server_host` | IP address to bind the metadata server (use the orchestrator node's IP) | +| `--http_metadata_server_port` | Port for the HTTP metadata server | +| `--metrics_port` | Port for Prometheus-compatible metrics endpoint | **vllm serve arguments:** @@ -113,31 +147,85 @@ bash run_server_stage_cli.sh --stage 1 -- --gpu-memory-utilization 0.9 | :------- | :---------- | | `--stage-id` | Which stage this process runs (0 = Thinker, 1 = DiT) | | `--headless` | Run without the API server (worker-only mode) | -| `-oma` / `--omni-master-address` | Orchestrator master address | -| `-omp` / `--omni-master-port` | Orchestrator master port | +| `-oma` | Orchestrator master address | +| `-omp` | Orchestrator master port for Stage 1 to connect to Stage 0 for task coordination | > [!IMPORTANT] > **Startup Order**: Stage 0 (orchestrator) must be launched **before** Stage 1 (headless). > Stage 0 will appear to hang on startup until Stage 1 (worker) connects — this is expected behavior. 
-### Inter-Stage Connectors +**Network Requirements** + +All nodes must have network connectivity to each other. Ensure the following ports are open **between all participating nodes**: -When deploying stages across nodes, configure the connector type in the deploy YAML: +| Port | Protocol | Service | Direction | +| :--- | :------- | :------ | :-------- | +| 50051 | TCP | Mooncake Master RPC | Worker → Orchestrator | +| 8080 | TCP | Mooncake HTTP Metadata Server | Worker → Orchestrator | +| 8091 | TCP | Orchestrator Master (`-omp`) | Worker → Orchestrator | +| 8000 | TCP | API Server (`--port`) | Client → Orchestrator | +| 9003 | TCP | Metrics (optional) | Monitoring → Orchestrator | -- **SharedMemoryConnector** (default): Used for single-node deployments. No explicit configuration needed. -- **MooncakeTransferEngineConnector**: For multi-node setups with RDMA hardware. Defined in [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml) under `connectors.rdma_connector`. +> **Tip**: If nodes are behind a firewall or in different VPCs/security groups, make sure the above ports are allowed in ingress/egress rules. All nodes should be reachable via their IP addresses (no NAT). Using nodes on the same subnet or VPC is recommended to minimize latency for Mooncake KV cache transfers. -To use Mooncake, create a custom deploy YAML that binds `output_connectors` / `input_connectors` on each stage to the `rdma_connector` defined in the `connectors` section. 
+### Send Multi-modal Request -## Send Requests +Get into the bagel folder: ```bash cd examples/online_serving/bagel ``` +Send request via Python + +```bash +python openai_chat_client.py --prompt "A cute cat" --modality text2img +``` + +The Python client supports the following command-line arguments: + +- `--prompt` (or `-p`): Text prompt for generation (default: `A cute cat`) +- `--output` (or `-o`): Output file path for image results (default: `bagel_output.png`) +- `--server` (or `-s`): Server URL (default: `http://localhost:8091`) +- `--image-url` (or `-i`): Input image URL or local file path (for img2img/img2text modes) +- `--modality` (or `-m`): Task modality (default: `text2img`). Options: `text2img`, `img2img`, `img2text`, `text2text` +- `--height`: Image height in pixels (default: 512) +- `--width`: Image width in pixels (default: 512) +- `--steps`: Number of inference steps (default: 25) +- `--seed`: Random seed (default: 42) +- `--negative`: Negative prompt for image generation + +Example with custom parameters: + +```bash +python openai_chat_client.py \ + --prompt "A futuristic city" \ + --modality text2img \ + --height 768 \ + --width 768 \ + --steps 50 \ + --seed 42 \ + --negative "blurry, low quality" +``` + +## Modality Control + +BAGEL-7B-MoT supports **multiple modality modes** for different use cases. + +The default yaml configuration deploys Thinker and DiT on the same GPU. 
You can use the default configuration file: [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml) + +| Modality | Input | Output | Description | +| ----------- | ------------ | ------ | -------------------------------------- | +| `text2img` | Text | Image | Generate images from text prompts | +| `img2img` | Image + Text | Image | Transform images using text guidance | +| `img2text` | Image + Text | Text | Generate text descriptions from images | +| `text2text` | Text | Text | Pure text generation | + ### Text to Image (text2img) -**Python client:** +Generate images from text prompts: + +**Using Python client** ```bash python openai_chat_client.py \ @@ -147,7 +235,7 @@ python openai_chat_client.py \ --steps 50 ``` -**curl:** +**Using curl** ```bash curl http://localhost:8091/v1/chat/completions \ @@ -162,9 +250,12 @@ curl http://localhost:8091/v1/chat/completions \ }' ``` + ### Image to Image (img2img) -**Python client:** +Transform images based on text prompts: + +**Using Python client** ```bash python openai_chat_client.py \ @@ -174,7 +265,7 @@ python openai_chat_client.py \ --output transformed.png ``` -**curl:** +**Using curl** ```bash IMAGE_BASE64=$(base64 -w 0 cat.jpg) @@ -199,11 +290,14 @@ EOF curl http://localhost:8091/v1/chat/completions \ -H "Content-Type: application/json" \ -d @payload.json + ``` ### Image to Text (img2text) -**Python client:** +Generate text descriptions from images: + +**Using Python client** ```bash python openai_chat_client.py \ @@ -212,7 +306,7 @@ python openai_chat_client.py \ --image-url /path/to/image.jpg ``` -**curl:** +**Using curl** ```bash IMAGE_BASE64=$(base64 -w 0 cat.jpg) @@ -237,7 +331,9 @@ curl http://localhost:8091/v1/chat/completions \ ### Text to Text (text2text) -**Python client:** +Pure text generation: + +**Using Python client** ```bash python openai_chat_client.py \ @@ -245,78 +341,30 @@ python openai_chat_client.py \ --modality text2text ``` -**curl:** +**Using curl** ```bash curl 
http://localhost:8091/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}], + "messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}], "modalities": ["text"] }' ``` -### Python Client Arguments - -| Argument | Default | Description | -| :------- | :------ | :---------- | -| `--prompt` / `-p` | `A cute cat` | Text prompt | -| `--output` / `-o` | `bagel_output.png` | Output file path | -| `--server` / `-s` | `http://localhost:8091` | Server URL | -| `--image-url` / `-i` | `None` | Input image URL or local path (img2img/img2text) | -| `--modality` / `-m` | `text2img` | `text2img`, `img2img`, `img2text`, `text2text` | -| `--height` | `512` | Image height in pixels | -| `--width` | `512` | Image width in pixels | -| `--steps` | `25` | Number of inference steps | -| `--seed` | `42` | Random seed | -| `--negative` | `None` | Negative prompt for CFG | +## FAQ -Example with custom parameters: +- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. ```bash -python openai_chat_client.py \ - --prompt "A futuristic city" \ - --modality text2img \ - --height 768 \ - --width 768 \ - --steps 50 \ - --seed 42 \ - --negative "blurry, low quality" +sudo apt update +sudo apt install ffmpeg ``` -## Configuration Reference - -### Deploy YAML Files - -| File | Description | -| :--- | :---------- | -| [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml) | Two-stage default (Thinker + DiT on GPU 0) | -| [`bagel_single_stage.yaml`](../../../vllm_omni/deploy/bagel_single_stage.yaml) | Single-stage (DiT only) | - -### Key Deploy YAML Fields - -| Field | Scope | Description | -| :---- | :---- | :---------- | -| `pipeline` | top-level | Override auto-detected pipeline (e.g. 
`bagel_single_stage`) | -| `stages[].stage_id` | per-stage | Stage identifier (0, 1, ...) | -| `stages[].devices` | per-stage | GPU device IDs (e.g. `"0"`, `"0,1"`) | -| `stages[].max_num_seqs` | per-stage | Maximum concurrent sequences | -| `stages[].gpu_memory_utilization` | per-stage | Fraction of GPU memory to use | -| `stages[].enforce_eager` | per-stage | Disable CUDA graphs | -| `stages[].tensor_parallel_size` | per-stage | TP degree for this stage | -| `connectors` | top-level | Define available connector instances (SHM, Mooncake) | -| `platforms` | top-level | Platform-specific overrides (e.g. `xpu`) | - -## FAQ - -- If you encounter OOM errors, try decreasing `max_model_len` or `gpu_memory_utilization` in the deploy YAML. - -**Two-stage VRAM usage:** - -| Stage | VRAM | -| :---- | :--- | -| Stage 0 (Thinker) | **15.04 GiB + KV Cache** | -| Stage 1 (DiT) | **26.50 GiB** | -| Total | **~42 GiB + KV Cache** | +- If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. -**Single-stage VRAM usage:** The DiT loads the full model (~42 GiB) in one process. +| Stage | VRAM | +| :------------------ | :--------------------------- | +| Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** | +| Stage-1 (DiT) | **26.50 GiB** | +| Total | **~42 GiB + KV Cache** | diff --git a/examples/online_serving/bagel/run_server_stage_cli.sh b/examples/online_serving/bagel/run_server_stage_cli.sh index 912e212f97e..51639153f73 100644 --- a/examples/online_serving/bagel/run_server_stage_cli.sh +++ b/examples/online_serving/bagel/run_server_stage_cli.sh @@ -1,164 +1,34 @@ #!/bin/bash -# Bagel multi-stage online serving startup script. 
-# -# Usage: -# ./run_server_stage_cli.sh --stage 0 -# ./run_server_stage_cli.sh --stage 1 -# ./run_server_stage_cli.sh --stage 0 -- --tensor-parallel-size 2 -# ./run_server_stage_cli.sh --stage 1 -- --gpu-memory-utilization 0.9 -# -# By default, `--stage all` keeps the old behavior and launches both stages in -# one session. Use `--stage 0` / `--stage 1` to launch each stage separately in -# different terminal sessions, with stage-specific extra CLI arguments passed -# after `--`. - -set -euo pipefail +# Bagel multi-stage online serving startup script +# Starts stage 0 as master with API server, and stage 1 in headless mode MODEL="${MODEL:-ByteDance-Seed/BAGEL-7B-MoT}" PORT="${PORT:-8091}" MASTER_ADDRESS="${MASTER_ADDRESS:-127.0.0.1}" MASTER_PORT="${MASTER_PORT:-8092}" -STAGE="all" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -DEPLOY_CONFIG="${DEPLOY_CONFIG:-$SCRIPT_DIR/../../../vllm_omni/deploy/bagel.yaml}" -EXTRA_ARGS=() - -usage() { - cat <&2 - usage - exit 1 - ;; - esac -done - -if [[ "$STAGE" != "0" && "$STAGE" != "1" && "$STAGE" != "all" ]]; then - echo "Invalid --stage value: $STAGE" >&2 - usage - exit 1 -fi - -print_config() { - echo "Model: $MODEL" - echo "API Port: $PORT" - echo "Master Address: $MASTER_ADDRESS" - echo "Master Port: $MASTER_PORT" - echo "Deploy Config: $DEPLOY_CONFIG" - echo "Selected Stage: $STAGE" - if [[ ${#EXTRA_ARGS[@]} -gt 0 ]]; then - echo "Extra Args: ${EXTRA_ARGS[*]}" - fi -} - -run_stage_0() { - echo "Starting Stage 0 (Thinker) as master..." - vllm serve "$MODEL" --omni \ - --port "$PORT" \ - --deploy-config "$DEPLOY_CONFIG" \ - --stage-id 0 \ - --omni-master-address "$MASTER_ADDRESS" \ - --omni-master-port "$MASTER_PORT" \ - "${EXTRA_ARGS[@]}" -} - -run_stage_1() { - echo "Starting Stage 1 (DiT) in headless mode..." 
- vllm serve "$MODEL" --omni \ - --deploy-config "$DEPLOY_CONFIG" \ - --stage-id 1 \ - --headless \ - --omni-master-address "$MASTER_ADDRESS" \ - --omni-master-port "$MASTER_PORT" \ - "${EXTRA_ARGS[@]}" -} +STAGE_CONFIGS_PATH="$(dirname "$0")/../../../vllm_omni/model_executor/stage_configs/bagel.yaml" echo "Starting Bagel multi-stage server..." -print_config - -case "$STAGE" in - 0) - run_stage_0 - ;; - 1) - run_stage_1 - ;; - all) - echo "Launching both stages in one session (legacy mode)..." - echo "Starting Stage 0 (Thinker) in background first..." - run_stage_0 & - STAGE_0_PID=$! - - cleanup() { - if [[ -n "${STAGE_0_PID:-}" ]]; then - kill "$STAGE_0_PID" 2>/dev/null || true - wait "$STAGE_0_PID" 2>/dev/null || true - fi - } - - trap cleanup EXIT INT TERM - - echo "Waiting briefly for Stage 0 to initialize..." - sleep 2 - run_stage_1 - ;; -esac +echo "Model: $MODEL" +echo "API Port: $PORT" +echo "Master Address: $MASTER_ADDRESS" +echo "Master Port: $MASTER_PORT" +echo "Stage Configs: $STAGE_CONFIGS_PATH" + +# Start stage 1 (DiT) in headless mode first +echo "Starting Stage 1 (DiT) in headless mode..." +vllm serve "$MODEL" --omni \ + --stage-configs-path "$STAGE_CONFIGS_PATH" \ + --stage-id 1 \ + --headless \ + -oma "$MASTER_ADDRESS" \ + -omp "$MASTER_PORT" & + +# Start stage 0 (Thinker) as master with API server +echo "Starting Stage 0 (Thinker) as master..." +vllm serve "$MODEL" --omni \ + --port "$PORT" \ + --stage-configs-path "$STAGE_CONFIGS_PATH" \ + --stage-id 0 \ + -oma "$MASTER_ADDRESS" \ + -omp "$MASTER_PORT" diff --git a/examples/online_serving/diffusers_pipeline_adapter/README.md b/examples/online_serving/diffusers_pipeline_adapter/README.md deleted file mode 100644 index 8dbf9369ae8..00000000000 --- a/examples/online_serving/diffusers_pipeline_adapter/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Diffusers Backend Adapter Example - -This example demonstrates how to serve any 🤗 Diffusers pipeline through vLLM-Omni -using the `diffusers` load format. 
- -## Supported Models - -Any model loadable via `DiffusionPipeline.from_pretrained()` should be supported, including text-to-image, image-to-image, text-to-video, image-to-video, and text-to-audio. - -## Limitations - -The diffusers backend is a black-box adapter. The following features are NOT yet supported. -It is not guaranteed whether they will be supported in the future. - -- CFG parallel execution -- Sequence parallel execution -- TeaCache / Cache-DiT acceleration -- Step-wise execution (continuous batching) - -For these features, it is recommended to use natively supported pipelines instead. - -## Usage - -### Option 1: CLI arguments - -```bash -vllm serve "stable-diffusion-v1-5/stable-diffusion-v1-5" \ - --omni \ - --diffusion-load-format diffusers \ - --diffusers-load-kwargs '{"use_safetensors": true}' \ - --diffusers-call-kwargs '{"num_inference_steps": 30, "guidance_scale": 7.5}' -``` - -`--diffusers-load-kwargs` and `--diffusers-call-kwargs` are only valid together with `--diffusion-load-format diffusers`. - -### Option 2: Stage config YAML - -```bash -vllm serve stable-diffusion-v1-5/stable-diffusion-v1-5 --stage-configs-path examples/online_serving/diffusers_pipeline_adapter/stage_config.yaml --omni -``` - -The particular fields of interest are `model`, `diffusion_load_format`, `diffusers_load_kwargs`, and `diffusers_call_kwargs` under `engine_args`. They are the same as the CLI arguments. - -## Send a Request - -```bash -curl http://localhost:8000/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{ - "model": "stable-diffusion-v1-5/stable-diffusion-v1-5", - "prompt": "a photo of an astronaut riding a horse on mars", - "n": 1, - "size": "512x512" - }' -``` - -Or refer to other documentation pages on how to request a particular input/output modality, such as `examples/online_serving/text_to_image/openai_chat_client.py`. 
- -## Configuration Reference - -For the diffusers adapter, set options under **`engine_args`**: - -### `diffusion_load_format: "diffusers"` - -This field selects the Hugging Face diffusers adapter path (see `DiffusersPipelineLoader`). - -### `diffusers_load_kwargs` - -Passed to `DiffusionPipeline.from_pretrained()`. - -This is suitable for model-specific configurations not available through the vLLM-Omni interface (such as `Omni.__init__()`, `vllm serve` CLI arguments, and stage config YAML fields outside `diffusers_load_kwargs`). - -When a parameter is available in the vLLM-Omni interface, it will be adapted here. -But if that parameter is simultaneously set in both the vLLM-Omni interface and `diffusers_load_kwargs`, the **latter** will take precedence. - -### `diffusers_call_kwargs` - -Passed to `pipeline.__call__()`. - -This is suitable for sampling parameters not available through the vLLM-Omni interface (such as `Omni.generate()` and online serving payloads). - -When a parameter is available in the vLLM-Omni interface, it will be adapted here. -But if that parameter is simultaneously set in both the vLLM-Omni interface and `diffusers_call_kwargs`, the **former** will take precedence (because it is set at request time). diff --git a/examples/online_serving/diffusers_pipeline_adapter/stage_config.yaml b/examples/online_serving/diffusers_pipeline_adapter/stage_config.yaml deleted file mode 100644 index 7c96eb6c167..00000000000 --- a/examples/online_serving/diffusers_pipeline_adapter/stage_config.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Example stage config for diffusers backend -# This config demonstrates serving Stable Diffusion 1.5 via the diffusers adapter. -# Users should copy and modify this for their own models. 
- -model_type: diffusion - -stage_args: - - stage_id: 0 - stage_type: diffusion - engine_args: - model_stage: diffusion - model: "stable-diffusion-v1-5/stable-diffusion-v1-5" - distributed_executor_backend: "mp" - # gpu_memory_utilization: 0.9 - engine_output_type: image - # Select the HF diffusers adapter - diffusion_load_format: "diffusers" - # model_class_name: "DiffusersAdapterPipeline" # default when diffusion_load_format is diffusers - diffusers_load_kwargs: - # Passed to DiffusionPipeline.from_pretrained(). - # Good for model-specific loading parameters not covered by OmniDiffusionConfig. - # During model load time, parameters here override their counterparts in the vLLM-Omni interface. - use_safetensors: true - diffusers_call_kwargs: - # Passed to pipeline.__call__(). - # Good for model-specific sampling parameters not covered by OmniDiffusionSamplingParams. - # During request time, parameters here are overridden by the counterparts in OmniDiffusionSamplingParams. - num_inference_steps: 30 - guidance_scale: 7.5 - final_output: true - final_output_type: image diff --git a/examples/online_serving/dynin_omni/README.md b/examples/online_serving/dynin_omni/README.md deleted file mode 100644 index d8526d42373..00000000000 --- a/examples/online_serving/dynin_omni/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# Dynin-Omni Online Serving Example - -## Installation - -Please refer to [README.md](../../../README.md). 
- -## Launch the Server - -First, find the `transformers_modules` path: - -```bash -python - <<'PY' -from transformers.utils.hub import HF_MODULES_CACHE -print(HF_MODULES_CACHE) -PY -``` - -Then export it for both `PYTHONPATH` and `HF_MODULES_CACHE`: - -```bash -export PYTHONPATH=:$PYTHONPATH -export HF_MODULES_CACHE= -``` - -Run from repository root: - -```bash -vllm-omni serve snu-aidas/Dynin-Omni \ - --omni \ - --port 8091 \ - --stage-configs-path "$(pwd)/vllm_omni/model_executor/stage_configs/dynin_omni.yaml" -``` - -If `vllm-omni` is not in PATH, run: - -```bash -PYTHONPATH="$(pwd)" python -m vllm_omni.entrypoints.cli.main serve snu-aidas/Dynin-Omni \ - --omni \ - --port 8091 \ - --stage-configs-path "$(pwd)/vllm_omni/model_executor/stage_configs/dynin_omni.yaml" -``` - -Wait until the server logs show both `All stages initialized successfully` and -`Application startup complete.` before sending requests. - -## Send Requests via Python Client - -Move to the example directory: - -```bash -cd examples/online_serving/dynin_omni -``` - -### Text -> Image - -```bash -python openai_chat_completion_client_for_multimodal_generation.py \ - --query-type t2i \ - --prompt "A realistic indoor living room with natural daylight." -``` - -### Image -> Image - -```bash -python openai_chat_completion_client_for_multimodal_generation.py \ - --query-type i2i \ - --image-path ../../offline_inference/dynin_omni/data/image/sofa_under_water.jpg \ - --prompt "Transform this surreal underwater setting into a realistic indoor living room while preserving the sofa layout." -``` - -### Text -> Speech - -```bash -python openai_chat_completion_client_for_multimodal_generation.py \ - --query-type t2s \ - --prompt "Hello. This is Dynin-omni." 
-``` - -## CLI Arguments - -- `--query-type` (`t2i|t2s|i2i`) -- `--model` (default: `snu-aidas/Dynin-Omni`) -- `--host` / `--port` (OpenAI-compatible vLLM endpoint) -- `--prompt` (custom text) -- `--image-path` (required for `i2i`) -- `--modalities` (optional output modalities override) -- `--output-dir` (default: `/tmp/dynin_online_outputs`) - -## Notes - -- This client currently supports only `t2i`, `t2s`, and `i2i`. -- `t2t` is intentionally not exposed in this online example. -- This example intentionally uses the OpenAI-compatible chat completion endpoint. -- Task routing for non-text outputs relies on Dynin task trigger tokens (`<|t2i|>`, `<|i2i|>`, `<|t2s|>`) injected by the client. -- Outputs are saved under `/tmp/dynin_online_outputs` by default. -- Dynin stage-0 warmup can take a while on first startup; do not send requests before startup completes. -- Dynin itself can execute text-returning tasks such as `t2t`, `s2t`, `i2t`, and `v2t`, but this online serving example currently runs stage-0 in `generation` mode. In that path, the generation worker does not surface the final text as `output.text`, so OpenAI chat responses for those text-output tasks may complete internally but still return empty text. 
diff --git a/examples/online_serving/dynin_omni/openai_chat_completion_client_for_multimodal_generation.py b/examples/online_serving/dynin_omni/openai_chat_completion_client_for_multimodal_generation.py deleted file mode 100644 index 9728555431f..00000000000 --- a/examples/online_serving/dynin_omni/openai_chat_completion_client_for_multimodal_generation.py +++ /dev/null @@ -1,342 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import argparse -import base64 -import json -import mimetypes -import os -import time -from pathlib import Path -from typing import Any - -DEFAULT_MODEL = "snu-aidas/Dynin-Omni" -DEFAULT_OUTPUT_DIR = "/tmp/dynin_online_outputs" - -QUERY_CHOICES = ("t2i", "t2s", "i2i") -DEFAULT_PROMPT_BY_QUERY = { - "t2i": "A high quality detailed living room interior photo.", - "t2s": "Please read this sentence naturally: Hello from Dynin-Omni online serving.", - "i2i": "Transform this image into a realistic indoor living room while preserving layout.", -} -DEFAULT_MODALITIES_BY_QUERY = { - "t2i": ["image"], - "t2s": ["audio"], - "i2i": ["image"], -} -OFFLINE_PARITY_STAGE_COUNT = 3 -OFFLINE_PARITY_STAGE_SAMPLING = { - "max_tokens": 1, - "temperature": 0.0, - "top_p": 1.0, - "detokenize": False, -} - - -def _infer_mime_type(path: Path) -> str: - mime_type, _ = mimetypes.guess_type(str(path)) - return mime_type or "application/octet-stream" - - -def _encode_file_as_data_url(path: Path) -> str: - mime_type = _infer_mime_type(path) - raw = path.read_bytes() - encoded = base64.b64encode(raw).decode("utf-8") - return f"data:{mime_type};base64,{encoded}" - - -def _to_image_url(path_or_url: str) -> str: - value = str(path_or_url) - if value.startswith(("http://", "https://", "data:image/")): - return value - path = Path(value).expanduser().resolve() - if not path.exists(): - raise FileNotFoundError(f"Image file not found: {path}") - return _encode_file_as_data_url(path) - - -def 
_build_user_content(query_type: str, prompt: str, image_path: str | None) -> list[dict[str, Any]]: - if query_type == "t2i": - return [{"type": "text", "text": f"<|t2i|> {prompt}"}] - - if query_type == "t2s": - return [{"type": "text", "text": f"<|t2s|> {prompt}"}] - - if query_type == "i2i": - if not image_path: - raise ValueError("--image-path is required for query type i2i") - return [ - {"type": "text", "text": f"<|i2i|> {prompt}"}, - {"type": "image_url", "image_url": {"url": _to_image_url(image_path)}}, - ] - - raise ValueError(f"Unsupported query_type: {query_type}") - - -def _collect_text_from_content(content: Any) -> list[str]: - texts: list[str] = [] - if isinstance(content, str): - stripped = content.strip() - if stripped: - texts.append(stripped) - return texts - - if isinstance(content, dict): - for key in ("text", "content", "value", "output_text"): - text_value = content.get(key) - if isinstance(text_value, str) and text_value.strip(): - texts.append(text_value.strip()) - return texts - - if isinstance(content, list): - for item in content: - texts.extend(_collect_text_from_content(item)) - return texts - - content_text = getattr(content, "text", None) - if isinstance(content_text, str) and content_text.strip(): - texts.append(content_text.strip()) - content_value = getattr(content, "content", None) - if isinstance(content_value, str) and content_value.strip(): - texts.append(content_value.strip()) - output_text = getattr(content, "output_text", None) - if isinstance(output_text, str) and output_text.strip(): - texts.append(output_text.strip()) - return texts - - -def _extract_text_outputs(chat_completion: Any) -> list[str]: - texts: list[str] = [] - for choice in getattr(chat_completion, "choices", []) or []: - message = getattr(choice, "message", None) - if message is None: - continue - content = getattr(message, "content", None) - texts.extend(_collect_text_from_content(content)) - reasoning_content = getattr(message, "reasoning_content", None) - 
if isinstance(reasoning_content, str) and reasoning_content.strip(): - texts.append(reasoning_content.strip()) - choice_text = getattr(choice, "text", None) - if isinstance(choice_text, str) and choice_text.strip(): - texts.append(choice_text.strip()) - top_level_output_text = getattr(chat_completion, "output_text", None) - if isinstance(top_level_output_text, str) and top_level_output_text.strip(): - texts.append(top_level_output_text.strip()) - return texts - - -def _extract_image_data_urls(chat_completion: Any) -> list[str]: - urls: list[str] = [] - for choice in getattr(chat_completion, "choices", []) or []: - message = getattr(choice, "message", None) - if message is None: - continue - content = getattr(message, "content", None) - if not isinstance(content, list): - continue - for item in content: - if not isinstance(item, dict): - continue - if item.get("type") != "image_url": - continue - image_url = (item.get("image_url") or {}).get("url") - if isinstance(image_url, str) and image_url.startswith("data:image"): - urls.append(image_url) - return urls - - -def _extract_audio_payloads(chat_completion: Any) -> list[bytes]: - payloads: list[bytes] = [] - for choice in getattr(chat_completion, "choices", []) or []: - message = getattr(choice, "message", None) - if message is None: - continue - message_audio = getattr(message, "audio", None) - if message_audio is None: - continue - data_b64 = getattr(message_audio, "data", None) - if isinstance(data_b64, str) and data_b64: - try: - payloads.append(base64.b64decode(data_b64)) - except Exception: - continue - return payloads - - -def _decode_data_url(data_url: str) -> tuple[bytes, str]: - header, data = data_url.split(",", 1) - mime_type = "image/png" - if ";" in header and ":" in header: - mime_type = header.split(":", 1)[1].split(";", 1)[0] - return base64.b64decode(data), mime_type - - -def _image_extension_from_mime(mime_type: str) -> str: - if mime_type == "image/jpeg": - return ".jpg" - if mime_type == 
"image/webp": - return ".webp" - if mime_type == "image/gif": - return ".gif" - return ".png" - - -def _save_outputs( - *, - query_type: str, - chat_completion: Any, - output_dir: Path, -) -> None: - output_dir.mkdir(parents=True, exist_ok=True) - stamp = time.strftime("%Y%m%d_%H%M%S") - - text_outputs = _extract_text_outputs(chat_completion) - image_data_urls = _extract_image_data_urls(chat_completion) - audio_payloads = _extract_audio_payloads(chat_completion) - - if text_outputs: - text_path = output_dir / f"{query_type}_{stamp}.txt" - text_path.write_text("\n\n".join(text_outputs) + "\n", encoding="utf-8") - print(f"[dynin-online] text saved: {text_path}") - print(text_outputs[0]) - - for idx, image_url in enumerate(image_data_urls): - image_bytes, mime_type = _decode_data_url(image_url) - ext = _image_extension_from_mime(mime_type) - image_path = output_dir / f"{query_type}_{stamp}_{idx}{ext}" - image_path.write_bytes(image_bytes) - print(f"[dynin-online] image saved: {image_path}") - - for idx, audio_bytes in enumerate(audio_payloads): - audio_path = output_dir / f"{query_type}_{stamp}_{idx}.wav" - audio_path.write_bytes(audio_bytes) - print(f"[dynin-online] audio saved: {audio_path}") - - if not text_outputs and not image_data_urls and not audio_payloads: - print("[dynin-online] no output extracted from response") - raw_path = output_dir / f"{query_type}_{stamp}_raw_response.json" - try: - if hasattr(chat_completion, "model_dump_json"): - serialized = chat_completion.model_dump_json(indent=2) - else: - if hasattr(chat_completion, "model_dump"): - raw_payload: Any = chat_completion.model_dump(mode="json") - else: - raw_payload = chat_completion - try: - serialized = json.dumps(raw_payload, ensure_ascii=False, indent=2) - except Exception: - serialized = json.dumps({"repr": repr(raw_payload)}, ensure_ascii=False, indent=2) - raw_path.write_text(serialized + "\n", encoding="utf-8") - print(f"[dynin-online] raw response saved: {raw_path}") - except Exception: - 
pass - - -def _build_offline_parity_sampling_params_list() -> list[dict[str, Any]]: - return [dict(OFFLINE_PARITY_STAGE_SAMPLING) for _ in range(OFFLINE_PARITY_STAGE_COUNT)] - - -def run_request(args: argparse.Namespace) -> None: - from openai import OpenAI - - client = OpenAI( - api_key="EMPTY", - base_url=f"http://{args.host}:{args.port}/v1", - ) - prompt = args.prompt.strip() if args.prompt else DEFAULT_PROMPT_BY_QUERY[args.query_type] - user_content = _build_user_content( - query_type=args.query_type, - prompt=prompt, - image_path=args.image_path, - ) - if args.modalities: - modalities = [item.strip() for item in args.modalities.split(",") if item.strip()] - else: - modalities = DEFAULT_MODALITIES_BY_QUERY[args.query_type] - - extra_body = { - "sampling_params_list": _build_offline_parity_sampling_params_list(), - } - chat_completion = client.chat.completions.create( - model=args.model, - messages=[{"role": "user", "content": user_content}], - modalities=modalities, - extra_body=extra_body, - ) - _save_outputs( - query_type=args.query_type, - chat_completion=chat_completion, - output_dir=Path(args.output_dir).expanduser(), - ) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Dynin-Omni online chat completion client") - parser.add_argument( - "--query-type", - "-q", - type=str, - default="t2i", - choices=QUERY_CHOICES, - help="Dynin query type", - ) - parser.add_argument( - "--model", - "-m", - type=str, - default=DEFAULT_MODEL, - help="Model name/path", - ) - parser.add_argument( - "--host", - type=str, - default="localhost", - help="Host/IP of the vLLM Omni API server", - ) - parser.add_argument( - "--port", - type=int, - default=8091, - help="Port of the vLLM Omni API server", - ) - parser.add_argument( - "--prompt", - "-p", - type=str, - default="", - help="Custom prompt text", - ) - parser.add_argument( - "--image-path", - "-i", - type=str, - default=None, - help="Image path/URL for i2i", - ) - 
parser.add_argument( - "--modalities", - type=str, - default="", - help="Comma-separated output modalities override (e.g., text,image,audio)", - ) - parser.add_argument( - "--output-dir", - "-o", - type=str, - default=DEFAULT_OUTPUT_DIR, - help="Directory to save outputs", - ) - return parser.parse_args() - - -def main() -> None: - args = parse_args() - os.environ.setdefault("HF_HUB_DISABLE_XET", "1") - run_request(args) - - -if __name__ == "__main__": - main() diff --git a/examples/online_serving/fish_speech/README.md b/examples/online_serving/fish_speech/README.md index 9b4e3cc403d..ae968d3bada 100644 --- a/examples/online_serving/fish_speech/README.md +++ b/examples/online_serving/fish_speech/README.md @@ -29,12 +29,15 @@ Features: ## Launch the Server ```bash -vllm serve fishaudio/s2-pro --omni --port 8091 +vllm-omni serve fishaudio/s2-pro \ + --stage-configs-path vllm_omni/model_executor/stage_configs/fish_speech_s2_pro.yaml \ + --omni \ + --port 8091 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 ``` -The deploy config is auto-loaded from `vllm_omni/deploy/fish_qwen3_omni.yaml` -(the HF `model_type` on the fishaudio checkpoints is `fish_qwen3_omni`). - Or use the convenience script: ```bash diff --git a/examples/online_serving/fish_speech/run_gradio_demo.sh b/examples/online_serving/fish_speech/run_gradio_demo.sh index a0370b9cc88..98a69664437 100755 --- a/examples/online_serving/fish_speech/run_gradio_demo.sh +++ b/examples/online_serving/fish_speech/run_gradio_demo.sh @@ -11,13 +11,18 @@ MODEL="${MODEL:-fishaudio/s2-pro}" PORT="${PORT:-8091}" GRADIO_PORT="${GRADIO_PORT:-7860}" SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" echo "Starting Fish Speech S2 Pro server (port $PORT)..." 
FLASHINFER_DISABLE_VERSION_CHECK=1 \ -vllm serve "$MODEL" \ - --omni \ +vllm-omni serve "$MODEL" \ + --stage-configs-path "$REPO_ROOT/vllm_omni/model_executor/stage_configs/fish_speech_s2_pro.yaml" \ --host 0.0.0.0 \ - --port "$PORT" & + --port "$PORT" \ + --gpu-memory-utilization 0.9 \ + --trust-remote-code \ + --enforce-eager \ + --omni & SERVER_PID=$! cleanup() { diff --git a/examples/online_serving/fish_speech/run_server.sh b/examples/online_serving/fish_speech/run_server.sh index a865daf9378..59c09c7fe05 100755 --- a/examples/online_serving/fish_speech/run_server.sh +++ b/examples/online_serving/fish_speech/run_server.sh @@ -13,7 +13,11 @@ PORT="${PORT:-8091}" echo "Starting Fish Speech S2 Pro server with model: $MODEL" FLASHINFER_DISABLE_VERSION_CHECK=1 \ -vllm serve "$MODEL" \ - --omni \ +vllm-omni serve "$MODEL" \ + --stage-configs-path vllm_omni/model_executor/stage_configs/fish_speech_s2_pro.yaml \ --host 0.0.0.0 \ - --port "$PORT" + --port "$PORT" \ + --gpu-memory-utilization 0.9 \ + --trust-remote-code \ + --enforce-eager \ + --omni diff --git a/examples/online_serving/glm_image/README.md b/examples/online_serving/glm_image/README.md new file mode 100644 index 00000000000..7ba4b501ca9 --- /dev/null +++ b/examples/online_serving/glm_image/README.md @@ -0,0 +1,204 @@ +# GLM-Image Online Serving + +This example demonstrates how to deploy GLM-Image for online image generation using vLLM-Omni. + +## 🛠️ Installation + +Please refer to [README.md](../../../README.md) + +## Run examples (GLM-Image) + +**Note**: These examples work with the default configuration on **2× NVIDIA A100 (80GB)** or equivalent. Stage 0 (AR) and Stage 1 (Diffusion) each use one GPU by default. For single-GPU setups, modify the stage configuration to share the same device. 
+ +### Launch the Server + +```bash +# Use default configuration +vllm serve zai-org/GLM-Image --omni --port 8091 +``` + +Or use the convenience script: + +```bash +cd examples/online_serving/glm_image +bash run_server.sh +``` + +If you have a custom stage configs file: + +```bash +vllm serve zai-org/GLM-Image --omni --port 8091 --stage-configs-path /path/to/glm_image.yaml +``` + +### Send Requests + +Get into the glm_image folder: + +```bash +cd examples/online_serving/glm_image +``` + +Send request via Python: + +```bash +python openai_chat_client.py --prompt "A cute cat sitting on a window sill" +``` + +The Python client supports the following command-line arguments: + +- `--prompt` (or `-p`): Text prompt for generation (default: `A beautiful sunset over the ocean with sailing boats`) +- `--output` (or `-o`): Output file path (default: `glm_image_output.png`) +- `--server` (or `-s`): Server URL (default: `http://localhost:8091`) +- `--image` (or `-i`): Input image path (for image-to-image editing) +- `--height`: Image height in pixels (default: 1024) +- `--width`: Image width in pixels (default: 1024) +- `--steps`: Number of inference steps (default: 50) +- `--guidance-scale`: Classifier-free guidance scale (default: 1.5) +- `--seed`: Random seed (default: 42) +- `--negative`: Negative prompt + +## Modality Control + +GLM-Image supports **text-to-image** and **image-to-image** modes. + +The default yaml configuration deploys AR on GPU 0 and DiT on GPU 1. 
You can use the default configuration file: [`glm_image.yaml`](../../../vllm_omni/model_executor/stage_configs/glm_image.yaml) + +| Mode | Input | Output | Description | +| -------------- | ------------ | ------ | ---------------------------------- | +| Text-to-Image | Text | Image | Generate images from text prompts | +| Image-to-Image | Image + Text | Image | Edit images with text instructions | + +### Text-to-Image + +```bash +python openai_chat_client.py \ + --prompt "A photorealistic mountain landscape at sunset" \ + --height 1024 \ + --width 1024 \ + --output landscape.png + +# Or use the curl script: +bash run_curl_text_to_image.sh "A futuristic city skyline at night" +``` + +### Image-to-Image (Image Editing) + +```bash +python openai_chat_client.py \ + --prompt "Convert this image to watercolor style" \ + --image input.png \ + --output watercolor.png + +# Or use the curl script: +bash run_curl_image_edit.sh input.png "Convert to watercolor style" +``` + +For general-purpose request methods (curl, OpenAI SDK, Python `requests`), see +the [Text-to-Image](../text_to_image/README.md) and +[Image-to-Image](../image_to_image/README.md) guides. + +## Generation Parameters + +When using `/v1/chat/completions`, pass these inside `extra_body` in the curl +JSON, or via the `extra_body` keyword argument in the OpenAI Python SDK. +When using the dedicated `/v1/images/generations` or `/v1/images/edits` +endpoints, pass the supported generation controls as top-level fields directly. +For image dimensions and count, use `size` and `n` rather than `height` or +`width`. 
+ +| Parameter | Type | Default | Description | +| --------------------- | ----- | ------- | ----------------------------------- | +| `height` | int | 1024 | Image height in pixels | +| `width` | int | 1024 | Image width in pixels | +| `num_inference_steps` | int | 50 | Number of diffusion denoising steps | +| `guidance_scale` | float | 1.5 | Classifier-free guidance scale | +| `seed` | int | None | Optional random seed; `/v1/images/*` generates one server-side if omitted | +| `negative_prompt` | str | None | Negative prompt | + +## Response Format + +```json +{ + "id": "chatcmpl-xxx", + "created": 1234567890, + "model": "zai-org/GLM-Image", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,..." + } + } + ] + }, + "finish_reason": "stop" + } + ], + "usage": {} +} +``` + +## Extract Image + +```bash +# From a saved JSON response +cat response.json | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png +``` + +## Architecture + +GLM-Image uses a 2-stage multistage pipeline: + +``` +Stage 0 (AR Model) Stage 1 (Diffusion) +┌───────────────────┐ ┌─────────────────────┐ +│ vLLM-optimized │ prior │ GlmImagePipeline │ +│ GlmImageFor │──tokens──►│ ┌───────────────┐ │ +│ Conditional │ │ │ DiT Denoiser │ │ +│ Generation │ │ └───────┬───────┘ │ +│ (9B AR model) │ │ ▼ │ +└───────────────────┘ │ ┌───────────────┐ │ + ▲ │ │ VAE Decode │──┼──► Image + │ │ └───────────────┘ │ + Text / Image └─────────────────────┘ + Input +``` + +## VRAM Requirements + +| Stage | VRAM | +| :---------------- | :--------------------- | +| Stage-0 (AR) | **~18 GiB + KV Cache** | +| Stage-1 (DiT+VAE) | **~20 GiB** | +| Total | **~38 GiB + KV Cache** | + +## File Description + +| File | Description | +| --------------------------- | ------------------------------------- | +| `run_server.sh` | Server startup script | +| `run_curl_text_to_image.sh` | 
Text-to-image curl example | +| `run_curl_image_edit.sh` | Image-to-image (editing) curl example | +| `openai_chat_client.py` | Python client (t2i + i2i) | + +## FAQ + +- If you encounter OOM errors, adjust `gpu_memory_utilization` in the stage config: + +```yaml +# In glm_image.yaml, reduce from default 0.6: +gpu_memory_utilization: 0.5 +``` + +- The first request may be slow due to model warmup. Subsequent requests will be faster. + +- If you encounter a `Transformers does not recognize this architecture` error, you have to upgrade the `transformers` package to `5.3.0` or above: + +``` +pip install --upgrade transformers +``` diff --git a/examples/online_serving/glm_image/openai_chat_client.py b/examples/online_serving/glm_image/openai_chat_client.py new file mode 100644 index 00000000000..e142b071904 --- /dev/null +++ b/examples/online_serving/glm_image/openai_chat_client.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +""" +GLM-Image OpenAI-compatible chat client for text-to-image and image-to-image. + +Usage: + # Text-to-image + python openai_chat_client.py --prompt "A cute cat" --output output.png + + # Image-to-image (image editing) + python openai_chat_client.py --prompt "Convert to watercolor style" --image input.png --output output.png +""" + +import argparse +import base64 +from pathlib import Path + +import requests + + +def generate_image( + prompt: str, + server_url: str = "http://localhost:8091", + image_path: str | None = None, + height: int = 1024, + width: int = 1024, + steps: int = 50, + guidance_scale: float = 1.5, + seed: int | None = None, + negative_prompt: str | None = None, +) -> bytes | None: + """Generate or edit an image using the chat completions API. 
+ + Args: + prompt: Text description or editing instruction + server_url: Server URL + image_path: Path to input image (for image-to-image editing) + height: Image height in pixels + width: Image width in pixels + steps: Number of inference steps + guidance_scale: Classifier-free guidance scale + seed: Random seed for reproducibility + negative_prompt: Negative prompt + + Returns: + Image bytes or None if failed + """ + # Build message content + content: list[dict] = [{"type": "text", "text": prompt}] + + if image_path: + img_path = Path(image_path) + if not img_path.exists(): + print(f"Error: Image file not found: {image_path}") + return None + b64_data = base64.b64encode(img_path.read_bytes()).decode("utf-8") + suffix = img_path.suffix.lstrip(".").lower() + mime = {"jpg": "jpeg", "jpeg": "jpeg", "png": "png", "webp": "webp"}.get(suffix, "png") + content.append( + { + "type": "image_url", + "image_url": {"url": f"data:image/{mime};base64,{b64_data}"}, + } + ) + + messages = [{"role": "user", "content": content}] + + # Build request payload + extra_body: dict = { + "height": height, + "width": width, + "num_inference_steps": steps, + "guidance_scale": guidance_scale, + } + if seed is not None: + extra_body["seed"] = seed + if negative_prompt: + extra_body["negative_prompt"] = negative_prompt + + payload = {"messages": messages, "extra_body": extra_body} + + # Send request + try: + mode = "image-to-image" if image_path else "text-to-image" + print(f"Sending {mode} request to {server_url}...") + response = requests.post( + f"{server_url}/v1/chat/completions", + headers={"Content-Type": "application/json"}, + json=payload, + timeout=600, + ) + response.raise_for_status() + data = response.json() + + # Extract image from response + choices = data.get("choices", []) + for choice in choices: + choice_content = choice.get("message", {}).get("content") + if isinstance(choice_content, list): + for item in choice_content: + if isinstance(item, dict) and "image_url" in item: 
+ img_url = item["image_url"].get("url", "") + if img_url.startswith("data:image"): + _, b64 = img_url.split(",", 1) + return base64.b64decode(b64) + + print(f"Unexpected response format: {data}") + return None + + except Exception as e: + print(f"Error: {e}") + return None + + +def main(): + parser = argparse.ArgumentParser(description="GLM-Image chat client") + parser.add_argument( + "--prompt", + "-p", + default="A beautiful sunset over the ocean with sailing boats", + help="Text prompt", + ) + parser.add_argument("--output", "-o", default="glm_image_output.png", help="Output file") + parser.add_argument("--server", "-s", default="http://localhost:8091", help="Server URL") + + # Image-to-image + parser.add_argument( + "--image", + "-i", + type=str, + help="Input image path (for image-to-image editing)", + ) + + # Generation parameters + parser.add_argument("--height", type=int, default=1024, help="Image height") + parser.add_argument("--width", type=int, default=1024, help="Image width") + parser.add_argument("--steps", type=int, default=50, help="Inference steps") + parser.add_argument("--guidance-scale", type=float, default=1.5, help="CFG guidance scale") + parser.add_argument("--seed", type=int, default=42, help="Random seed") + parser.add_argument("--negative", help="Negative prompt") + + args = parser.parse_args() + + mode = "image-to-image" if args.image else "text-to-image" + print(f"Mode: {mode}") + print(f"Prompt: {args.prompt}") + if args.image: + print(f"Input image: {args.image}") + + image_bytes = generate_image( + prompt=args.prompt, + server_url=args.server, + image_path=args.image, + height=args.height, + width=args.width, + steps=args.steps, + guidance_scale=args.guidance_scale, + seed=args.seed, + negative_prompt=args.negative, + ) + + if image_bytes: + output_path = Path(args.output) + output_path.write_bytes(image_bytes) + print(f"Image saved to: {output_path}") + print(f"Size: {len(image_bytes) / 1024:.1f} KB") + else: + print("Failed to 
generate image") + exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/glm_image/run_curl_image_edit.sh b/examples/online_serving/glm_image/run_curl_image_edit.sh new file mode 100755 index 00000000000..bb1e851ba32 --- /dev/null +++ b/examples/online_serving/glm_image/run_curl_image_edit.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# GLM-Image image-edit (image-to-image) curl example + +set -euo pipefail + +if [[ $# -lt 2 ]]; then + echo "Usage: $0 \"\" [output_file]" >&2 + exit 1 +fi + +INPUT_IMG=$1 +PROMPT=$2 +SERVER="${SERVER:-http://localhost:8091}" +CURRENT_TIME=$(date +%Y%m%d%H%M%S) +OUTPUT="${3:-glm_image_i2i_${CURRENT_TIME}.png}" + +if [[ ! -f "$INPUT_IMG" ]]; then + echo "Input image not found: $INPUT_IMG" >&2 + exit 1 +fi + +# base64 encode (macOS uses -i, Linux uses -w0) +if [[ "$(uname)" == "Darwin" ]]; then + IMG_B64=$(base64 < "$INPUT_IMG" | tr -d '\n') +else + IMG_B64=$(base64 -w0 "$INPUT_IMG") +fi + +REQUEST_JSON=$( + jq -n --arg prompt "$PROMPT" --arg img "$IMG_B64" '{ + messages: [{ + role: "user", + content: [ + {"type": "text", "text": $prompt}, + {"type": "image_url", "image_url": {"url": ("data:image/png;base64," + $img)}} + ] + }], + extra_body: { + height: 1024, + width: 1024, + num_inference_steps: 50, + guidance_scale: 1.5, + seed: 42 + } + }' +) + +echo "Generating edited image..." 
+echo "Server: $SERVER" +echo "Prompt: $PROMPT" +echo "Input : $INPUT_IMG" +echo "Output: $OUTPUT" + +curl -s "$SERVER/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "$REQUEST_JSON" \ + | jq -r '.choices[0].message.content[0].image_url.url' \ + | cut -d',' -f2- \ + | base64 -d > "$OUTPUT" + +echo "Image saved to: $OUTPUT" diff --git a/examples/online_serving/glm_image/run_curl_text_to_image.sh b/examples/online_serving/glm_image/run_curl_text_to_image.sh new file mode 100755 index 00000000000..aecb6953c45 --- /dev/null +++ b/examples/online_serving/glm_image/run_curl_text_to_image.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# GLM-Image text-to-image curl example + +set -euo pipefail + +PROMPT="${1:-A beautiful sunset over the ocean with sailing boats}" +SERVER="${SERVER:-http://localhost:8091}" +OUTPUT="${OUTPUT:-glm_image_t2i_output.png}" + +echo "Generating image..." +echo "Server: $SERVER" +echo "Prompt: $PROMPT" +echo "Output: $OUTPUT" + +curl -s "$SERVER/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"messages\": [ + {\"role\": \"user\", \"content\": \"$PROMPT\"} + ], + \"extra_body\": { + \"height\": 1024, + \"width\": 1024, + \"num_inference_steps\": 50, + \"guidance_scale\": 1.5, + \"seed\": 42 + } + }" | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > "$OUTPUT" + +echo "Image saved to: $OUTPUT" diff --git a/examples/online_serving/glm_image/run_server.sh b/examples/online_serving/glm_image/run_server.sh new file mode 100755 index 00000000000..b47d9f88504 --- /dev/null +++ b/examples/online_serving/glm_image/run_server.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# GLM-Image online serving startup script + +MODEL="${MODEL:-zai-org/GLM-Image}" +PORT="${PORT:-8091}" + +echo "Starting GLM-Image server..." 
+echo "Model: $MODEL" +echo "Port: $PORT" + +vllm serve "$MODEL" --omni \ + --port "$PORT" diff --git a/examples/online_serving/image_to_image/README.md b/examples/online_serving/image_to_image/README.md index 59b1f0e2c15..789258473fd 100644 --- a/examples/online_serving/image_to_image/README.md +++ b/examples/online_serving/image_to_image/README.md @@ -314,7 +314,6 @@ count, use `size` and `n` rather than `height`, `width`, or | `seed` | int | None | Random seed (reproducible) | | `negative_prompt` | str | None | Negative prompt | | `num_outputs_per_prompt` | int | 1 | Number of images to generate | -| `strength` | float | 0.6 | **Z-Image only** - Denoising start timestep for I2I. Range: [0.0, 1.0]. Lower preserves more of original image. | | `layers` | int | 4 | Number of layers (Qwen-Image-Layered) | | `resolution` | int | 640 | Resolution, 640 or 1024 (Qwen-Image-Layered) | diff --git a/examples/online_serving/image_to_video/README.md b/examples/online_serving/image_to_video/README.md index 285eeb27983..49283bd9a06 100644 --- a/examples/online_serving/image_to_video/README.md +++ b/examples/online_serving/image_to_video/README.md @@ -26,23 +26,6 @@ The script allows overriding: - `CACHE_BACKEND` (default: `none`) - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`) -### Ascend / Local LightX2V Example - -For a local Wan2.2-LightX2V Diffusers directory on Ascend/NPU, you can start the server like this: - -```bash -vllm serve /path/to/Wan2.2-I2V-A14B-LightX2V-Diffusers-Lightning \ - --omni \ - --port 8091 \ - --flow-shift 12 \ - --cfg-parallel-size 1 \ - --ulysses-degree 4 \ - --use-hsdp \ - --trust-remote-code \ - --allowed-local-media-path / \ - --seed 42 -``` - ## Async Job Behavior `POST /v1/videos` is asynchronous. 
It creates a video job and immediately @@ -86,35 +69,10 @@ curl -X POST http://localhost:8091/v1/videos/sync \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ - -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42" \ -o sync_i2v_output.mp4 ``` -For Wan Lightning/Distill checkpoints, pass `{"sample_solver":"euler"}` via `extra_params`. The default solver is `unipc`. - -Example matching the local LightX2V deployment above: - -```bash -curl -sS -X POST http://localhost:8091/v1/videos/sync \ - -H "Accept: video/mp4" \ - -F "prompt=A cat playing with yarn" \ - -F "input_reference=@/path/to/input.jpg" \ - -F "width=832" \ - -F "height=480" \ - -F "num_frames=81" \ - -F "fps=16" \ - -F "num_inference_steps=4" \ - -F "guidance_scale=1.0" \ - -F "guidance_scale_2=1.0" \ - -F "boundary_ratio=0.875" \ - -F "seed=42" \ - -F 'extra_params={"sample_solver":"euler"}' \ - -o ./output.mp4 -``` - -Use `/v1/videos/sync` if you want to write the MP4 directly to a file. `POST /v1/videos` is async and returns job metadata, not inline `b64_json`. - ## Storage Generated video files are stored on local disk by the async video API. 
@@ -138,9 +96,6 @@ export VLLM_OMNI_STORAGE_MAX_CONCURRENCY=8 # Basic image-to-video generation bash run_curl_image_to_video.sh -# Wan Lightning/Distill checkpoints -SAMPLE_SOLVER=euler bash run_curl_image_to_video.sh - # Or execute directly (OpenAI-style multipart) create_response=$(curl -s http://localhost:8091/v1/videos \ -H "Accept: application/json" \ @@ -156,7 +111,6 @@ create_response=$(curl -s http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ - -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42") video_id=$(echo "$create_response" | jq -r '.id') @@ -215,12 +169,9 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ - -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42" ``` -`sample_solver` is supported by Wan2.2 online serving through the existing `extra_params` field, which is merged into the pipeline `extra_args`. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. - ## Create Response Format `POST /v1/videos` returns a job record, not inline base64 video data. diff --git a/examples/online_serving/image_to_video/run_curl_image_to_video.sh b/examples/online_serving/image_to_video/run_curl_image_to_video.sh index 6f6a6f96d59..f4c1496a69a 100644 --- a/examples/online_serving/image_to_video/run_curl_image_to_video.sh +++ b/examples/online_serving/image_to_video/run_curl_image_to_video.sh @@ -7,7 +7,6 @@ INPUT_IMAGE="${INPUT_IMAGE:-../../offline_inference/image_to_video/qwen-bear.png BASE_URL="${BASE_URL:-http://localhost:8099}" OUTPUT_PATH="${OUTPUT_PATH:-wan22_i2v_output.mp4}" NEGATIVE_PROMPT="${NEGATIVE_PROMPT:-}" -SAMPLE_SOLVER="${SAMPLE_SOLVER:-}" POLL_INTERVAL="${POLL_INTERVAL:-2}" if [ ! 
-f "$INPUT_IMAGE" ]; then @@ -35,10 +34,6 @@ if [ -n "${NEGATIVE_PROMPT}" ]; then create_cmd+=(-F "negative_prompt=${NEGATIVE_PROMPT}") fi -if [ -n "${SAMPLE_SOLVER}" ]; then - create_cmd+=(-F "extra_params={\"sample_solver\":\"${SAMPLE_SOLVER}\"}") -fi - create_response="$("${create_cmd[@]}")" video_id="$(echo "${create_response}" | jq -r '.id')" if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then diff --git a/examples/online_serving/mimo_audio/README.md b/examples/online_serving/mimo_audio/README.md index 9c1be7f21c8..9f70d59cbe8 100644 --- a/examples/online_serving/mimo_audio/README.md +++ b/examples/online_serving/mimo_audio/README.md @@ -13,10 +13,10 @@ Please refer to [README.md](../../../README.md) ```bash export MIMO_AUDIO_TOKENIZER_PATH="XiaomiMiMo/MiMo-Audio-Tokenizer" -vllm serve XiaomiMiMo/MiMo-Audio-7B-Instruct --omni \ - --served-model-name "MiMo-Audio-7B-Instruct" \ - --port 18091 \ - --chat-template ./examples/online_serving/mimo_audio/chat_template.jinja +vllm-omni serve XiaomiMiMo/MiMo-Audio-7B-Instruct --omni \ +--served-model-name "MiMo-Audio-7B-Instruct" \ +--port 18091 --stage-configs-path ./vllm_omni/model_executor/stage_configs/mimo_audio.yaml \ +--chat-template ./examples/online_serving/mimo_audio/chat_template.jinja ``` > ⚠️ **Important** > **MiMo-Audio is not compatible with the default chat template.** diff --git a/examples/online_serving/ming_flash_omni/README.md b/examples/online_serving/ming_flash_omni/README.md deleted file mode 100644 index 8b7d03e211a..00000000000 --- a/examples/online_serving/ming_flash_omni/README.md +++ /dev/null @@ -1,95 +0,0 @@ -# Ming-flash-omni 2.0 - -## Installation - -Please refer to [README.md](../../../README.md) - -## Deployment modes - -| Mode | Launch command | Output | -|------|---------------|--------| -| Thinker only (multimodal understanding) | `vllm serve ... --omni` | Text | -| Thinker + Talker (omni-speech) | `vllm serve ... 
--omni --stage-configs-path ming_flash_omni.yaml` | Text + Audio | - -For standalone TTS (talker only), see [`examples/online_serving/ming_flash_omni_tts/`](../ming_flash_omni_tts/). - -## Run examples (Ming-flash-omni 2.0) - -### Launch the Server - -**Thinker only (text output):** -```bash -vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 -``` - -**Thinker + Talker (omni-speech, text + audio output):** -```bash -vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 \ - --stage-configs-path vllm_omni/model_executor/stage_configs/ming_flash_omni.yaml -``` - -Pass `--stage-configs-path /path/to/your_config.yaml` to use a custom stage -config. - -### Send Multi-modal Request - -Shared Python client (supports `text | use_image | use_audio | use_video | -use_mixed_modalities`; pass `--image-path` / `--audio-path` / `--video-path` -for local files or URLs, `--modalities text` for output, `--help` for the -full flag list): - -```bash -python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py \ - --model Jonathan1909/Ming-flash-omni-2.0 \ - --query-type use_mixed_modalities \ - --port 8091 --host localhost \ - --modalities text -``` - -Parameterized curl wrapper in this directory: - -```bash -bash run_curl_multimodal_generation.sh text -bash run_curl_multimodal_generation.sh use_image -bash run_curl_multimodal_generation.sh use_audio -bash run_curl_multimodal_generation.sh use_video -bash run_curl_multimodal_generation.sh use_mixed_modalities -``` - -## Modality control - -| `modalities` | Server config | Output | -|-------------|--------------|--------| -| `["text"]` or omitted | Thinker only | Text | -| `["audio"]` | Thinker + Talker | Audio (speech) | -| `["text", "audio"]` | Thinker + Talker | Text + Audio | - -For ready-to-copy curl examples (text / audio / multimodal input, SSE -streaming, reasoning mode), see the recipe at 
-[`recipes/inclusionAI/Ming-flash-omni-2.0.md`](../../../recipes/inclusionAI/Ming-flash-omni-2.0.md). - -## OpenAI Python SDK — streaming - -```python -from openai import OpenAI - -client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") - -response = client.chat.completions.create( - model="Jonathan1909/Ming-flash-omni-2.0", - messages=[ - {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, - {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"}, - ], - modalities=["text"], - stream=True, -) -for chunk in response: - for choice in chunk.choices: - if hasattr(choice, "delta") and choice.delta.content: - print(choice.delta.content, end="", flush=True) -print() -``` - -The `--stream` flag on the Python client script above shows the same pattern -driven by the shared multimodal client. diff --git a/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh b/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh deleted file mode 100755 index 768a424e451..00000000000 --- a/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Server port -PORT="${PORT:-8091}" -# Default query type -QUERY_TYPE="${1:-text}" - -# Validate query type -if [[ ! 
"$QUERY_TYPE" =~ ^(text|use_audio|use_image|use_video|use_mixed_modalities)$ ]]; then - echo "Error: Invalid query type '$QUERY_TYPE'" - echo "Usage: $0 [text|use_audio|use_image|use_video|use_mixed_modalities]" - echo " text: Text-only query" - echo " use_audio: Audio + Text query" - echo " use_image: Image + Text query" - echo " use_video: Video + Text query" - echo " use_mixed_modalities: Audio + Image + Video + Text query" - exit 1 -fi - -thinker_sampling_params='{ - "temperature": 0.4, - "top_p": 0.9, - "top_k": -1, - "max_tokens": 16384, - "seed": 42, - "detokenize": true, - "repetition_penalty": 1.05 -}' -# Above is optional, it has a default setting in stage_configs of the corresponding model. - -# Define URLs for assets -MARY_HAD_LAMB_AUDIO_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/mary_had_lamb.ogg" -CHERRY_BLOSSOM_IMAGE_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg" -SAMPLE_VIDEO_URL="https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4" - -# Build user content based on query type -case "$QUERY_TYPE" in - text) - user_content='[ - { - "type": "text", - "text": "请详细介绍鹦鹉的生活习性。" - } - ]' - ;; - use_image) - user_content='[ - { - "type": "image_url", - "image_url": { - "url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'" - } - }, - { - "type": "text", - "text": "Describe this image in detail." - } - ]' - ;; - use_audio) - user_content='[ - { - "type": "audio_url", - "audio_url": { - "url": "'"$MARY_HAD_LAMB_AUDIO_URL"'" - } - }, - { - "type": "text", - "text": "Please recognize the language of this speech and transcribe it. Format: oral." - } - ]' - ;; - use_video) - user_content='[ - { - "type": "video_url", - "video_url": { - "url": "'"$SAMPLE_VIDEO_URL"'" - } - }, - { - "type": "text", - "text": "Describe what is happening in this video." 
- } - ]' - ;; - use_mixed_modalities) - user_content='[ - { - "type": "image_url", - "image_url": { - "url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'" - } - }, - { - "type": "audio_url", - "audio_url": { - "url": "'"$MARY_HAD_LAMB_AUDIO_URL"'" - } - }, - { - "type": "text", - "text": "Describe the image, and recognize the language of this speech and transcribe it. Format: oral" - } - ]' - ;; -esac - -echo "Running query type: $QUERY_TYPE" -echo "" - -request_body=$(cat < None: - payload = { - "model": args.model, - "input": args.text, - "response_format": args.response_format, - } - - instructions = args.instructions - if args.instruction_json: - if instructions: - sys.exit("--instructions and --instruction-json are mutually exclusive") - - try: - parsed = json.loads(args.instruction_json) - except json.JSONDecodeError as exc: - sys.exit(f"--instruction-json must be valid JSON: {exc}") - if not isinstance(parsed, dict): - sys.exit("--instruction-json must decode to a JSON object") - # Re-encode with ensure_ascii=False so UTF-8 Chinese keys/values - # arrive at the server intact rather than as \\uXXXX escapes. 
- instructions = json.dumps(parsed, ensure_ascii=False) - if instructions: - payload["instructions"] = instructions - - print(f"Model: {args.model}") - print(f"Text: {args.text}") - print("Generating audio...") - - api_url = f"{args.api_base}/v1/audio/speech" - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {args.api_key}", - } - - with httpx.Client(timeout=300.0) as client: - response = client.post(api_url, json=payload, headers=headers) - - if response.status_code != 200: - print(f"Error: {response.status_code}") - print(response.text) - return - - output_path = args.output or "ming_tts_output.wav" - with open(output_path, "wb") as f: - f.write(response.content) - print(f"Audio saved to: {output_path}") - - -def main(): - parser = argparse.ArgumentParser(description="Ming standalone TTS speech client") - parser.add_argument("--api-base", default=DEFAULT_API_BASE, help="API base URL") - parser.add_argument("--api-key", default=DEFAULT_API_KEY, help="API key") - parser.add_argument("--model", "-m", default=DEFAULT_MODEL, help="Model name or local path") - parser.add_argument("--text", required=True, help="Text to synthesize") - parser.add_argument( - "--response-format", - default="wav", - choices=["wav", "mp3", "flac", "pcm", "aac", "opus"], - help="Audio format (default: wav)", - ) - parser.add_argument("--output", "-o", default=None, help="Output file path") - parser.add_argument( - "--instructions", - default=None, - help="Free-form style description (mapped to caption 风格 on the server).", - ) - parser.add_argument( - "--instruction-json", - default=None, - help=( - "Structured caption JSON forwarded as `instructions`. Accepts Ming " - "caption keys: 方言, 风格, 语速, 基频, 音量, 情感, IP, 说话人, BGM. 
" - ), - ) - args = parser.parse_args() - run_tts(args) - - -if __name__ == "__main__": - main() diff --git a/examples/online_serving/qwen2_5_omni/README.md b/examples/online_serving/qwen2_5_omni/README.md index c528732064a..91aab3b6518 100644 --- a/examples/online_serving/qwen2_5_omni/README.md +++ b/examples/online_serving/qwen2_5_omni/README.md @@ -208,3 +208,11 @@ The gradio script supports the following arguments: - `--ip`: Host/IP for Gradio server (default: 127.0.0.1) - `--port`: Port for Gradio server (default: 7861) - `--share`: Share the Gradio demo publicly (creates a public link) + +### FAQ + +If you encounter error about backend of librosa, try to install ffmpeg with command below. +``` +sudo apt update +sudo apt install ffmpeg +``` diff --git a/examples/online_serving/qwen3_omni/README.md b/examples/online_serving/qwen3_omni/README.md index c85970555f9..c3171e43667 100644 --- a/examples/online_serving/qwen3_omni/README.md +++ b/examples/online_serving/qwen3_omni/README.md @@ -12,221 +12,17 @@ Please refer to [README.md](../../../README.md) vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 ``` -The default deployment configuration, situated at `vllm_omni/deploy/qwen3_omni_moe.yaml`, is resolved and loaded -automatically via the model registry, obviating the `--deploy-config` flag in standard deployment topologies. -Asynchronous chunk streaming operates as **enabled by default** within this bundled configuration. -Additionally, NPU, ROCm, and XPU per-platform configuration deltas are deterministically merged from the -`platforms`: section of the corresponding YAML. - -**Note:** The OpenAI-style **`/v1/realtime`** WebSocket interface (facilitating streaming PCM audio input alongside audio and transcription output) -is currently **unsupported** while the `async_chunk` configuration attribute is enabled. 
-It is requisite to instantiate the default omni architecture or utilize a deployment configuration specifying `async_chunk: false` to facilitate real-time streaming sessions. - -To explicitly utilize a custom deployment YAML, mandate the configuration path accordingly: -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --deploy-config /path/to/your_deploy_config.yaml -``` - -### Launch individual stages (stage-based CLI) - -Use the stage-based CLI when you want to run one stage per process. - -**1. Stage 0 (Thinker + API server)** +To enable async chunking for Qwen3-Omni, launch the server with the command below: ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --port 8091 \ - --stage-id 0 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml ``` -**2. Stage 1 (Talker)** - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 1 \ - --headless \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -**3. Stage 2 (Code2Wav)** - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 2 \ - --headless \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -Append `--deploy-config /path/to/your_deploy_config.yaml` to each node invocation if it is necessary -to explicitly override the bundled deployment YAML schema. - -For standard **unified-process** launcher, stage-specific CLI configuration tuning is conventionally implemented -via the `--stage-overrides` directive, as demonstrated below: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --stage-overrides '{"1": {"gpu_memory_utilization": 0.5}}' -``` - -Conversely, within the stage-based CLI paradigm, `--stage-overrides` modifiers are typically **unnecessary** -for this category of optimization. 
Given that each instantiation strictly initiates a single functional stage, -parameter flags can be systematically assigned directly onto that specific stage's command sequence: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni \ - --stage-id 1 \ - --headless \ - --gpu-memory-utilization 0.5 \ - --omni-master-address 127.0.0.1 \ - --omni-master-port 26000 -``` - -### Tuning deployment parameters - -Most engine knobs (`max_num_batched_tokens`, `max_model_len`, `enforce_eager`, -`gpu_memory_utilization`, `tensor_parallel_size`, …) can be tuned without -editing the YAML. There are three layers, in increasing specificity: - -#### 1. Global CLI flags (apply to every stage) - +If you have a custom stage configs file, launch the server with the command below: ```bash -# Tighter memory budget on a smaller GPU -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --gpu-memory-utilization 0.85 - -# Disable cudagraphs (e.g. for debugging) -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --enforce-eager - -# Reduce context length -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --max-model-len 32768 - -# Toggle prefix caching on every stage (yaml default: off) -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --enable-prefix-caching -# ...or force it off if the yaml turned it on -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --no-enable-prefix-caching - -# Toggle pipeline-wide async chunked streaming between stages -# (yaml default for qwen3_omni_moe: on) -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --no-async-chunk +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file ``` -For the TTS counterpart (synchronous codec variant), see -[qwen3_tts README](../qwen3_tts/README.md#sync-vs-async-chunk-mode). - -Explicit CLI flags **override** the deploy YAML (which itself overrides the -parser defaults). 
If you don't pass a flag, the YAML value wins. - -> **Note on `--no-async-chunk`**: Flips the deploy yaml's `async_chunk:` -> bool. Pipelines that implement alternate processor functions for -> chunked vs end-to-end modes (e.g. qwen3_tts code2wav) dispatch -> automatically based on that bool — no extra flag or variant yaml is -> needed. - -> ⚠️ **For multi-stage models that share GPUs (qwen3_omni_moe by default -> shares cuda:1 between stages 1 and 2), avoid using global memory flags.** -> A global `--gpu-memory-utilization 0.85` would apply to every stage and -> oversubscribe the shared device. Use per-stage overrides instead — see -> below. - -#### 2. Per-stage overrides via `--stage-overrides` (recommended for memory) - -```bash -# Lower stage 1's memory budget; leave others at the YAML default -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --stage-overrides '{ - "1": {"gpu_memory_utilization": 0.5}, - "2": {"max_num_batched_tokens": 65536} - }' -``` - -Per-stage values are always treated as explicit and beat YAML defaults for -the named stage. Other stages keep their YAML values. - -If you switch to the stage-based CLI, the same per-stage tuning can usually be -passed directly on that stage's command instead of using `--stage-overrides`. - -#### 3. Custom deploy YAML - -When per-stage overrides get long, write a small overlay YAML that inherits -from the bundled default: - -```yaml -# my_qwen3_omni_overrides.yaml -base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml - -stages: - - stage_id: 0 - max_num_batched_tokens: 65536 - enforce_eager: true - - stage_id: 1 - gpu_memory_utilization: 0.5 - - stage_id: 2 - max_model_len: 8192 -``` - -Then start the server with `--deploy-config my_qwen3_omni_overrides.yaml`. -The `base_config:` line tells the loader to inherit everything else (stages, -connectors, edges, platforms section) from the bundled production YAML, so -you only need to spell out the deltas. - -#### 4. 
Multi-node deployment (cross-host transfer connector) - -The bundled `qwen3_omni_moe.yaml` uses `SharedMemoryConnector` between stages, -which only works when all stages run on the same physical host. For -**cross-node** deployments, write a small overlay YAML that swaps in a -network-capable connector (e.g. `MooncakeStoreConnector`) and re-points each -stage's connector wiring at it. The connector spec carries your own server -addresses — there is no checked-in default because every cluster is -different. - -```yaml -# my_qwen3_omni_multinode.yaml -base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml - -connectors: - mooncake_connector: - name: MooncakeStoreConnector - extra: - host: "127.0.0.1" - metadata_server: "http://YOUR_METADATA_HOST:8080/metadata" - master: "YOUR_MASTER_HOST:50051" - segment: 512000000 # 512 MB transfer segment - localbuf: 64000000 # 64 MB local buffer - proto: "tcp" - -stages: - - stage_id: 0 - output_connectors: - to_stage_1: mooncake_connector - - stage_id: 1 - input_connectors: - from_stage_0: mooncake_connector - output_connectors: - to_stage_2: mooncake_connector - - stage_id: 2 - input_connectors: - from_stage_1: mooncake_connector -``` - -Then launch with `--deploy-config my_qwen3_omni_multinode.yaml`. Same -pattern works for Qwen2.5-Omni — replace `base_config:` with the path to -`vllm_omni/deploy/qwen2_5_omni.yaml`. - -> ⚠️ Replace `YOUR_METADATA_HOST` / `YOUR_MASTER_HOST` with the actual -> mooncake server addresses for your cluster. The `base_config:` overlay -> inherits all stage budgets, devices, and edges from the bundled prod -> YAML — you only need to spell out the connector swap. 
- ### Send Multi-modal Request Get into the example folder @@ -242,43 +38,38 @@ python examples/online_serving/openai_chat_completion_client_for_multimodal_gene #### Realtime WebSocket client (`openai_realtime_client.py`) -[`openai_realtime_client.py`](./openai_realtime_client.py) connects to **`ws://:/v1/realtime`**, streams a local WAV as **PCM16 mono @ 16 kHz** in fixed-size chunks (OpenAI-style `input_audio_buffer.append` / `commit`), and receives **`response.audio.delta`** (incremental PCM for the reply) plus **`transcription.*`** events. By default it concatenates audio deltas and writes **`--output-wav`** (model output is typically **24 kHz**). Optional **`--delta-dump-dir`** saves each delta as `delta_000001.wav`, … for debugging. - -Streaming input works well for translation-style use cases; if the Thinker runs while input is still incomplete, consider limiting **`max_tokens`** in your session / server defaults to avoid over-generation. +[`openai_realtime_client.py`](./openai_realtime_client.py) connects to **`ws://<host>:<port>/v1/realtime`**, uploads a local audio file as **PCM16 mono @ 16 kHz** chunks (OpenAI-style `input_audio_buffer.append` / `commit`), and prints **streaming transcription** (`transcription.delta` / `transcription.done`). **Dependencies:** ```bash -pip install websockets +pip install websockets librosa numpy ``` +(ffmpeg may be required by `librosa` for some formats; see the FAQ below.) + **From this directory** (`examples/online_serving/qwen3_omni`): ```bash python openai_realtime_client.py \ - --url ws://localhost:8091/v1/realtime \ + --host localhost \ + --port 8091 \ --model Qwen/Qwen3-Omni-30B-A3B-Instruct \ - --input-wav /path/to/input_16k_mono.wav \ - --output-wav realtime_output.wav \ - --delta-dump-dir ./rt_delta_wavs + --audio_path /path/to/your.wav ``` +If `--audio_path` is omitted, the script uses a bundled default clip (`mary_had_lamb` via vLLM assets). 
+ **Arguments:** | Flag | Default | Description | |------|---------|-------------| -| `--url` | `ws://localhost:8091/v1/realtime` | Full WebSocket URL including path | -| `--model` | `Qwen/Qwen3-Omni-30B-A3B-Instruct` | Must match the served model (sent in `session.update`) | -| `--input-wav` | *(required)* | Input WAV: mono, 16-bit PCM, **16 kHz** | -| `--output-wav` | `realtime_output.wav` | Output path for concatenated reply audio | -| `--output-text` | *(optional)* | If set, write final transcription text to this path | -| `--chunk-ms` | `200` | Size of each uploaded audio chunk (milliseconds of audio) | -| `--send-delay-ms` | `0` | Delay between chunk sends (simulate realtime upload) | -| `--delta-dump-dir` | *(optional)* | Directory to write per-`response.audio.delta` WAV files | -| `--num-requests` | `1` | Number of sequential sessions (see `--concurrency`) | -| `--concurrency` | `1` | Max concurrent WebSocket sessions when `--num-requests` > 1 | - -Ensure the server is running **without** `async_chunk` if you use `/v1/realtime`, for example: +| `--host` | `localhost` | API server host | +| `--port` | `8000` | API server port (match your `vllm serve` port, e.g. `8091`) | +| `--model` | `Qwen/Qwen3-Omni-30B-A3B-Instruct` | Must match the served model (also sent in `session.update`) | +| `--audio_path` | *(optional)* | Path to input audio; resampled to 16 kHz mono inside the client | + +Ensure the vLLM-Omni server is running with realtime support for this endpoint, for example: ```bash vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 @@ -314,6 +105,12 @@ bash run_curl_multimodal_generation.sh use_image ### FAQ +If you encounter an error about the librosa audio backend, install ffmpeg with the commands below. +``` +sudo apt update +sudo apt install ffmpeg +``` + ## Modality control You can control output modalities to specify which types of output the model should generate. 
This is useful when you only need text output and want to skip audio generation stages for better performance. @@ -487,7 +284,7 @@ The script supports the following arguments: - `--model`: Model name/path (default: Qwen/Qwen3-Omni-30B-A3B-Instruct) - `--server-port`: Port for vLLM server (default: 8091) - `--gradio-port`: Port for Gradio demo (default: 7861) -- `--deploy-config`: Path to custom deploy config YAML file (optional) +- `--stage-configs-path`: Path to custom stage configs YAML file (optional) - `--server-host`: Host for vLLM server (default: 0.0.0.0) - `--gradio-ip`: IP for Gradio demo (default: 127.0.0.1) - `--share`: Share Gradio demo publicly (creates a public link) @@ -502,7 +299,7 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 If you have custom stage configs file: ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /path/to/deploy_config_file +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file ``` **Step 2: Run the Gradio demo** diff --git a/examples/online_serving/qwen3_omni/openai_realtime_client.py b/examples/online_serving/qwen3_omni/openai_realtime_client.py index 79e30a3f50b..4fa043c481d 100644 --- a/examples/online_serving/qwen3_omni/openai_realtime_client.py +++ b/examples/online_serving/qwen3_omni/openai_realtime_client.py @@ -1,118 +1,81 @@ -"""Realtime client for vLLM-Omni /v1/realtime (audio + text events). - -This client: -1) Reads a local WAV file (must be mono, 16-bit PCM, 16kHz), -2) Streams PCM16 chunks to /v1/realtime with OpenAI-style events, -3) Receives response.audio.* and transcription.* events, -4) Saves synthesized audio to an output WAV file and optional text file. +""" +This script demonstrates how to use the vLLM-Omni Realtime WebSocket API to perform +audio transcription by uploading an audio file. 
-By default each ``response.audio.delta`` is treated as an **incremental PCM** -chunk and all chunks are concatenated into the final ``--output-wav``. +Before running this script, you must start the vLLM-Omni server with a realtime-capable +model, for example: -Optional debugging: pass ``--delta-dump-dir DIR`` to write every -``response.audio.delta`` payload as ``delta_000001.wav``, ``delta_000002.wav``, … + vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni -Usage: - python openai_realtime_client.py \ - --url ws://localhost:8091/v1/realtime \ - --model Qwen/Qwen3-Omni-30B-A3B-Instruct \ - --input-wav input_16k_mono.wav \ - --output-wav realtime_output.wav \ - --delta-dump-dir ./rt_delta_wavs +Requirements: +- vllm with audio support +- websockets +- librosa +- numpy -Dependencies: - pip install websockets +The script: +1. Connects to the Realtime WebSocket endpoint +2. Converts an audio file to PCM16 @ 16kHz +3. Sends audio chunks to the server +4. Receives and prints transcription as it streams """ -from __future__ import annotations - import argparse import asyncio import base64 import json -import wave -from pathlib import Path - -try: - import websockets -except ImportError: - print("Please install websockets: pip install websockets") - raise SystemExit(1) - - -def _read_wav_pcm16(path: Path) -> bytes: - with wave.open(str(path), "rb") as wf: - nchannels = wf.getnchannels() - sampwidth = wf.getsampwidth() - framerate = wf.getframerate() - comptype = wf.getcomptype() - nframes = wf.getnframes() - - if nchannels != 1: - raise ValueError(f"Input WAV must be mono (got {nchannels} channels).") - if sampwidth != 2: - raise ValueError(f"Input WAV must be 16-bit PCM (got sample width={sampwidth}).") - if framerate != 16000: - raise ValueError(f"Input WAV must be 16kHz (got {framerate} Hz).") - if comptype != "NONE": - raise ValueError(f"Input WAV must be uncompressed PCM (got comptype={comptype}).") - if nframes <= 0: - raise ValueError("Input WAV has no audio 
frames.") - - return wf.readframes(nframes) - - -def _write_wav_pcm16(path: Path, pcm16_bytes: bytes, sample_rate_hz: int) -> None: - with wave.open(str(path), "wb") as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(sample_rate_hz) - wf.writeframes(pcm16_bytes) - - -async def run_client( - url: str, - model: str, - input_wav: Path, - output_wav: Path, - output_text: Path | None, - chunk_ms: int, - send_delay_ms: int, - delta_dump_dir: Path | None, - request_idx: int = 1, - total_requests: int = 1, -) -> None: - log_prefix = f"[req {request_idx:02d}/{total_requests:02d}] " if total_requests > 1 else "" - pcm16 = _read_wav_pcm16(input_wav) - bytes_per_ms = 16000 * 2 // 1000 # mono PCM16 at 16kHz - chunk_bytes = max(bytes_per_ms * chunk_ms, 2) - incremental_pcm_parts: list[bytes] = [] - output_sample_rate = 24000 - delta_index = 0 - text_chunks: list[str] = [] - final_text: str = "" - - if delta_dump_dir is not None: - delta_dump_dir.mkdir(parents=True, exist_ok=True) - - async with websockets.connect(url, max_size=64 * 1024 * 1024) as ws: - # 1) Validate model. - await ws.send( - json.dumps( - { - "type": "session.update", - "model": model, - } - ) - ) - - # 2) Start generation once (non-final commit). - await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": False})) - - # 3) Stream audio chunks. - for i in range(0, len(pcm16), chunk_bytes): - chunk = pcm16[i : i + chunk_bytes] +import librosa +import numpy as np +import websockets +from vllm.assets.audio import AudioAsset + + +def audio_to_pcm16_base64(audio_path: str) -> str: + """ + Load an audio file and convert it to base64-encoded PCM16 @ 16kHz. 
+ """ + # Load audio and resample to 16kHz mono + audio, _ = librosa.load(audio_path, sr=16000, mono=True) + # Convert to PCM16 + pcm16 = (audio * 32767).astype(np.int16) + # Encode as base64 + return base64.b64encode(pcm16.tobytes()).decode("utf-8") + + +async def realtime_transcribe(audio_path: str, host: str, port: int, model: str): + """ + Connect to the Realtime API and transcribe an audio file. + """ + uri = f"ws://{host}:{port}/v1/realtime" + + async with websockets.connect(uri) as ws: + # Wait for session.created + response = json.loads(await ws.recv()) + if response["type"] == "session.created": + print(f"Session created: {response['id']}") + else: + print(f"Unexpected response: {response}") + return + + # Validate model + await ws.send(json.dumps({"type": "session.update", "model": model})) + + # Signal ready to start + await ws.send(json.dumps({"type": "input_audio_buffer.commit"})) + + # Convert audio file to base64 PCM16 + print(f"Loading audio from: {audio_path}") + audio_base64 = audio_to_pcm16_base64(audio_path) + + # Send audio in chunks (4KB of raw audio = ~8KB base64) + chunk_size = 4096 + audio_bytes = base64.b64decode(audio_base64) + total_chunks = (len(audio_bytes) + chunk_size - 1) // chunk_size + + print(f"Sending {total_chunks} audio chunks...") + for i in range(0, len(audio_bytes), chunk_size): + chunk = audio_bytes[i : i + chunk_size] await ws.send( json.dumps( { @@ -121,212 +84,63 @@ async def run_client( } ) ) - if send_delay_ms > 0: - await asyncio.sleep(send_delay_ms / 1000.0) - # 4) Final commit closes input stream. + # Signal all audio is sent await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True})) + print("Audio sent. Waiting for transcription...\n") - # 5) Receive server events until audio done. + # Receive transcription + print("Transcription: ", end="", flush=True) while True: - message = await ws.recv() - if isinstance(message, bytes): - # We only expect JSON text frames. 
- continue - - event = json.loads(message) - event_type = event.get("type") - - if event_type == "session.created": - continue - - if event_type == "response.audio.delta": - sr = event.get("sample_rate_hz") - if isinstance(sr, int) and sr > 0: - output_sample_rate = sr - audio_b64 = event.get("audio", "") - if audio_b64: - pcm_delta = base64.b64decode(audio_b64) - incremental_pcm_parts.append(pcm_delta) - if delta_dump_dir is not None and pcm_delta: - delta_index += 1 - dump_path = delta_dump_dir / f"delta_{delta_index:06d}.wav" - _write_wav_pcm16(dump_path, pcm_delta, output_sample_rate) - print( - f"{log_prefix}delta dump #{delta_index}: {dump_path} " - f"(pcm bytes={len(pcm_delta)}, sr={output_sample_rate})" - ) - continue - - if event_type == "transcription.delta": - delta = event.get("delta", "") - if delta: - text_chunks.append(delta) - print(delta, end="", flush=True) - continue - - if event_type == "transcription.done": - final_text = event.get("text", "") or "".join(text_chunks) - usage = event.get("usage") - final_text_with_tag = f"Final transcription: {final_text}" - if text_chunks: - print() - print(f"{log_prefix}{final_text_with_tag}") - if usage: - print(f"{log_prefix}text usage: {usage}") - continue - - if event_type == "response.audio.done": + response = json.loads(await ws.recv()) + if response["type"] == "transcription.delta": + print(response["delta"], end="", flush=True) + elif response["type"] == "transcription.done": + print(f"\n\nFinal transcription: {response['text']}") + if response.get("usage"): + print(f"Usage: {response['usage']}") + break + elif response["type"] == "error": + print(f"\nError: {response['error']}") break - if event_type == "error": - raise RuntimeError(f"Server error: {event}") - - all_pcm16 = b"".join(incremental_pcm_parts) - if not all_pcm16: - raise RuntimeError("No audio received from server.") - - output_wav.parent.mkdir(parents=True, exist_ok=True) - _write_wav_pcm16(output_wav, all_pcm16, output_sample_rate) - 
print(f"{log_prefix}Saved realtime audio to: {output_wav} (incremental chunks joined)") - - if output_text is not None: - text_to_save = final_text if final_text else "".join(text_chunks) - output_text.parent.mkdir(parents=True, exist_ok=True) - output_text.write_text(text_to_save, encoding="utf-8") - print(f"{log_prefix}Saved realtime text to: {output_text}") - - -def _indexed_output_path(path: Path | None, index: int, total: int) -> Path | None: - if path is None or total <= 1: - return path - return path.with_name(f"{path.stem}_{index:02d}{path.suffix}") - - -async def run_clients_concurrent( - *, - url: str, - model: str, - input_wav: Path, - output_wav: Path, - output_text: Path | None, - chunk_ms: int, - send_delay_ms: int, - delta_dump_dir: Path | None, - num_requests: int, - concurrency: int, -) -> None: - sem = asyncio.Semaphore(concurrency) - - async def _run_one(index: int) -> tuple[int, bool, str | None]: - per_output_wav = _indexed_output_path(output_wav, index, num_requests) - per_output_text = _indexed_output_path(output_text, index, num_requests) - per_delta_dir = None - if delta_dump_dir is not None: - per_delta_dir = delta_dump_dir / f"req_{index:02d}" - async with sem: - try: - await run_client( - url=url, - model=model, - input_wav=input_wav, - output_wav=per_output_wav, - output_text=per_output_text, - chunk_ms=chunk_ms, - send_delay_ms=send_delay_ms, - delta_dump_dir=per_delta_dir, - request_idx=index, - total_requests=num_requests, - ) - return index, True, None - except Exception as exc: - return index, False, str(exc) - tasks = [asyncio.create_task(_run_one(i), name=f"rt-client-{i}") for i in range(1, num_requests + 1)] - results = await asyncio.gather(*tasks) +def main(args): + if args.audio_path: + audio_path = args.audio_path + else: + # Use default audio asset + audio_path = str(AudioAsset("mary_had_lamb").get_local_path()) + print(f"No audio path provided, using default: {audio_path}") - failed = [(idx, err) for idx, ok, err in results 
if not ok] - succeeded = num_requests - len(failed) - print(f"[summary] succeeded={succeeded}, failed={len(failed)}, total={num_requests}") - if failed: - for idx, err in failed: - print(f"[summary] req {idx:02d} failed: {err}") - raise RuntimeError(f"{len(failed)} concurrent request(s) failed") + asyncio.run(realtime_transcribe(audio_path, args.host, args.port, args.model)) -def main() -> None: - parser = argparse.ArgumentParser(description="Realtime audio/text client for vLLM-Omni") - parser.add_argument("--url", default="ws://localhost:8091/v1/realtime", help="WebSocket URL") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Realtime WebSocket Transcription Client") parser.add_argument( "--model", + type=str, default="Qwen/Qwen3-Omni-30B-A3B-Instruct", - help="Model name for session.update", + help="Model that is served and should be pinged.", ) - parser.add_argument("--input-wav", required=True, type=Path, help="Input WAV (mono, PCM16, 16kHz)") - parser.add_argument("--output-wav", default=Path("realtime_output.wav"), type=Path, help="Output WAV path") parser.add_argument( - "--output-text", + "--audio_path", + type=str, default=None, - type=Path, - help="Optional output text path for final transcription", + help="Path to the audio file to transcribe.", ) - parser.add_argument("--chunk-ms", type=int, default=200, help="Input chunk size in milliseconds") parser.add_argument( - "--send-delay-ms", - type=int, - default=0, - help="Delay between chunk sends; set >0 to simulate realtime upload", + "--host", + type=str, + default="localhost", + help="vLLM-Omni server host (default: localhost)", ) parser.add_argument( - "--delta-dump-dir", - type=Path, - default=None, - help="If set, each response.audio.delta is saved as delta_NNNNNN.wav under this directory", - ) - parser.add_argument("--num-requests", type=int, default=1, help="Total number of requests to send") - parser.add_argument( - "--concurrency", + "--port", type=int, - default=1, - 
help="Maximum number of concurrent websocket requests", + default=8000, + help="vLLM-Omni server port (default: 8000)", ) args = parser.parse_args() - - if args.num_requests <= 0: - raise ValueError("--num-requests must be >= 1") - if args.concurrency <= 0: - raise ValueError("--concurrency must be >= 1") - concurrency = min(args.concurrency, args.num_requests) - - if args.num_requests == 1: - asyncio.run( - run_client( - url=args.url, - model=args.model, - input_wav=args.input_wav, - output_wav=args.output_wav, - output_text=args.output_text, - chunk_ms=args.chunk_ms, - send_delay_ms=args.send_delay_ms, - delta_dump_dir=args.delta_dump_dir, - ) - ) - else: - asyncio.run( - run_clients_concurrent( - url=args.url, - model=args.model, - input_wav=args.input_wav, - output_wav=args.output_wav, - output_text=args.output_text, - chunk_ms=args.chunk_ms, - send_delay_ms=args.send_delay_ms, - delta_dump_dir=args.delta_dump_dir, - num_requests=args.num_requests, - concurrency=concurrency, - ) - ) - - -if __name__ == "__main__": - main() + main(args) diff --git a/examples/online_serving/qwen3_omni/streaming_video_client.py b/examples/online_serving/qwen3_omni/streaming_video_client.py deleted file mode 100644 index 58f26d24557..00000000000 --- a/examples/online_serving/qwen3_omni/streaming_video_client.py +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Example WebSocket client for the /v1/video/chat/stream endpoint. - -Sends video frames from a local file (or generates synthetic ones), submits a -query, and prints the streamed text response. - -Requirements: - pip install websockets pillow - -Usage: - # With a video file (requires opencv-python): - python streaming_video_client.py --video my_clip.mp4 \\ - --query "What is happening in this video?" 
- - # Synthetic frames (no extra deps): - python streaming_video_client.py \\ - --query "Describe what you see." \\ - --synthetic-frames 10 - - # With audio (Phase 3): - python streaming_video_client.py --video my_clip.mp4 \\ - --audio my_audio.pcm \\ - --query "What is the person saying and doing?" -""" - -from __future__ import annotations - -import argparse -import asyncio -import base64 -import io -import json -import sys - -try: - import websockets -except ImportError: - print("Please install websockets: pip install websockets") - sys.exit(1) - -from PIL import Image - - -def _generate_synthetic_frame(index: int, width: int = 320, height: int = 240) -> bytes: - """Generate a simple synthetic JPEG frame with a colour gradient.""" - r = (index * 37) % 256 - g = (index * 73) % 256 - b = (index * 113) % 256 - img = Image.new("RGB", (width, height), (r, g, b)) - buf = io.BytesIO() - img.save(buf, format="JPEG", quality=80) - return buf.getvalue() - - -def _load_video_frames(path: str, max_frames: int = 64, fps: int = 2) -> list[bytes]: - """Extract frames from a video file using OpenCV.""" - try: - import cv2 - except ImportError: - print("opencv-python is required to read video files: pip install opencv-python") - sys.exit(1) - - cap = cv2.VideoCapture(path) - if not cap.isOpened(): - print(f"Cannot open video: {path}") - sys.exit(1) - - video_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0 - frame_interval = max(1, int(video_fps / fps)) - - frames: list[bytes] = [] - idx = 0 - while len(frames) < max_frames: - ret, frame = cap.read() - if not ret: - break - if idx % frame_interval == 0: - _, buf = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 80]) - frames.append(buf.tobytes()) - idx += 1 - - cap.release() - print(f"Loaded {len(frames)} frames from {path} (interval={frame_interval})") - return frames - - -async def run(args: argparse.Namespace) -> None: - uri = f"ws://{args.host}:{args.port}/v1/video/chat/stream" - - # Prepare frames - if args.video: - frames = 
_load_video_frames(args.video, max_frames=args.max_frames, fps=args.fps) - else: - frames = [_generate_synthetic_frame(i) for i in range(args.synthetic_frames)] - print(f"Generated {len(frames)} synthetic frames") - - # Prepare audio (optional, Phase 3) - audio_data: bytes | None = None - if args.audio: - with open(args.audio, "rb") as f: - audio_data = f.read() - print(f"Loaded audio: {len(audio_data)} bytes") - - async with websockets.connect(uri, max_size=16 * 1024 * 1024) as ws: - # 1. Send session.config - config = { - "type": "session.config", - "model": args.model, - "modalities": ["text", "audio"] if audio_data else ["text"], - "max_frames": args.max_frames, - "num_frames": args.num_sample_frames, - "enable_frame_filter": args.evs, - "frame_filter_threshold": args.evs_threshold, - "use_audio_in_video": bool(audio_data), - } - await ws.send(json.dumps(config)) - print(f"Sent session.config: model={args.model} evs={args.evs}") - - # 2. Send frames - for i, frame in enumerate(frames): - msg = { - "type": "video.frame", - "data": base64.b64encode(frame).decode(), - } - await ws.send(json.dumps(msg)) - if (i + 1) % 10 == 0: - print(f" Sent {i + 1}/{len(frames)} frames") - print(f"Sent all {len(frames)} frames") - - # 3. Send audio chunks (Phase 3) - if audio_data: - chunk_size = 16000 * 2 # 1 second of 16 kHz 16-bit PCM - for offset in range(0, len(audio_data), chunk_size): - chunk = audio_data[offset : offset + chunk_size] - msg = { - "type": "audio.chunk", - "data": base64.b64encode(chunk).decode(), - } - await ws.send(json.dumps(msg)) - print(f"Sent audio in {(len(audio_data) + chunk_size - 1) // chunk_size} chunks") - - # 4. Send query, then immediately send video.done so the server - # knows the session is complete (avoids deadlock where client - # waits for session.done while server waits for video.done). 
- await ws.send(json.dumps({"type": "video.query", "text": args.query})) - print(f"\nQuery: {args.query}") - print("Response: ", end="", flush=True) - - # Signal end of session right after the query. The server will - # process the query first (it's already queued), then handle - # video.done and reply with session.done. - await ws.send(json.dumps({"type": "video.done"})) - - # 5. Receive response until session.done - recv_timeout = 120 # seconds — avoid infinite hang if server stalls - while True: - raw = await asyncio.wait_for(ws.recv(), timeout=recv_timeout) - data = json.loads(raw) - msg_type = data.get("type") - - if msg_type == "response.text.delta": - print(data.get("delta", ""), end="", flush=True) - elif msg_type == "response.text.done": - print() # newline - elif msg_type == "response.evs_stats": - retained = data.get("retained_count", 0) - dropped = data.get("dropped_count", 0) - rate = data.get("drop_rate", 0) - print(f"\nEVS stats: retained={retained} dropped={dropped} drop_rate={rate:.1%}") - elif msg_type == "session.done": - print("Session complete.") - break - elif msg_type == "error": - print(f"\nError: {data.get('message')}") - break - elif msg_type == "response.start": - pass # expected - else: - print(f"\n[unknown message] {data}") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Streaming video chat client") - parser.add_argument("--host", default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--model", default="Qwen/Qwen3-Omni-MoE") - parser.add_argument("--video", help="Path to video file (requires opencv-python)") - parser.add_argument("--audio", help="Path to raw PCM 16kHz audio file (Phase 3)") - parser.add_argument("--query", default="What do you see in this video?") - parser.add_argument( - "--synthetic-frames", type=int, default=10, help="Number of synthetic frames if --video is not set" - ) - parser.add_argument("--max-frames", type=int, default=64) - 
parser.add_argument("--num-sample-frames", type=int, default=16) - parser.add_argument("--fps", type=int, default=2, help="Frame extraction rate from video") - parser.add_argument( - "--no-evs", dest="evs", action="store_false", help="Disable EVS frame filtering (enabled by default)" - ) - parser.set_defaults(evs=True) - parser.add_argument("--evs-threshold", type=float, default=0.95) - args = parser.parse_args() - asyncio.run(run(args)) - - -if __name__ == "__main__": - main() diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index 350fcb71cac..5504b5737a8 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -43,7 +43,7 @@ Then open http://localhost:7860 in your browser. ### Launch the Server -The default deploy config is located at `vllm_omni/deploy/qwen3_tts.yaml` and is loaded automatically by the model registry — no `--deploy-config` flag needed for default use. Platform-specific deltas (NPU, ROCm, XPU) are merged in automatically from the `platforms:` block of the same YAML based on the detected runtime. +The default stage config is located at `vllm_omni/model_executor/stage_configs/qwen3_tts.yaml`. For other platforms (e.g., NPU), refer to `vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml`. ```bash # CustomVoice model (predefined speakers) @@ -70,22 +70,6 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ --port 8091 ``` -#### Sync vs async-chunk mode - -Qwen3-TTS supports both **chunked streaming** (default, lower latency) and -**synchronous end-to-end** modes from the same deploy YAML. 
The bundled -`qwen3_tts.yaml` ships with `async_chunk: true`; flip with `--no-async-chunk` -and the pipeline automatically dispatches to the end-to-end codec processor: - -```bash -vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice --omni --port 8091 \ - --no-async-chunk -``` - -No variant YAML or extra flag is needed — the `StagePipelineConfig` on each -stage declares both processor functions and the runtime picks based on the -`async_chunk:` bool. - Alternatively, use the convenience script: ```bash ./run_server.sh # Default: CustomVoice model @@ -208,6 +192,14 @@ with open("output.wav", "wb") as f: f.write(response.content) ``` +### FAQ + +If you encounter error about backend of librosa, try to install ffmpeg with command below. +``` +sudo apt update +sudo apt install ffmpeg +``` + ## API Reference ### Voices Endpoint @@ -394,54 +386,6 @@ Server -> Client: {"type": "session.done", "total_sentences": 1} ``` -## Choosing an Execution Backend: Uniproc vs Multiprocessing - -Qwen3-TTS stage configs support two execution backends controlled by the -`distributed_executor_backend` engine arg. The performance tradeoff between -them is **both hardware- and task-dependent**, so there is no single best -default (see [#2603](https://github.com/vllm-project/vllm-omni/issues/2603), -[#2604](https://github.com/vllm-project/vllm-omni/pull/2604) for the full -investigation). - -| Backend | Stage config setting | Behaviour | -| ------- | -------------------- | --------- | -| **Uniproc** (default, world_size=1) | `distributed_executor_backend` omitted | Both stages run inside the orchestrator process. Avoids IPC serialisation, D2H copies, and msgpack overhead between stages. | -| **Multiprocessing** | `distributed_executor_backend: "mp"` | Each stage runs in its own subprocess. The Talker can continue decoding while Code2Wav runs the vocoder in parallel, improving pipeline utilisation under concurrency. 
| - -> **Note:** When `distributed_executor_backend` is omitted and `world_size=1`, -> vLLM [automatically uses the uniproc executor](https://github.com/vllm-project/vllm/blob/main/vllm/config/parallel.py#L825). -> When `world_size > 1`, it defaults to `mp`. - -### When uniproc wins - -The uniproc path eliminates inter-process data transfer (D2H copies, -msgpack serialisation/deserialisation, tensor detaching). This matters most -when per-request processing is heavy relative to autoregressive decode. - -The Base cloning task involves reference-audio encoding on every request, making IPC -overhead a larger fraction of total cost. Qwen3-Omni shows a similar pattern. - -### When multiprocessing (`mp`) wins - -For lighter per-request workloads, process-level parallelism between the -Talker and Code2Wav stages dominates. - -CustomVoice is lighter per-request (no reference audio encoding), so the -process-level parallelism of `mp` outweighs its serialisation cost at -concurrency ≥ 4. - -### How to switch - -To use the uniproc executor on a single-GPU setup, pass the -`qwen3_tts_uniproc.yaml` stage config: - -```bash -vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --omni \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml \ - --port 8091 -``` - ## Limitations - **Single request**: Batch processing is not yet optimized for online serving. diff --git a/examples/online_serving/qwen3_tts/batch_speech_client.py b/examples/online_serving/qwen3_tts/batch_speech_client.py index 47fdc3691c7..7d48e650f88 100644 --- a/examples/online_serving/qwen3_tts/batch_speech_client.py +++ b/examples/online_serving/qwen3_tts/batch_speech_client.py @@ -5,13 +5,11 @@ batch level and generate many utterances in the cloned voice without repeating the reference for each item. 
-Start the server (with batch-optimized stage settings for best throughput): +Start the server (with batch-optimized config for best throughput): vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --omni \ - --trust-remote-code \ - --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2}, - "1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml \ + --trust-remote-code Examples: # Batch with a predefined voice diff --git a/examples/online_serving/qwen3_tts/openai_speech_client.py b/examples/online_serving/qwen3_tts/openai_speech_client.py index 77e13b08ed2..4741a47158c 100644 --- a/examples/online_serving/qwen3_tts/openai_speech_client.py +++ b/examples/online_serving/qwen3_tts/openai_speech_client.py @@ -71,7 +71,7 @@ def run_tts_generation(args) -> None: payload = { "model": args.model, "input": args.text, - "voice": args.speaker, + "speaker": args.speaker, "response_format": args.response_format, } diff --git a/examples/online_serving/qwen3_tts/run_gradio_demo.sh b/examples/online_serving/qwen3_tts/run_gradio_demo.sh index d79be3c2abd..bcc0ddb7cf5 100644 --- a/examples/online_serving/qwen3_tts/run_gradio_demo.sh +++ b/examples/online_serving/qwen3_tts/run_gradio_demo.sh @@ -127,7 +127,7 @@ echo "Starting vLLM server..." 
LOG_FILE="/tmp/vllm_tts_server_${SERVER_PORT}.log" vllm-omni serve "$MODEL" \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --host "$SERVER_HOST" \ --port "$SERVER_PORT" \ --gpu-memory-utilization 0.9 \ diff --git a/examples/online_serving/qwen3_tts/run_server.sh b/examples/online_serving/qwen3_tts/run_server.sh index 78dd2c305d3..6f4aa83a0b9 100755 --- a/examples/online_serving/qwen3_tts/run_server.sh +++ b/examples/online_serving/qwen3_tts/run_server.sh @@ -31,7 +31,7 @@ esac echo "Starting Qwen3-TTS server with model: $MODEL" vllm-omni serve "$MODEL" \ - --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ --host 0.0.0.0 \ --port 8091 \ --gpu-memory-utilization 0.9 \ diff --git a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py index 7790fa51276..e6786f8869f 100644 --- a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py +++ b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py @@ -5,7 +5,7 @@ using SLERP and sends the result to the /v1/audio/speech API. 
Requirements: - pip install torch soundfile numpy httpx + pip install torch librosa soundfile numpy httpx Examples: # Extract and save an embedding @@ -143,18 +143,17 @@ def _load_speaker_encoder_weights(encoder: torch.nn.Module, model_path: str) -> def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor: """Compute 128-bin mel spectrogram matching Qwen3-TTS's extraction pipeline.""" - from vllm.multimodal.audio import AudioResampler + import librosa # Resample to 24kHz if needed if sr != 24000: - resampler = AudioResampler(target_sr=24000) - audio = resampler.resample(audio.astype(np.float32), orig_sr=sr) + audio = librosa.resample(audio.astype(np.float32), orig_sr=sr, target_sr=24000) y = torch.from_numpy(audio).unsqueeze(0).float() - from vllm_omni.utils.audio import mel_filter_bank + from librosa.filters import mel as librosa_mel_fn - mel_basis = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000) + mel_basis = torch.from_numpy(librosa_mel_fn(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000)).float() n_fft = 1024 hop_size = 256 @@ -181,9 +180,9 @@ def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor: @torch.inference_mode() def extract_embedding(encoder: torch.nn.Module, audio_path: str, device: str = "cpu") -> np.ndarray: """Extract a 1024-dim speaker embedding from an audio file.""" - from vllm.multimodal.media.audio import load_audio + import librosa - audio, sr = load_audio(audio_path, sr=None, mono=True) + audio, sr = librosa.load(audio_path, sr=None, mono=True) mel = compute_mel_spectrogram(audio, sr).to(device) embedding = encoder(mel.to(next(encoder.parameters()).dtype))[0] return embedding.float().cpu().numpy() diff --git a/examples/online_serving/text_to_image/README.md b/examples/online_serving/text_to_image/README.md index 17d377ea3e2..87b6a56438e 100644 --- a/examples/online_serving/text_to_image/README.md +++ b/examples/online_serving/text_to_image/README.md @@ -231,8 +231,6 
@@ count, use `size` and `n` rather than `height`, `width`, or | `seed` | int | None | Random seed (reproducible) | | `negative_prompt` | str | None | Negative prompt | | `num_outputs_per_prompt` | int | 1 | Number of images to generate | -| `use_system_prompt` | str | None | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or custom text string. Only for HunyuanImage-3.0. | -| `system_prompt` | str | None | Custom system prompt text. Only used when `use_system_prompt` is set to `custom`. Only for HunyuanImage-3.0. | ## Response Format diff --git a/examples/online_serving/text_to_image/openai_chat_client.py b/examples/online_serving/text_to_image/openai_chat_client.py index f3c43086a14..828827aba2d 100644 --- a/examples/online_serving/text_to_image/openai_chat_client.py +++ b/examples/online_serving/text_to_image/openai_chat_client.py @@ -28,8 +28,6 @@ def generate_image( lora_name: str | None = None, lora_scale: float | None = None, lora_int_id: int | None = None, - use_system_prompt: str | None = None, - system_prompt: str | None = None, ) -> bytes | None: """Generate an image using the images generation API. @@ -47,8 +45,6 @@ def generate_image( lora_name: LoRA name (optional, defaults to path stem) lora_scale: LoRA scale factor (default: 1.0) lora_int_id: LoRA integer ID (optional, derived from path if not provided) - use_system_prompt: System prompt for generation. - system_prompt: Custom system prompt. Returns: Image bytes or None if failed @@ -74,10 +70,7 @@ def generate_image( payload["negative_prompt"] = negative_prompt if seed is not None: payload["seed"] = seed - if use_system_prompt is not None: - payload["use_system_prompt"] = use_system_prompt - if system_prompt is not None: - payload["system_prompt"] = system_prompt + # Add LoRA if provided if lora_path: lora_body: dict = { @@ -135,21 +128,9 @@ def main(): default=None, help="LoRA integer id (cache key). 
If omitted, the server derives a stable id from lora_path.", ) - parser.add_argument( - "--use-system-prompt", - type=str, - default=None, - help=( - "System prompt for generation. Use predefined types: 'en_unified', 'en_vanilla', 'en_recaption', 'en_think_recaption', 'dynamic', or 'None'; Or provide custom text string directly. Recommended en_unified. " - ), - ) - parser.add_argument( - "--system-prompt", - type=str, - default=None, - help=("Custom system prompt. Used when --use-system-prompt is custom. "), - ) + args = parser.parse_args() + print(f"Generating image for: {args.prompt}") image_bytes = generate_image( @@ -165,8 +146,6 @@ def main(): lora_name=args.lora_name, lora_scale=args.lora_scale if args.lora_path else None, lora_int_id=args.lora_int_id if args.lora_path else None, - use_system_prompt=args.use_system_prompt, - system_prompt=args.system_prompt, ) if image_bytes: diff --git a/examples/online_serving/text_to_video/README.md b/examples/online_serving/text_to_video/README.md index c01e0602ff9..44e676671fe 100644 --- a/examples/online_serving/text_to_video/README.md +++ b/examples/online_serving/text_to_video/README.md @@ -1,27 +1,16 @@ # Text-To-Video -This example demonstrates how to deploy text-to-video models for online video generation using vLLM-Omni. +This example demonstrates how to deploy the Wan2.2 text-to-video model for online video generation using vLLM-Omni. 
-## Supported Models +## Start Server -| Model | Model ID | -|-------|----------| -| Wan2.1 T2V (1.3B) | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | -| Wan2.1 T2V (14B) | `Wan-AI/Wan2.1-T2V-14B-Diffusers` | -| Wan2.2 T2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | -| LTX-2 | `Lightricks/LTX-2` | - -## Wan2.2 T2V - -### Start Server - -#### Basic Start +### Basic Start ```bash vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni --port 8091 ``` -#### Start with Parameters +### Start with Parameters Or use the startup script: @@ -241,82 +230,3 @@ while true; do sleep 2 done ``` - -## LTX-2 - -### Start Server - -#### Basic Start - -```bash -vllm serve Lightricks/LTX-2 --omni --port 8098 \ - --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0 -``` - -#### Start with Optimization Presets - -Use the LTX-2 startup script with built-in optimization presets: - -```bash -# Baseline (1 GPU, eager) -bash run_server_ltx2.sh baseline - -# 4-GPU Ulysses sequence parallelism (lossless) -bash run_server_ltx2.sh ulysses4 - -# Cache-DiT lossy acceleration (1 GPU, ~1.4× speedup) -bash run_server_ltx2.sh cache-dit - -# Best combo: 4-GPU Ulysses SP + Cache-DiT (~2.2× speedup) -bash run_server_ltx2.sh best-combo -``` - -#### Optimization Benchmarks - -Benchmarked on H800, online serving (480×768, 41 frames, 20 steps, `seed=42`). -"Inference" is the server-reported inference time; excludes HTTP/poll overhead. 
- -| Preset | Server Command | Inference (s) | Speedup | Type | -|--------|---------------|---------------|---------|------| -| `baseline` | `--enforce-eager` | 10.3 | 1.00× | — | -| `compile` | *(default, no --enforce-eager)* | ~10.3 (warm) | ~1.00× | Lossless | -| `ulysses4` | `--enforce-eager --usp 4` | ~10.3 | ~1.00× | Lossless | -| `cache-dit` | `--enforce-eager --cache-backend cache_dit` | 7.4 avg | ~1.4× | Lossy | -| `best-combo` | `--enforce-eager --usp 4 --cache-backend cache_dit` | 4.7 avg | **~2.2×** | Lossless + Lossy | - -**Observations**: -- **torch.compile**: On H800, warm-request inference time matches the eager baseline (~10.3s). - The first request pays ~6s compilation overhead. Benefit depends on model architecture and GPU. -- **Ulysses SP (4 GPU)**: No measurable speedup alone for 41-frame generation at this resolution. - Communication overhead outweighs gains at this sequence length. -- **Cache-DiT**: Inference varies per request (6–10s) due to dynamic caching decisions. - Average is ~7.4s (~1.4× speedup) with slight quality tradeoff. -- **Best combo**: 4-GPU Ulysses SP + Cache-DiT synergize well — Cache-DiT reduces per-step - computation, making the communication overhead of Ulysses SP worthwhile. Average ~4.7s - (~2.2× speedup). -- **FP8 quantization**: Reduces VRAM but does not speed up LTX-2 on H800 (compute-bound). - -**Deployment Recommendations**: -- For **production with quality priority**: use `baseline` with `--enforce-eager` -- For **maximum throughput** (4 GPUs, quality tradeoff): use `best-combo` (~2.2× speedup) -- For **single-GPU throughput**: use `cache-dit` (~1.4× speedup) -- `--enforce-eager` is recommended to avoid torch.compile warmup latency on first request - -### Send Requests (curl) - -```bash -# Using the provided script -bash run_curl_ltx2.sh - -# Or directly -curl -sS -X POST http://localhost:8098/v1/videos \ - -H "Accept: application/json" \ - -F "prompt=A serene lakeside sunrise with mist over the water." 
\ - -F "width=768" \ - -F "height=480" \ - -F "num_frames=41" \ - -F "fps=24" \ - -F "num_inference_steps=20" \ - -F "guidance_scale=3.0" \ - -F "seed=42" -``` diff --git a/examples/online_serving/text_to_video/run_curl_ltx2.sh b/examples/online_serving/text_to_video/run_curl_ltx2.sh deleted file mode 100644 index b82f672eaab..00000000000 --- a/examples/online_serving/text_to_video/run_curl_ltx2.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# -# LTX-2 text-to-video curl example using the async video job API. -# Start the server first: bash run_server_ltx2.sh best-combo - -set -euo pipefail - -BASE_URL="${BASE_URL:-http://localhost:8098}" -OUTPUT_PATH="${OUTPUT_PATH:-ltx2_output.mp4}" -POLL_INTERVAL="${POLL_INTERVAL:-2}" - -PROMPT="${PROMPT:-A serene lakeside sunrise with mist over the water.}" - -create_response=$( - curl -sS -X POST "${BASE_URL}/v1/videos" \ - -H "Accept: application/json" \ - -F "prompt=${PROMPT}" \ - -F "width=768" \ - -F "height=480" \ - -F "num_frames=41" \ - -F "fps=24" \ - -F "num_inference_steps=20" \ - -F "guidance_scale=3.0" \ - -F "seed=42" -) - -video_id="$(echo "${create_response}" | jq -r '.id')" -if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then - echo "Failed to create video job:" - echo "${create_response}" | jq . - exit 1 -fi - -echo "Created video job ${video_id}" -echo "${create_response}" | jq . - -while true; do - status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")" - status="$(echo "${status_response}" | jq -r '.status')" - - case "${status}" in - queued|in_progress) - echo "Video job ${video_id} status: ${status}" - sleep "${POLL_INTERVAL}" - ;; - completed) - echo "${status_response}" | jq . - break - ;; - failed) - echo "Video generation failed:" - echo "${status_response}" | jq . - exit 1 - ;; - *) - echo "Unexpected status response:" - echo "${status_response}" | jq . 
- exit 1 - ;; - esac -done - -curl -sS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}" -echo "Saved video to ${OUTPUT_PATH}" diff --git a/examples/online_serving/text_to_video/run_server_ltx2.sh b/examples/online_serving/text_to_video/run_server_ltx2.sh deleted file mode 100644 index f4597d3cd28..00000000000 --- a/examples/online_serving/text_to_video/run_server_ltx2.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# -# LTX-2 online serving startup script with optimization presets. -# -# Usage: -# bash run_server_ltx2.sh # baseline (1 GPU, eager) -# bash run_server_ltx2.sh ulysses4 # 4-GPU Ulysses SP -# bash run_server_ltx2.sh cache-dit # 1 GPU + Cache-DiT -# bash run_server_ltx2.sh best-combo # 4-GPU Ulysses SP + Cache-DiT -# -# Online serving benchmarks on H800 (480×768, 41 frames, 20 steps): -# baseline : 10.3s inference (1.00×) -# compile : ~10.3s warm (~1.00×) first request +6s warmup -# ulysses4 : ~10.3s (~1.00×) no gain at 41 frames -# cache-dit : 7.4s avg (~1.4×) lossy, variable per request -# best-combo : 4.7s avg (~2.2×) 4-GPU ulysses + cache-dit - -set -euo pipefail - -MODEL="${MODEL:-Lightricks/LTX-2}" -PORT="${PORT:-8098}" -FLOW_SHIFT="${FLOW_SHIFT:-1.0}" -BOUNDARY_RATIO="${BOUNDARY_RATIO:-1.0}" - -PRESET="${1:-baseline}" - -EXTRA_ARGS=() -case "$PRESET" in - baseline) - echo "=== LTX-2 Preset: baseline (1 GPU, enforce-eager) ===" - EXTRA_ARGS+=(--enforce-eager) - ;; - ulysses2) - echo "=== LTX-2 Preset: 2-GPU Ulysses SP (lossless) ===" - EXTRA_ARGS+=(--enforce-eager --usp 2) - ;; - ulysses4) - echo "=== LTX-2 Preset: 4-GPU Ulysses SP (lossless) ===" - EXTRA_ARGS+=(--enforce-eager --usp 4) - ;; - cache-dit) - echo "=== LTX-2 Preset: Cache-DiT (1 GPU, lossy) ===" - EXTRA_ARGS+=(--enforce-eager --cache-backend cache_dit) - ;; - best-combo) - echo "=== LTX-2 Preset: 4-GPU Ulysses SP + Cache-DiT (best combo) ===" - 
EXTRA_ARGS+=(--enforce-eager --usp 4 --cache-backend cache_dit) - ;; - compile) - echo "=== LTX-2 Preset: torch.compile (1 GPU, lossless) ===" - # torch.compile is the default (no --enforce-eager) - ;; - *) - echo "Usage: $0 {baseline|ulysses2|ulysses4|cache-dit|best-combo|compile}" - echo "" - echo "Presets:" - echo " baseline - 1 GPU, eager execution (reference)" - echo " ulysses2 - 2-GPU Ulysses SP (lossless)" - echo " ulysses4 - 4-GPU Ulysses SP (lossless)" - echo " cache-dit - 1 GPU + Cache-DiT (lossy, ~1.4× speedup)" - echo " best-combo - 4-GPU Ulysses SP + Cache-DiT (~2.2× speedup)" - echo " compile - 1 GPU + torch.compile (slower first request)" - echo "" - echo "Environment variables:" - echo " MODEL - Model path (default: Lightricks/LTX-2)" - echo " PORT - Server port (default: 8098)" - echo " FLOW_SHIFT - Scheduler flow shift (default: 1.0)" - echo " BOUNDARY_RATIO - Boundary ratio (default: 1.0)" - exit 1 - ;; -esac - -echo "Model: $MODEL" -echo "Port: $PORT" -echo "Flow shift: $FLOW_SHIFT" -echo "Boundary ratio: $BOUNDARY_RATIO" - -vllm serve "$MODEL" --omni \ - --port "$PORT" \ - --flow-shift "$FLOW_SHIFT" \ - --boundary-ratio "$BOUNDARY_RATIO" \ - "${EXTRA_ARGS[@]}" diff --git a/examples/online_serving/voxcpm/README.md b/examples/online_serving/voxcpm/README.md deleted file mode 100644 index 78e1bf4aaa3..00000000000 --- a/examples/online_serving/voxcpm/README.md +++ /dev/null @@ -1,166 +0,0 @@ -# VoxCPM - -## Prerequisites - -Install VoxCPM in one of these ways: - -```bash -pip install voxcpm -``` - -or point vLLM-Omni to a local VoxCPM source tree: - -```bash -export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/VoxCPM/src -``` - -If the native VoxCPM `config.json` lacks HF metadata such as `model_type`, -prepare a persistent HF-compatible config directory and export: - -```bash -export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config -mkdir -p "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH" -cp "$VOXCPM_MODEL/config.json" 
"$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/config.json" -cp "$VOXCPM_MODEL/generation_config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/generation_config.json" 2>/dev/null || true -python3 -c 'import json, os; p=os.path.join(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"], "config.json"); cfg=json.load(open(p, "r", encoding="utf-8")); cfg["model_type"]="voxcpm"; cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"]); json.dump(cfg, open(p, "w", encoding="utf-8"), indent=2, ensure_ascii=False)' -``` - -The VoxCPM stage configs read `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH` directly. The `python3 -c` form above avoids heredoc/indentation issues in interactive shells. - -## Launch the Server - -Use the async-chunk stage config by default: - -```bash -export VOXCPM_MODEL=/path/to/voxcpm-model -cd examples/online_serving/voxcpm -./run_server.sh -``` - -Use the non-streaming stage config: - -```bash -./run_server.sh sync -``` - -You can also launch the server directly: - -```bash -vllm serve "$VOXCPM_MODEL" \ - --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ - --trust-remote-code \ - --enforce-eager \ - --omni \ - --port 8091 -``` - -## Send Requests - -### Basic text-to-speech - -```bash -python openai_speech_client.py \ - --model "$VOXCPM_MODEL" \ - --text "This is a VoxCPM online text-to-speech example." -``` - -### Voice cloning - -```bash -python openai_speech_client.py \ - --model "$VOXCPM_MODEL" \ - --text "This sentence is synthesized with a cloned voice." \ - --ref-audio /path/to/reference.wav \ - --ref-text "The exact transcript spoken in reference.wav." -``` - -`ref_text` must be the real transcript of the reference audio. Placeholder text or mismatched text will usually degrade quality badly. - -### Streaming PCM output - -```bash -python openai_speech_client.py \ - --model "$VOXCPM_MODEL" \ - --text "This is a streaming VoxCPM request." 
\ - --stream \ - --output voxcpm_stream.pcm -``` - -### Using curl - -```bash -curl -X POST http://localhost:8091/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "model": "OpenBMB/VoxCPM1.5", - "input": "Hello from VoxCPM online serving.", - "response_format": "wav" - }' --output output.wav -``` - -Voice cloning: - -```bash -curl -X POST http://localhost:8091/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "model": "OpenBMB/VoxCPM1.5", - "input": "This sentence uses a cloned voice.", - "ref_audio": "https://example.com/reference.wav", - "ref_text": "The exact transcript spoken in the reference audio.", - "response_format": "wav" - }' --output cloned.wav -``` - -Streaming PCM: - -```bash -curl -X POST http://localhost:8091/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "model": "OpenBMB/VoxCPM1.5", - "input": "This is a streaming VoxCPM request.", - "stream": true, - "response_format": "pcm" - }' --output output.pcm -``` - -## Supported Request Shape - -VoxCPM online serving currently supports: - -- plain text-to-speech -- voice cloning with `ref_audio` + `ref_text` -- `stream=true` with `response_format=pcm` or `wav` - -VoxCPM online serving does not use these generic TTS fields: - -- `voice` -- `instructions` -- `language` -- `speaker_embedding` -- `x_vector_only_mode` - -## Streaming vs Non-Streaming - -- `voxcpm_async_chunk.yaml` enables async-chunk streaming and is best for single-request streaming latency. -- `voxcpm.yaml` performs one-shot latent generation then VAE decode. - -Like native VoxCPM, the async streaming path should be treated as single-request. If you need stable throughput benchmarking, prefer `voxcpm.yaml`. - -Do not use `voxcpm_async_chunk.yaml` for concurrent online streaming or `/v1/audio/speech/batch`. For multiple requests, prefer `voxcpm.yaml`. 
- -## Benchmark - -The serving benchmark reports TTFP and RTF: - -```bash -python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ - --host 127.0.0.1 \ - --port 8091 \ - --num-prompts 10 \ - --max-concurrency 1 \ - --result-dir /tmp/voxcpm_bench -``` - -For the async-chunk server, keep `--max-concurrency 1`. diff --git a/examples/online_serving/voxcpm/openai_speech_client.py b/examples/online_serving/voxcpm/openai_speech_client.py deleted file mode 100644 index c400114e8be..00000000000 --- a/examples/online_serving/voxcpm/openai_speech_client.py +++ /dev/null @@ -1,155 +0,0 @@ -"""OpenAI-compatible client for VoxCPM via /v1/audio/speech. - -Examples: - # Basic text-to-speech - python openai_speech_client.py --text "Hello from VoxCPM" - - # Voice cloning - python openai_speech_client.py \ - --text "This sentence uses the cloned voice." \ - --ref-audio /path/to/reference.wav \ - --ref-text "The exact transcript spoken in the reference audio." - - # Streaming PCM output - python openai_speech_client.py \ - --text "This is a streaming VoxCPM request." 
\ - --stream \ - --output output.pcm -""" - -import argparse -import base64 -import os - -import httpx - -DEFAULT_API_BASE = "http://localhost:8091" -DEFAULT_API_KEY = "EMPTY" -DEFAULT_MODEL = "OpenBMB/VoxCPM1.5" - - -def encode_audio_to_base64(audio_path: str) -> str: - """Encode a local audio file to base64 data URL.""" - if not os.path.exists(audio_path): - raise FileNotFoundError(f"Audio file not found: {audio_path}") - - ext = audio_path.lower().rsplit(".", 1)[-1] - mime_map = { - "wav": "audio/wav", - "mp3": "audio/mpeg", - "flac": "audio/flac", - "ogg": "audio/ogg", - } - mime_type = mime_map.get(ext, "audio/wav") - - with open(audio_path, "rb") as f: - audio_b64 = base64.b64encode(f.read()).decode("utf-8") - return f"data:{mime_type};base64,{audio_b64}" - - -def build_payload(args) -> dict[str, object]: - payload: dict[str, object] = { - "model": args.model, - "input": args.text, - "response_format": "pcm" if args.stream else args.response_format, - } - - if args.ref_audio: - if args.ref_audio.startswith(("http://", "https://", "data:")): - payload["ref_audio"] = args.ref_audio - else: - payload["ref_audio"] = encode_audio_to_base64(args.ref_audio) - if args.ref_text: - payload["ref_text"] = args.ref_text - if args.max_new_tokens is not None: - payload["max_new_tokens"] = args.max_new_tokens - if args.stream: - payload["stream"] = True - - return payload - - -def run_tts(args) -> None: - payload = build_payload(args) - api_url = f"{args.api_base}/v1/audio/speech" - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {args.api_key}", - } - - print(f"Model: {args.model}") - print(f"Text: {args.text}") - if args.ref_audio: - print("Mode: voice cloning") - print(f"Reference audio: {args.ref_audio}") - else: - print("Mode: text-to-speech") - - if args.stream: - output_path = args.output or "voxcpm_output.pcm" - with httpx.Client(timeout=300.0) as client: - with client.stream("POST", api_url, json=payload, headers=headers) as response: - 
if response.status_code != 200: - print(f"Error: {response.status_code}") - print(response.read().decode("utf-8", errors="ignore")) - return - - total_bytes = 0 - with open(output_path, "wb") as f: - for chunk in response.iter_bytes(): - if not chunk: - continue - f.write(chunk) - total_bytes += len(chunk) - print(f"Streamed {total_bytes} bytes to: {output_path}") - return - - with httpx.Client(timeout=300.0) as client: - response = client.post(api_url, json=payload, headers=headers) - - if response.status_code != 200: - print(f"Error: {response.status_code}") - print(response.text) - return - - try: - text = response.content.decode("utf-8") - if text.startswith('{"error"'): - print(f"Error: {text}") - return - except UnicodeDecodeError: - pass - - output_path = args.output or "voxcpm_output.wav" - with open(output_path, "wb") as f: - f.write(response.content) - print(f"Audio saved to: {output_path}") - - -def main(): - parser = argparse.ArgumentParser(description="VoxCPM OpenAI-compatible speech client") - parser.add_argument("--api-base", default=DEFAULT_API_BASE, help="API base URL") - parser.add_argument("--api-key", default=DEFAULT_API_KEY, help="API key") - parser.add_argument("--model", "-m", default=DEFAULT_MODEL, help="Model name or path") - parser.add_argument("--text", required=True, help="Text to synthesize") - parser.add_argument("--ref-audio", default=None, help="Reference audio path, URL, or data URL") - parser.add_argument( - "--ref-text", - default=None, - help="The exact transcript spoken in the reference audio", - ) - parser.add_argument("--stream", action="store_true", help="Enable streaming PCM output") - parser.add_argument( - "--response-format", - default="wav", - choices=["wav", "pcm", "flac", "mp3", "aac", "opus"], - help="Audio format for non-streaming mode (default: wav)", - ) - parser.add_argument("--max-new-tokens", type=int, default=None, help="Maximum tokens to generate") - parser.add_argument("--output", "-o", default=None, 
help="Output file path") - args = parser.parse_args() - run_tts(args) - - -if __name__ == "__main__": - main() diff --git a/examples/online_serving/voxcpm/run_server.sh b/examples/online_serving/voxcpm/run_server.sh deleted file mode 100755 index ab4b6fe854e..00000000000 --- a/examples/online_serving/voxcpm/run_server.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# Launch vLLM-Omni server for VoxCPM online speech serving. -# -# Usage: -# ./run_server.sh # default: async_chunk stage config -# ./run_server.sh async # async_chunk stage config -# ./run_server.sh sync # no-async-chunk stage config -# VOXCPM_MODEL=/path/to/model ./run_server.sh - -set -e - -MODE="${1:-async}" -MODEL="${VOXCPM_MODEL:-OpenBMB/VoxCPM1.5}" - -case "$MODE" in - async) - STAGE_CONFIG="vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml" - ;; - sync) - STAGE_CONFIG="vllm_omni/model_executor/stage_configs/voxcpm.yaml" - ;; - *) - echo "Unknown mode: $MODE" - echo "Supported: async, sync" - exit 1 - ;; -esac - -echo "Starting VoxCPM server with model: $MODEL" -echo "Stage config: $STAGE_CONFIG" - -vllm serve "$MODEL" \ - --stage-configs-path "$STAGE_CONFIG" \ - --host 0.0.0.0 \ - --port 8091 \ - --trust-remote-code \ - --enforce-eager \ - --omni diff --git a/examples/online_serving/voxcpm2/README.md b/examples/online_serving/voxcpm2/README.md deleted file mode 100644 index 9ca2ae708a3..00000000000 --- a/examples/online_serving/voxcpm2/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# VoxCPM2 Online Serving - -Serve VoxCPM2 TTS via the OpenAI-compatible `/v1/audio/speech` endpoint. - -## Start the Server - -```bash -vllm serve openbmb/VoxCPM2 --omni --host 0.0.0.0 --port 8000 -``` - -The deploy config is auto-loaded from `vllm_omni/deploy/voxcpm2.yaml`. Pass -`--deploy-config ` to override, or `--stage-N- ` (e.g. -`--stage-0-max-num-seqs 8`) for per-stage runtime overrides. - -## Zero-shot Synthesis - -```bash -python openai_speech_client.py --text "Hello, this is VoxCPM2." 
-``` - -Or with curl: - -```bash -curl -X POST http://localhost:8000/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{"model": "voxcpm2", "input": "Hello, this is VoxCPM2.", "voice": "default"}' \ - --output output.wav -``` - -## Voice Cloning - -Clone a speaker's voice using a reference audio file: - -```bash -python openai_speech_client.py \ - --text "This should sound like the reference speaker." \ - --ref-audio /path/to/reference.wav -``` - -The `--ref-audio` parameter accepts: -- Local file path (auto-encoded to base64) -- URL (`https://...`) -- Base64 data URI (`data:audio/wav;base64,...`) diff --git a/examples/online_serving/voxcpm2/gradio_demo.py b/examples/online_serving/voxcpm2/gradio_demo.py deleted file mode 100644 index c6706198ae4..00000000000 --- a/examples/online_serving/voxcpm2/gradio_demo.py +++ /dev/null @@ -1,599 +0,0 @@ -"""Gradio demo for VoxCPM2 TTS with gapless streaming audio playback. - -Uses a custom AudioWorklet-based player for gap-free streaming -(adapted from the Qwen3-TTS demo). Audio is streamed from the vLLM -server through a same-origin proxy and played via the Web Audio API's -AudioWorklet, which maintains a FIFO buffer queue and plays samples at -the audio clock rate. 
- -Usage: - # Start the vLLM server first: - vllm serve openbmb/VoxCPM2 --omni --host 0.0.0.0 --port 8000 - - # Then launch the demo: - python gradio_demo.py --api-base http://localhost:8000 -""" - -from __future__ import annotations - -import argparse -import base64 -import io -import json -import logging - -import gradio as gr -import httpx -import numpy as np -import soundfile as sf -from fastapi import FastAPI, Request -from fastapi.responses import Response, StreamingResponse - -logger = logging.getLogger(__name__) - -SAMPLE_RATE = 48000 - -# ── AudioWorklet processor (loaded in browser via Blob URL) ────────── -WORKLET_JS = r""" -class TTSPlaybackProcessor extends AudioWorkletProcessor { - constructor() { - super(); - this.queue = []; - this.buf = null; - this.pos = 0; - this.playing = false; - this.played = 0; - this.port.onmessage = (e) => { - if (e.data && e.data.type === 'clear') { - this.queue = []; this.buf = null; this.pos = 0; this.played = 0; - if (this.playing) { this.playing = false; this.port.postMessage({type:'stopped'}); } - return; - } - this.queue.push(e.data); - }; - } - process(inputs, outputs) { - const out = outputs[0][0]; - for (let i = 0; i < out.length; i++) { - if (!this.buf || this.pos >= this.buf.length) { - if (this.queue.length > 0) { - this.buf = this.queue.shift(); this.pos = 0; - } else { - for (let j = i; j < out.length; j++) out[j] = 0; - if (this.playing) { this.playing = false; this.port.postMessage({type:'stopped', played:this.played}); } - return true; - } - } - out[i] = this.buf[this.pos++] / 32768; - this.played++; - } - if (!this.playing) { this.playing = true; this.port.postMessage({type:'started'}); } - return true; - } -} -registerProcessor('tts-playback-processor', TTSPlaybackProcessor); -""" - -PLAYER_HTML = """ -

-
-
- Ready - -
- - - -
-""" - - -def _build_player_js() -> str: - return f""" - -""" - - -def _encode_audio(audio_data: tuple) -> str: - sr, audio_np = audio_data - if audio_np.dtype in (np.float32, np.float64): - audio_np = np.clip(audio_np, -1.0, 1.0) - audio_np = (audio_np * 32767).astype(np.int16) - elif audio_np.dtype != np.int16: - audio_np = audio_np.astype(np.int16) - buf = io.BytesIO() - sf.write(buf, audio_np, sr, format="WAV") - return f"data:audio/wav;base64,{base64.b64encode(buf.getvalue()).decode()}" - - -def create_app(api_base: str): - app = FastAPI() - _pending: dict[str, dict] = {} - - @app.post("/proxy/v1/audio/speech") - async def proxy_speech(request: Request): - body = await request.json() - req_id = body.get("_req_id") - if req_id and req_id in _pending: - body = _pending.pop(req_id) - logger.info("Proxy: %s", {k: (f"<{len(str(v))} chars>" if k == "ref_audio" else v) for k, v in body.items()}) - try: - client = httpx.AsyncClient(timeout=300) - resp = await client.send( - client.build_request( - "POST", - f"{api_base}/v1/audio/speech", - json=body, - headers={"Authorization": "Bearer EMPTY", "Content-Type": "application/json"}, - ), - stream=True, - ) - except Exception as exc: - logger.exception("Proxy connection error") - await client.aclose() - return Response(content=str(exc), status_code=502) - if resp.status_code != 200: - content = await resp.aread() - await resp.aclose() - await client.aclose() - return Response(content=content, status_code=resp.status_code) - - async def relay(): - try: - async for chunk in resp.aiter_bytes(): - yield chunk - finally: - await resp.aclose() - await client.aclose() - - return StreamingResponse(relay(), media_type="application/octet-stream") - - css = """ - #generate-btn button { width: 100%; } - #streaming-player { border: 1px solid var(--border-color-primary) !important; border-radius: var(--block-radius) !important; padding: var(--block-padding) !important; } - """ - theme = gr.themes.Default( - primary_hue=gr.themes.Color( 
- c50="#f0f5ff", - c100="#dce6f9", - c200="#b8cef3", - c300="#8eb2eb", - c400="#6496e0", - c500="#4A90D9", - c600="#3a7bc8", - c700="#2d66b0", - c800="#1f4f8f", - c900="#163a6e", - c950="#0e2650", - ), - ) - - with gr.Blocks(title="VoxCPM2 TTS Demo") as demo: - gr.HTML(f""" -
- vLLM-Omni -
-

VoxCPM2 Streaming Demo

- - Served by vLLM-Omni - · {api_base} - · 48 kHz - -
-
- """) - - gr.Markdown( - "**Three modes:** " - "**Voice Design** (control instruction only) · " - "**Controllable Cloning** (ref audio + optional style control) · " - "**Ultimate Cloning** (ref audio + transcript for audio continuation)" - ) - - with gr.Row(): - with gr.Column(scale=3): - text_input = gr.Textbox( - label="Target Text", - placeholder="Enter text to synthesize...", - lines=4, - ) - control_instruction = gr.Textbox( - label="Control Instruction (optional)", - placeholder="e.g. A warm young woman / Excited and fast-paced", - lines=2, - info="Describe voice style, emotion, pace. Works for both Voice Design and Controllable Cloning.", - ) - - with gr.Accordion("Voice Cloning", open=False): - ref_audio = gr.Audio( - label="Reference Audio (upload for cloning)", - type="numpy", - sources=["upload", "microphone"], - ) - ref_audio_url = gr.Textbox( - label="or Reference Audio URL", - placeholder="https://example.com/reference.wav", - ) - ultimate_clone = gr.Checkbox( - label="Ultimate Cloning Mode", - value=False, - info="Provide transcript of ref audio for audio continuation (disables control instruction)", - ) - prompt_text = gr.Textbox( - label="Reference Audio Transcript", - placeholder="Transcript of your reference audio (for ultimate cloning)", - lines=2, - visible=False, - ) - - with gr.Row(): - stream_checkbox = gr.Checkbox( - label="Stream (gapless)", - value=True, - info="AudioWorklet streaming", - ) - with gr.Row(): - generate_btn = gr.Button( - "Generate Speech", - variant="primary", - size="lg", - elem_id="generate-btn", - scale=3, - ) - reset_btn = gr.Button("Reset", variant="secondary", size="lg", scale=1) - - with gr.Column(scale=2): - player_html = gr.HTML( - value=PLAYER_HTML, - visible=True, - label="streaming player", - elem_id="streaming-player", - ) - audio_output = gr.Audio( - label="generated audio", - interactive=False, - autoplay=True, - visible=False, - ) - gr.Examples( - examples=[ - ["Hello, this is a VoxCPM2 demo running on 
vLLM-Omni.", ""], - [ - "I have a dream that my four little children will one day live in a nation " - "where they will not be judged by the color of their skin but by the content " - "of their character.", - "", - ], - [ - "I never asked you to stay. It's not like I care or anything. " - "But why does it still hurt so much now that you're gone?", - "A young girl with a soft, sweet voice. Speaks slowly with a melancholic tone.", - ], - ], - inputs=[text_input, control_instruction], - label="examples", - ) - gr.HTML(""" -
- - vLLM-Omni - -
- """) - - hidden_payload = gr.Textbox(visible=False, elem_id="tts-payload") - - def on_ultimate_toggle(checked): - return ( - gr.update(visible=checked), # prompt_text - gr.update(interactive=not checked), # control_instruction - ) - - ultimate_clone.change( - fn=on_ultimate_toggle, - inputs=[ultimate_clone], - outputs=[prompt_text, control_instruction], - ) - - def on_stream_change(stream: bool): - if stream: - return gr.update(visible=True), gr.update(visible=False) - return gr.update(visible=False), gr.update(visible=True) - - stream_checkbox.change( - fn=on_stream_change, - inputs=[stream_checkbox], - outputs=[player_html, audio_output], - ) - - def on_reset(): - return "", "", None, "", False, "", PLAYER_HTML - - reset_btn.click( - fn=on_reset, - outputs=[ - text_input, - control_instruction, - audio_output, - hidden_payload, - ultimate_clone, - prompt_text, - player_html, - ], - js="() => { if (window.ttsStop) window.ttsStop(); }", - ) - - def on_generate(stream_enabled, text, ctrl_instr, ref_a, ref_url, ult_clone, p_text): - import time as _time - - if not text or not text.strip(): - raise gr.Error("Please enter text to synthesize.") - - # VoxCPM2 uses "(instruction)text" format for control - ctrl = ctrl_instr.strip() if ctrl_instr and not ult_clone else "" - final_text = f"({ctrl}){text.strip()}" if ctrl else text.strip() - - payload: dict = { - "input": final_text, - "voice": "default", - "response_format": "pcm" if stream_enabled else "wav", - "stream": stream_enabled, - } - - # Reference audio for cloning - ref_url_s = ref_url.strip() if ref_url else "" - if ref_url_s: - payload["ref_audio"] = ref_url_s - elif ref_a is not None: - payload["ref_audio"] = _encode_audio(ref_a) - - # Ultimate cloning: prompt_audio + prompt_text for continuation - if ult_clone and p_text and p_text.strip(): - if ref_url_s: - payload["prompt_audio"] = ref_url_s - elif ref_a is not None: - payload["prompt_audio"] = payload.get("ref_audio", "") - payload["prompt_text"] = 
p_text.strip() - - if stream_enabled: - if ref_a is not None and not ref_url_s: - req_id = f"req-{int(_time.time() * 1000)}" - _pending[req_id] = payload - browser_payload = {"_req_id": req_id, "_nonce": int(_time.time() * 1000)} - return json.dumps(browser_payload), gr.update() - payload["_nonce"] = int(_time.time() * 1000) - return json.dumps(payload), gr.update() - else: - try: - with httpx.Client(timeout=300.0) as client: - resp = client.post( - f"{api_base}/v1/audio/speech", - json=payload, - headers={"Content-Type": "application/json", "Authorization": "Bearer EMPTY"}, - ) - except httpx.ConnectError: - raise gr.Error(f"Cannot connect to server at {api_base}.") - if resp.status_code != 200: - raise gr.Error(f"Server error ({resp.status_code}): {resp.text[:200]}") - audio_np, sr = sf.read(io.BytesIO(resp.content)) - if audio_np.ndim > 1: - audio_np = audio_np[:, 0] - return "", (sr, audio_np.astype(np.float32)) - - generate_btn.click( - fn=on_generate, - inputs=[ - stream_checkbox, - text_input, - control_instruction, - ref_audio, - ref_audio_url, - ultimate_clone, - prompt_text, - ], - outputs=[hidden_payload, audio_output], - ).then( - fn=lambda p: p, - inputs=[hidden_payload], - outputs=[hidden_payload], - js="(p) => { if (p && p.trim()) { const d = JSON.parse(p); delete d._nonce; window.ttsGenerate(d); } return p; }", - ) - - demo.queue() - - return gr.mount_gradio_app(app, demo, path="/", css=css, theme=theme, head=_build_player_js()) - - -def main(): - parser = argparse.ArgumentParser(description="VoxCPM2 streaming Gradio demo") - parser.add_argument("--api-base", default="http://localhost:8000", help="vLLM API server URL") - parser.add_argument("--host", default="0.0.0.0", help="Gradio server host") - parser.add_argument("--port", type=int, default=7860, help="Gradio server port") - args = parser.parse_args() - - logging.basicConfig(level=logging.INFO) - print(f"Connecting to vLLM server at: {args.api_base}") - - import uvicorn - - 
uvicorn.run(create_app(args.api_base), host=args.host, port=args.port) - - -if __name__ == "__main__": - main() diff --git a/examples/online_serving/voxcpm2/openai_speech_client.py b/examples/online_serving/voxcpm2/openai_speech_client.py deleted file mode 100644 index 127b8cebb09..00000000000 --- a/examples/online_serving/voxcpm2/openai_speech_client.py +++ /dev/null @@ -1,105 +0,0 @@ -"""OpenAI-compatible client for VoxCPM2 TTS via /v1/audio/speech endpoint. - -Examples: - # Zero-shot synthesis - python openai_speech_client.py --text "Hello, this is VoxCPM2." - - # Voice cloning with a local reference audio file - python openai_speech_client.py --text "Hello world" \ - --ref-audio /path/to/reference.wav - - # Voice cloning with a URL - python openai_speech_client.py --text "Hello world" \ - --ref-audio "https://example.com/reference.wav" - -Server setup: - vllm serve openbmb/VoxCPM2 --omni --host 0.0.0.0 --port 8000 -""" - -from __future__ import annotations - -import argparse -import base64 -import os - -import httpx - -DEFAULT_API_BASE = "http://localhost:8000" -DEFAULT_API_KEY = "sk-empty" - - -def encode_audio_to_base64(audio_path: str) -> str: - """Encode a local audio file to a base64 data URL.""" - if not os.path.exists(audio_path): - raise FileNotFoundError(f"Audio file not found: {audio_path}") - - ext = audio_path.lower().rsplit(".", 1)[-1] - mime = { - "wav": "audio/wav", - "mp3": "audio/mpeg", - "flac": "audio/flac", - "ogg": "audio/ogg", - }.get(ext, "audio/wav") - - with open(audio_path, "rb") as f: - b64 = base64.b64encode(f.read()).decode("utf-8") - return f"data:{mime};base64,{b64}" - - -def main() -> None: - parser = argparse.ArgumentParser(description="VoxCPM2 OpenAI speech client") - parser.add_argument("--text", type=str, required=True, help="Text to synthesize") - parser.add_argument( - "--ref-audio", - type=str, - default=None, - help="Reference audio for voice cloning (local path, URL, or data: URI)", - ) - parser.add_argument("--model", 
type=str, default="voxcpm2") - parser.add_argument("--output", type=str, default="output.wav") - parser.add_argument("--api-base", type=str, default=DEFAULT_API_BASE) - parser.add_argument("--api-key", type=str, default=DEFAULT_API_KEY) - parser.add_argument("--response-format", type=str, default="wav") - args = parser.parse_args() - - # VoxCPM2 has no predefined voices. The "voice" field is required by - # the OpenAI API schema but ignored by VoxCPM2 — use any placeholder. - # For voice cloning, pass --ref-audio instead. - payload: dict = { - "model": args.model, - "input": args.text, - "voice": "default", - "response_format": args.response_format, - } - - if args.ref_audio: - ref = args.ref_audio - if ref.startswith(("http://", "https://", "data:")): - payload["ref_audio"] = ref - else: - payload["ref_audio"] = encode_audio_to_base64(ref) - - url = f"{args.api_base}/v1/audio/speech" - print(f"POST {url}") - print(f" text: {args.text}") - if args.ref_audio: - print(f" ref_audio: {args.ref_audio[:80]}...") - - with httpx.Client(timeout=300) as client: - resp = client.post( - url, - json=payload, - headers={"Authorization": f"Bearer {args.api_key}"}, - ) - - if resp.status_code != 200: - print(f"Error {resp.status_code}: {resp.text[:500]}") - return - - with open(args.output, "wb") as f: - f.write(resp.content) - print(f"Saved: {args.output} ({len(resp.content):,} bytes)") - - -if __name__ == "__main__": - main() diff --git a/examples/online_serving/voxtral_tts/gradio_demo.py b/examples/online_serving/voxtral_tts/gradio_demo.py index 7905c62618c..35d6b590c97 100644 --- a/examples/online_serving/voxtral_tts/gradio_demo.py +++ b/examples/online_serving/voxtral_tts/gradio_demo.py @@ -216,7 +216,6 @@ def update_voice_dropdown(language: str) -> gr.Dropdown: def run_inference( voice_name: str, text_prompt: str, - cfg_alpha: float, base_url: str, model: str, ) -> tuple[int, np.ndarray]: @@ -234,7 +233,6 @@ def run_inference( "model": model, "response_format": "wav", 
"voice": voice_name, - "extra_params": {"cfg_alpha": cfg_alpha}, } response = httpx.post( @@ -379,14 +377,6 @@ def main( placeholder="Enter the text you want to synthesize...", lines=4, ) - cfg_alpha_slider = gr.Slider( - minimum=1.0, - maximum=2.0, - step=0.1, - value=1.2, - label="CFG Alpha", - info="Flow-matching guidance strength (default: 1.2)", - ) with gr.Row(): reset_btn = gr.Button("Clear") submit_btn = gr.Button("Generate audio", interactive=False) @@ -425,9 +415,9 @@ def _toggle_submit(text: str): ) # --- Wiring inference + persistence to the button --- - def _on_submit(voice: str, text: str, cfg_alpha: float): + def _on_submit(voice: str, text: str): assert text.strip() != "" - sr, audio_array = run_inference(voice, text, cfg_alpha, base_url, model) + sr, audio_array = run_inference(voice, text, base_url, model) if outputs_dir is not None: share_id, saved_audio_path = _save_example( outputs_dir, @@ -442,7 +432,7 @@ def _on_submit(voice: str, text: str, cfg_alpha: float): submit_btn.click( fn=_on_submit, - inputs=[voice_name, text_prompt, cfg_alpha_slider], + inputs=[voice_name, text_prompt], outputs=[output_audio, share_link_box], ) @@ -456,7 +446,6 @@ def _on_reset(): language, # language_dropdown voice, # voice_name "", # text_prompt - 1.2, # cfg_alpha_slider None, # output_audio gr.update(interactive=False), # submit_btn "", # share_link_box @@ -467,15 +456,7 @@ def _on_reset(): reset_btn.click( fn=make_on_reset(languages, language_voices), inputs=[], - outputs=[ - language_dropdown, - voice_name, - text_prompt, - cfg_alpha_slider, - output_audio, - submit_btn, - share_link_box, - ], + outputs=[language_dropdown, voice_name, text_prompt, output_audio, submit_btn, share_link_box], ) def make_load_from_share(outputs_dir: Path | None, languages: list[str], language_voices: dict[str, list[str]]): diff --git a/mkdocs.yml b/mkdocs.yml index 1e184439bd1..6461c65f220 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -97,7 +97,6 @@ plugins: exclude: - 
"re:vllm_omni\\._.*" # Internal modules - "vllm_omni.diffusion.models.qwen_image" # avoid importing vllm in mkdocs building - - "vllm_omni.diffusion.models.dreamid_omni.wan2_2" # docstring signature warnings break strict docs - "vllm_omni.diffusion.quantization" # avoid importing vllm in mkdocs building - "vllm_omni.quantization" # avoid importing vllm in mkdocs building - "vllm_omni.entrypoints.async_diffusion" # avoid importing vllm in mkdocs building diff --git a/pyproject.toml b/pyproject.toml index 8346693f129..e49aa6e3251 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,12 +55,6 @@ dev = [ "pyttsx3>=2.99", "opencc>=1.2.0", "mistune>=3.2.0", # for example tests - "torchmetrics>=1.4.0", # for accuracy similarity metrics - "jiwer>=3.0.0", - "zhon>=2.0.0", - "zhconv>=1.4.2", - "scipy>=1.10.0", - "funasr>=1.0.0", ] demo = [ @@ -121,13 +115,12 @@ exclude = [ [tool.ruff.lint] select = [ - "E", # pycodestyle errors - "W", # pycodestyle warnings - "F", # pyflakes - "I", # isort (handled separately, but included for compatibility) - "N", # pep8-naming - "UP", # pyupgrade - "TID251", # flake8-tidy-imports.banned-api + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort (handled separately, but included for compatibility) + "N", # pep8-naming + "UP", # pyupgrade ] ignore = [ "E203", # whitespace before ':' (conflicts with black) @@ -142,9 +135,6 @@ ignore = [ "examples/**" = ["E501"] # Allow long lines in examples "tests/**" = ["E501"] # Allow long lines in tests -[tool.ruff.lint.flake8-tidy-imports.banned-api] -"librosa".msg = "The librosa module is banned, use vllm.multimodal helpers instead" - [tool.mypy] python_version = "3.12, 3.13" warn_return_any = true @@ -173,8 +163,7 @@ addopts = [ markers = [ # ci/cd required "core_model: L1&L2 tests (run in each PR)", - "advanced_model: L3 level tests (run on each merge)", - "full_model: L4 level tests (run nightly)", + "advanced_model: L3&L4 level tests (run in each merge or 
nightly)", # function module markers "diffusion: Diffusion model tests", "omni: Omni model tests", @@ -193,7 +182,6 @@ markers = [ "H100: Tests that require H100 GPU", "L4: Tests that require L4 GPU", "MI325: Tests that require MI325 GPU (AMD/ROCm)", - "B60: Tests that require Intel Arc Pro B60 XPU", "S5000: Tests that require S5000 GPU (Moore Threads/MUSA)", "A2: Tests that require A2 NPU", "A3: Tests that require A3 NPU", @@ -236,5 +224,3 @@ ue = "ue" semantics = "semantics" fullset = "fullset" Vai = "Vai" -tockens = "tockens" -CANN = "CANN" diff --git a/recipes/LTX/LTX-2.3.md b/recipes/LTX/LTX-2.3.md deleted file mode 100644 index 8d92562fb04..00000000000 --- a/recipes/LTX/LTX-2.3.md +++ /dev/null @@ -1,112 +0,0 @@ -# LTX-2.3 Text-to-Video with Audio on 1x GPU (96GB VRAM) - -> 22B parameter text-to-video + audio generation model served via vLLM-Omni - -## Summary - -- Vendor: Lightricks -- Model: `dg845/LTX-2.3-Diffusers` -- Task: Text-to-video with synchronized audio generation -- Mode: Online serving (pure diffusion) -- Maintainer: @oglok - -## When to use this recipe - -Use this recipe when you want to serve LTX-2.3 for text-to-video generation -with audio. The model generates videos up to 20+ seconds at 768x512 resolution -with 48kHz audio, all from a single text prompt. Requires a GPU with at least -96GB VRAM due to the 22B parameter transformer (~44GB weights) plus text -encoder, VAE, and vocoder components. 
- -## References - -- Model: -- Requires `diffusers >= 0.38.0` (install from git: `pip install git+https://github.com/huggingface/diffusers.git`) - -## Serving - -### Command - -```bash -vllm serve dg845/LTX-2.3-Diffusers \ - --omni \ - --model-class-name LTX23Pipeline \ - --stage-init-timeout 600 -``` - -### Verification - -```bash -# Health check -curl http://localhost:8000/health - -# Generate a 3-second video (81 frames at 24fps) -curl -X POST http://localhost:8000/v1/videos \ - -F "prompt=A majestic bald eagle soaring over a misty mountain valley at dawn, golden sunlight breaking through clouds" \ - -F "negative_prompt=blurry, low quality, distorted, watermark" \ - -F "model=dg845/LTX-2.3-Diffusers" \ - -F "num_frames=81" \ - -F "fps=24" \ - -F "size=768x512" \ - -F "num_inference_steps=30" \ - -F "guidance_scale=4.0" \ - -F "seed=42" - -# Generate a 10-second video (241 frames) -curl -X POST http://localhost:8000/v1/videos \ - -F "prompt=A cozy Japanese ramen shop at night in the rain, steam rising from bowls, neon signs reflecting on wet cobblestone streets" \ - -F "model=dg845/LTX-2.3-Diffusers" \ - -F "num_frames=241" \ - -F "fps=24" \ - -F "size=768x512" \ - -F "num_inference_steps=30" \ - -F "guidance_scale=4.0" - -# Generate a 20-second video (481 frames) -curl -X POST http://localhost:8000/v1/videos \ - -F "prompt=An underwater coral reef teeming with tropical fish, sea turtles gliding gracefully, National Geographic documentary style" \ - -F "model=dg845/LTX-2.3-Diffusers" \ - -F "num_frames=481" \ - -F "fps=24" \ - -F "size=768x512" \ - -F "num_inference_steps=30" \ - -F "guidance_scale=4.0" -``` - -### Notes - -- Memory usage: Model loads at ~36 GiB, peaks at ~62 GiB during inference -- Key flags: - - `--stage-init-timeout 600`: Required for the initial `torch.compile` warmup (~90-140 seconds on first request) - - `--model-class-name LTX23Pipeline`: Selects the LTX-2.3 pipeline (not LTX-2) -- Audio: 48kHz AAC via BWE vocoder, automatically synced 
with video -- CPU offloading: Text encoder (Gemma-3-12B), connectors, VAE, audio VAE, and vocoder stay on CPU and are moved to GPU only when needed -- Supported resolutions: 768x512, 512x384 (must be divisible by 32) -- Frame rate: 24 fps -- Duration: Controlled by `num_frames` (frames = duration_seconds * 24 + 1) -- Known limitations: - - No image-to-video support yet (LTX23ImageToVideoPipeline is a placeholder) - - No CFG-parallel support (single-GPU only) - - Requires `diffusers >= 0.38.0` (not yet on PyPI, install from git) - -## Hardware Support - -## GPU - -### 1x NVIDIA RTX PRO 6000 Blackwell (96GB) - -#### Environment - -- OS: Ubuntu 22.04 -- Python: 3.10+ -- Driver / runtime: CUDA 13.0, Driver 580.126.09 -- vLLM version: 0.19.x -- vLLM-Omni version: 0.19.x - -### Validated configurations - -| Duration | Frames | Resolution | Steps | Guidance | Inference Time | Peak VRAM | -|----------|--------|------------|-------|----------|----------------|-----------| -| 3s | 81 | 768x512 | 30 | 4.0 | ~110s | ~62 GB | -| 10s | 241 | 768x512 | 30 | 4.0 | ~130s | ~62 GB | -| 20s | 481 | 768x512 | 30 | 4.0 | ~420s | ~62 GB | diff --git a/recipes/Qwen/Qwen3-Omni.md b/recipes/Qwen/Qwen3-Omni.md deleted file mode 100644 index f78e4dda2aa..00000000000 --- a/recipes/Qwen/Qwen3-Omni.md +++ /dev/null @@ -1,99 +0,0 @@ -# Qwen3-Omni for multimodal chat on 1x A100 80GB - -## Summary - -- Vendor: Qwen -- Model: `Qwen/Qwen3-Omni-30B-A3B-Instruct` -- Task: Multimodal chat with text, image, audio, or video input -- Mode: Online serving with the OpenAI-compatible API -- Maintainer: Community - -## When to use this recipe - -Use this recipe when you want a known-good starting point for serving -`Qwen/Qwen3-Omni-30B-A3B-Instruct` with vLLM-Omni on a single 80 GB A100 and -validate the deployment with the existing multimodal client examples in this -repository. 
- -## References - -- Upstream or canonical docs: - [`docs/user_guide/examples/online_serving/qwen3_omni.md`](../../docs/user_guide/examples/online_serving/qwen3_omni.md) -- Related example under `examples/`: - [`examples/online_serving/qwen3_omni/README.md`](../../examples/online_serving/qwen3_omni/README.md) -- Related issue or discussion: - [RFC: add recipes folder](https://github.com/vllm-project/vllm-omni/issues/2645) - -## Hardware Support - -This recipe currently documents one tested-style reference configuration for -CUDA GPU serving. Add more sections for other hardware as community validation -lands. - -## GPU - -### 1x A100 80GB - -#### Environment - -- OS: Linux -- Python: 3.10+ -- Driver / runtime: NVIDIA CUDA environment with an A100 80 GB GPU -- vLLM version: Match the repository requirements for your checkout -- vLLM-Omni version or commit: Use the commit you are deploying from - -#### Command - -Start the server from the repository root: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 -``` - -Async chunking is enabled by default in the bundled deployment config. 
For -common runtime tuning, prefer CLI overrides instead of editing or passing a -custom YAML file: - -```bash -# Disable async chunking for /v1/realtime sessions -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --no-async-chunk -``` - -Use a custom deploy config only for advanced cases such as custom topology, -connector wiring, or a larger overlay of stage defaults: - -```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ - --deploy-config /path/to/your_qwen3_omni_overrides.yaml -``` - -#### Verification - -Run one of the existing example clients after the server is ready: - -```bash -python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py \ - --model Qwen/Qwen3-Omni-30B-A3B-Instruct \ - --query-type use_image \ - --port 8091 \ - --host localhost -``` - -For a quick API smoke test, request text-only output: - -```bash -curl http://localhost:8091/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "messages": [{"role": "user", "content": "Describe vLLM in brief."}], - "modalities": ["text"] - }' -``` - -#### Notes - -- Memory usage: Size depends on runtime options and output modalities; leave headroom for multimodal workloads. Prefer CLI overrides such as `--gpu-memory-utilization` for routine tuning. -- Key flags: `--omni` is required; async chunking is enabled by default; use `--no-async-chunk` for realtime sessions and `--deploy-config` only for advanced custom deployments. -- Known limitations: The `/v1/realtime` WebSocket flow is currently unsupported while async chunking is enabled. This starter recipe is intentionally narrow and focuses on the single-GPU online-serving path already documented in the repo examples. 
diff --git a/recipes/README.md b/recipes/README.md deleted file mode 100644 index 69ce4d7504d..00000000000 --- a/recipes/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Community Recipes - -This directory contains community-maintained recipes for answering a -practical user question: - -> How do I run model X on hardware Y for task Z? - -Add recipes for this repository under this in-repo `recipes/` directory. To -keep naming and layout consistent, organize recipes by model vendor in a way -that is aligned with -[`vllm-project/recipes`](https://github.com/vllm-project/recipes), but treat -that external repository as a reference for structure rather than the place to -add files for this repo. Use one Markdown file per model family by default. - -Example layout: - -```text -recipes/ - Qwen/ - Qwen3-Omni.md - Qwen3-TTS.md - Tencent-Hunyuan/ - HunyuanVideo.md -``` - -## Available Recipes - -- [`Qwen/Qwen3-Omni.md`](./Qwen/Qwen3-Omni.md): online serving recipe for - multimodal chat on `1x A100 80GB` -- [`Wan-AI/Wan2.2-I2V.md`](./Wan-AI/Wan2.2-I2V.md): image-to-video serving - recipe for Wan2.2 14B on `8x Ascend NPU (A2/A3)` -- [`inclusionAI/Ming-flash-omni-2.0.md`](./inclusionAI/Ming-flash-omni-2.0.md): - online serving recipe for multimodal chat (`4x H100 80GB`) and standalone TTS (`1x H100 80GB`) - -Within a single recipe file, include different hardware support sections such -as `GPU`, `ROCm`, and `NPU`, and add concrete tested configurations like -`1x A100 80GB` or `2x L40S` inside those sections when applicable. - -See [TEMPLATE.md](./TEMPLATE.md) for the recommended format. diff --git a/recipes/TEMPLATE.md b/recipes/TEMPLATE.md deleted file mode 100644 index 9bf8cb9c759..00000000000 --- a/recipes/TEMPLATE.md +++ /dev/null @@ -1,82 +0,0 @@ -# Recipe Title - -> Example: Qwen3-Omni for speech chat on 1x A100 80GB - -## Summary - -- Vendor: -- Model: -- Task: -- Mode: -- Maintainer: - -## When to use this recipe - -Briefly describe the concrete scenario this recipe covers. 
- -## References - -- Upstream or canonical docs: -- Related example under `examples/`: -- Related issue or discussion: - -## Hardware Support - -Add one section per platform, such as `GPU`, `ROCm`, or `NPU`. Under each -platform section, document one or more tested hardware configurations. - -## GPU - -### 1x A100 80GB - -#### Environment - -- OS: -- Python: -- Driver / runtime: -- vLLM version: -- vLLM-Omni version or commit: - -#### Command - -```bash -# Add the exact command(s) here -``` - -#### Verification - -```bash -# Add a quick validation command or expected output here -``` - -#### Notes - -- Memory usage: -- Key flags: -- Known limitations: - -### 2x L40S - -Repeat the same structure for other hardware setups as needed. - -## ROCm - -### Example hardware configuration - -Repeat the same nested structure for ROCm setups as needed: - -- `#### Environment` -- `#### Command` -- `#### Verification` -- `#### Notes` - -## NPU - -### Example hardware configuration - -Repeat the same nested structure for NPU setups as needed: - -- `#### Environment` -- `#### Command` -- `#### Verification` -- `#### Notes` diff --git a/recipes/Wan-AI/Wan2.2-I2V.md b/recipes/Wan-AI/Wan2.2-I2V.md deleted file mode 100644 index 99ceac3cebe..00000000000 --- a/recipes/Wan-AI/Wan2.2-I2V.md +++ /dev/null @@ -1,136 +0,0 @@ -# Wan2.2 Image To Video - -## Summary - -- Vendor: Wan-AI -- Model: `Wan-AI/Wan2.2-I2V-A14B-Diffusers` -- Task: Image-to-video generation -- Mode: Online serving with the OpenAI-compatible API -- Maintainer: Community - -## When to use this recipe - -Use this recipe when you want to deploy the Wan2.2 14B image-to-video model -with vLLM-Omni using multi-card parallelism. Two configurations are provided: - -1. **Distilled model (no negative-prompt / CFG computation)** — higher - throughput, recommended when using a distilled checkpoint that does not - require classifier-free guidance. -2. 
**Official open-source model (with CFG)** — uses `--cfg 2` to run negative - and positive samples in parallel for the original released weights. - -## References - -- Upstream model card: - -## Hardware Support - -## NPU - -### 8x Ascend A2 / A3 - -#### Environment - -- OS: Linux -- Python: 3.10+ -- Driver / runtime: Ascend NPU driver with CANN toolkit -- Recommended operator library: **mindie-sd** (Ascend high-performance fused - operators — enables `adalayernorm` and other fused kernels automatically upon - installation) -- vLLM version: Match the repository requirements for your checkout -- vLLM-Omni version or commit: Use the commit you are deploying from - -#### Prerequisites - -Install the **mindie-sd** operator library to enable Ascend-optimized fused -operators (`adalayernorm`, etc.): - -```bash -git clone https://gitcode.com/Ascend/MindIE-SD.git && cd MindIE-SD - -# Comment out the tik_ops build step (not needed for this use case) -sed -i 's|^\(\s*\)source ${current_script_dir}/build_tik_ops.sh|\1# source ${current_script_dir}/build_tik_ops.sh|' build/build_ops.sh - -python setup.py bdist_wheel -cd dist -pip install mindiesd-*.whl -``` - -After installation, enable the Laser Attention kernel for significant -long-sequence speedups (up to ~40% at 720p in tested workloads): - -```bash -export MINDIE_SD_FA_TYPE=ascend_laser_attention -``` - -When using HSDP with FSDP2, set the following environment variable to work -around a PyTorch NPU multi-stream memory reuse issue -([pytorch/pytorch#147168](https://github.com/pytorch/pytorch/issues/147168)). 
-This issue has been fixed on CUDA but still applies to NPU: - -```bash -export MULTI_STREAM_MEMORY_REUSE=2 -``` - -#### Command - -**Distilled model (no CFG, recommended for distilled checkpoints):** - -```bash -export MINDIE_SD_FA_TYPE=ascend_laser_attention -export MULTI_STREAM_MEMORY_REUSE=2 - -vllm serve \ - --omni Wan-AI/Wan2.2-I2V-A14B-Diffusers \ - --use-hsdp \ - --usp 8 \ - --vae-patch-parallel-size 8 \ - --vae-use-tiling -``` - -**Official open-source model (with CFG):** - -```bash -export MINDIE_SD_FA_TYPE=ascend_laser_attention -export MULTI_STREAM_MEMORY_REUSE=2 - -vllm serve \ - --omni Wan-AI/Wan2.2-I2V-A14B-Diffusers \ - --use-hsdp \ - --usp 4 \ - --cfg 2 \ - --vae-patch-parallel-size 8 \ - --vae-use-tiling -``` - -> **Why the difference?** With `--cfg 2`, two copies of the input (positive and -> negative prompts) are processed in parallel, effectively doubling the compute -> for the DiT backbone. USP is therefore halved from 8 to 4 so that the total -> parallelism across the 8 cards remains balanced (`usp * cfg = 8`). - -#### Verification - -After the server is ready, see -[`examples/online_serving/image_to_video/README.md`](../../examples/online_serving/image_to_video/README.md) -for complete client examples and request formats. - -#### Notes - -- **Key flags:** - - `--omni` — enables vLLM-Omni diffusion serving. - - `--use-hsdp` — enables Hybrid Sharded Data Parallelism for the DiT model - weights. - - `--usp ` — Unified Sequence Parallelism degree. - - `--cfg ` — Classifier-Free Guidance parallelism; set to 2 for models - that require negative-prompt computation, omit for distilled models. - - `--vae-patch-parallel-size 8` — parallelizes VAE decoding across all 8 - cards. - - `--vae-use-tiling` — enables tiled VAE decoding to reduce peak memory. 
-- **Performance tips:** - - Installing mindie-sd and enabling Laser Attention - (`MINDIE_SD_FA_TYPE=ascend_laser_attention`) provides up to ~40% - performance improvement at 720p resolution due to long-sequence attention - optimization. -- **Known limitations:** - - `MULTI_STREAM_MEMORY_REUSE=2` is required on NPU when using HSDP/FSDP2 - due to a multi-stream memory reuse bug. This is not needed on CUDA. diff --git a/recipes/inclusionAI/Ming-flash-omni-2.0.md b/recipes/inclusionAI/Ming-flash-omni-2.0.md deleted file mode 100644 index 873158c8adc..00000000000 --- a/recipes/inclusionAI/Ming-flash-omni-2.0.md +++ /dev/null @@ -1,210 +0,0 @@ -# Ming-flash-omni 2.0 for omni-speech chat and standalone TTS - -## Summary - -- Vendor: inclusionAI -- Model: `Jonathan1909/Ming-flash-omni-2.0` -- Task: Multimodal chat with text, image, audio, or video input; standalone text-to-speech (TTS); -and image generation -- Mode: Online serving with the OpenAI-compatible API -- Maintainer: Community - -## When to use this recipe - -Use this recipe when you want a known-good starting point for serving -`Jonathan1909/Ming-flash-omni-2.0` with vLLM-Omni in one of three modes: - -- **Thinker only** — multimodal understanding with text output. -- **Thinker + Talker (omni-speech)** — multimodal understanding with text and spoken output. -- **Talker only (TTS)** — standalone text-to-speech via the OpenAI `/v1/audio/speech` endpoint. - -## References - -- Upstream model: - [`inclusionAI/Ming`](https://github.com/inclusionAI/Ming) -- For offline inference and additional client variants, see - `examples/offline_inference/ming_flash_omni{,_tts}/` and - `examples/online_serving/ming_flash_omni{,_tts}/`. - - -## Hardware Support - -This recipe documents reference GPU configurations for the two-stage -omni-speech deployment and the standalone TTS deployment. -Other hardware and configurations are welcome as community validation lands. 
- -## GPU - -### 4x H100 80GB — omni-speech/chat (thinker + talker) - -The bundled `ming_flash_omni.yaml` runs the thinker with tensor parallel size -4 on GPUs 0–3 and the talker on GPU 3. -Adjust `devices` in the YAML to match your hardware. - -#### Environment - -- OS: Linux -- Python: 3.10+ -- CUDA Driver Version: 590.48.01 -- CUDA 12.5 -- vLLM version: 0.19.0 -- vLLM-Omni version or commit: 0.19.0rc1 - -#### Command - -Thinker only (text output): - -```bash -vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 -``` - -Thinker + talker (text and/or audio output): - -```bash -vllm serve Jonathan1909/Ming-flash-omni-2.0 \ - --omni \ - --port 8091 \ - --stage-configs-path vllm_omni/model_executor/stage_configs/ming_flash_omni.yaml \ - --log-stats -``` - -`--log-stats` is optional but recommended while validating the deployment. - -#### Verification - -Text output from a multimodal (image) input: - -```bash -curl http://localhost:8091/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Jonathan1909/Ming-flash-omni-2.0", - "messages": [ - {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, - {"role": "user", "content": [ - {"type": "image_url", "image_url": {"url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg"}}, - {"type": "text", "text": "Describe this image in detail."} - ]} - ], - "modalities": ["text"] - }' -``` - -Spoken response from a text query (save the WAV bytes): - -```bash -curl http://localhost:8091/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Jonathan1909/Ming-flash-omni-2.0", - "messages": [ - {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, - {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"} - ], - "modalities": ["audio"] - }' | jq -r '.choices[0].message.audio.data' | base64 -d > ming_omni_parrot.wav -``` - -Text + audio output from 
an audio input (swap `audio_url` for `video_url` -or `image_url` to exercise the other multimodal input paths): - -```bash -curl http://localhost:8091/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Jonathan1909/Ming-flash-omni-2.0", - "messages": [ - {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, - {"role": "user", "content": [ - {"type": "audio_url", "audio_url": {"url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/mary_had_lamb.ogg"}}, - {"type": "text", "text": "Please recognize the language of this speech and transcribe it. Format: oral."} - ]} - ], - "modalities": ["text", "audio"] - }' | jq -r '.choices[0].message.content' -``` - -Streaming text output via SSE (set `"stream": true`): - -```bash -curl -N http://localhost:8091/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Jonathan1909/Ming-flash-omni-2.0", - "messages": [ - {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, - {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"} - ], - "modalities": ["text"], - "stream": true - }' -``` - -Each SSE event carries a `data:` line with a chat-completion chunk; text -deltas appear at `choices[0].delta.content`. - -#### Notes - -- Output modality is selected by the request body: `"modalities": ["text"]`, - `["audio"]`, or `["text", "audio"]`. The two-stage omni-speech server must be launched - for any request containing `audio`. -- Reasoning mode: flip the system prompt suffix from `detailed thinking off` - to `detailed thinking on` in any request above. -- Memory usage: size depends on output modalities and multimodal input; leave - headroom for video frames and audio caches. - -### 1x H100 80GB — standalone TTS (talker only) - -The bundled `ming_flash_omni_tts.yaml` runs the talker on a single GPU and exposes the OpenAI `/v1/audio/speech` endpoint. 
- -#### Environment - -- OS: Linux -- Python: 3.10+ -- CUDA Driver Version: 590.48.01 -- CUDA 12.5 -- vLLM version: 0.19.0 -- vLLM-Omni version or commit: 0.19.0rc1 - -#### Command - -```bash -vllm serve Jonathan1909/Ming-flash-omni-2.0 \ - --omni \ - --stage-configs-path vllm_omni/model_executor/stage_configs/ming_flash_omni_tts.yaml \ - --port 8091 \ - --log-stats -``` - -`--log-stats` is optional but recommended while validating the deployment. - -#### Verification - -Basic curl: - -```bash -curl -X POST http://localhost:8091/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Jonathan1909/Ming-flash-omni-2.0", - "input": "我会一直在这里陪着你。", - "response_format": "wav" - }' --output ming_online.wav -``` - -Speaker selection (e.g. `lingguang`): - -```bash -curl -X POST http://localhost:8091/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Jonathan1909/Ming-flash-omni-2.0", - "input": "春天来了,万物复苏,大地一片生机盎然。田野里的油菜花开得金灿灿的,蜜蜂在花丛中忙碌地采蜜。远处的山坡上,桃花和杏花竞相绽放,粉的白的交织在一起,美不胜收。清晨的微风带着泥土的芬芳,轻轻拂过脸颊,让人感到无比惬意。孩子们在田间小路上追逐嬉戏,老人们坐在门前晒太阳,享受着这份宁静与美好。", - "speaker": "lingguang", - "response_format": "wav" - }' --output ming_online_lingguang.wav -``` - -#### Notes - -- The OpenAI `instructions` field is forwarded to the talker as the caption JSON — pass a raw string for `风格` (style) only, or a JSON-encoded object for multiple entries such as `方言` (dialect) and `情感` (emotion). 
diff --git a/requirements/common.txt b/requirements/common.txt index 63e16d580ff..138a61ed222 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,6 +1,7 @@ # Common dependencies for all platforms -av>=14.0.0 omegaconf>=2.3.0 +librosa>=0.11.0 +resampy>=0.4.3 diffusers>=0.36.0 accelerate==1.12.0 soundfile>=0.13.1 @@ -9,6 +10,7 @@ tqdm>=4.66.0 torchsde>=0.2.6 openai-whisper>=20250625 imageio[ffmpeg]>=2.37.2 +sox>=1.5.0 x-transformers>=2.12.2 einops>=0.8.1 prettytable>=3.8.0 diff --git a/requirements/musa.txt b/requirements/musa.txt index c100c70cf05..112f3260465 100644 --- a/requirements/musa.txt +++ b/requirements/musa.txt @@ -1,6 +1,4 @@ -r common.txt # MUSA platform dependencies -torchada>=0.1.50 +torchada>=0.1.46 onnxruntime>=1.23.2 -mate>=0.2.0 -flash_attn_3>=0.1.4 diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py deleted file mode 100644 index 7af6c3f8cb8..00000000000 --- a/tests/benchmarks/conftest.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""conftest.py for benchmarks unit tests. - -Installs lightweight mock stubs for ``vllm`` (and sub-packages) so the -data-module unit tests can run without a full vLLM installation. Only the -symbols actually imported by -``vllm_omni.benchmarks.data_modules.seed_tts_dataset`` are emulated. -""" - -from __future__ import annotations - -import sys -import types -from dataclasses import dataclass -from typing import Any - - -def _install_vllm_stubs() -> None: - """Register minimal vllm stubs in sys.modules. - - Only installs when real vllm is unavailable. We actively probe the - import because an empty or partial vllm may not yet have imported - the submodules we rely on, and unconditionally registering stubs - would shadow the real package for sibling tests (e.g. - ``tests/benchmarks/metrics/test_metrics.py`` needs the real - ``vllm.benchmarks.serve``). 
- """ - try: - import vllm.benchmarks.datasets # noqa: F401 - import vllm.tokenizers # noqa: F401 - except ImportError: - pass - else: - return # real vllm available — do not shadow it - if "vllm.benchmarks.datasets" in sys.modules: - return - - # ------------------------------------------------------------------ # - # vllm.benchmarks.datasets # - # ------------------------------------------------------------------ # - @dataclass - class SampleRequest: - prompt: str = "" - prompt_len: int = 0 - expected_output_len: int = 0 - multi_modal_data: Any = None - request_id: str = "" - - class BenchmarkDataset: - def __init__( - self, - dataset_path: str = "", - random_seed: int = 0, - disable_shuffle: bool = False, - **kwargs: Any, - ) -> None: - self.dataset_path = dataset_path - self.random_seed = random_seed - self.disable_shuffle = disable_shuffle - - def maybe_oversample_requests( - self, - out: list, - num_requests: int, - request_id_prefix: str, - no_oversample: bool, - ) -> None: - pass - - # ------------------------------------------------------------------ # - # vllm.tokenizers / vllm.tokenizers.hf # - # ------------------------------------------------------------------ # - class TokenizerLike: - pass - - def get_cached_tokenizer(t: Any) -> Any: - return t - - # ------------------------------------------------------------------ # - # Wire up sys.modules # - # ------------------------------------------------------------------ # - vllm_mod = types.ModuleType("vllm") - vllm_benchmarks = types.ModuleType("vllm.benchmarks") - vllm_benchmarks_datasets = types.ModuleType("vllm.benchmarks.datasets") - vllm_tokenizers = types.ModuleType("vllm.tokenizers") - vllm_tokenizers_hf = types.ModuleType("vllm.tokenizers.hf") - - vllm_benchmarks_datasets.BenchmarkDataset = BenchmarkDataset # type: ignore[attr-defined] - vllm_benchmarks_datasets.SampleRequest = SampleRequest # type: ignore[attr-defined] - vllm_tokenizers.TokenizerLike = TokenizerLike # type: ignore[attr-defined] - 
vllm_tokenizers_hf.get_cached_tokenizer = get_cached_tokenizer # type: ignore[attr-defined] - - sys.modules["vllm"] = vllm_mod - sys.modules["vllm.benchmarks"] = vllm_benchmarks - sys.modules["vllm.benchmarks.datasets"] = vllm_benchmarks_datasets - sys.modules["vllm.tokenizers"] = vllm_tokenizers - sys.modules["vllm.tokenizers.hf"] = vllm_tokenizers_hf - - -# Install stubs immediately at collection time (before any test import). -_install_vllm_stubs() diff --git a/tests/benchmarks/metrics/test_metrics.py b/tests/benchmarks/metrics/test_metrics.py deleted file mode 100644 index f531a5026a3..00000000000 --- a/tests/benchmarks/metrics/test_metrics.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -Unit tests for metrics.py -""" - -import pytest -from vllm.benchmarks.serve import TaskType - -from vllm_omni.benchmarks.metrics.metrics import calculate_metrics -from vllm_omni.benchmarks.patch.patch import MixRequestFuncOutput - -pytestmark = [pytest.mark.core_model, pytest.mark.benchmark, pytest.mark.cpu] - - -def _make_output(prompt_len: int, output_tokens: int = 10) -> MixRequestFuncOutput: - """Build a minimal successful MixRequestFuncOutput for metrics aggregation.""" - output = MixRequestFuncOutput() - output.success = True - output.prompt_len = prompt_len - output.output_tokens = output_tokens - output.generated_text = "x" * output_tokens - output.ttft = 0.1 - output.text_latency = 1.0 - output.latency = 1.0 - output.start_time = 0.0 - output.itl = [0.1] * max(output_tokens - 1, 0) - output.audio_ttfp = 0.0 - output.audio_rtf = 0.0 - output.audio_duration = 0.0 - output.audio_frames = 0 - output.input_audio_duration = 0.0 - output.error = "" - return output - - -# ============================================================================ -# total_input Tests -# ============================================================================ - - -def 
test_total_input_aggregated_from_output_prompt_len(): - """Test that total_input sums outputs[i].prompt_len, not input_requests[i].prompt_len.""" - outputs = [_make_output(4992), _make_output(3000)] - - metrics, _ = calculate_metrics( - input_requests=[], - outputs=outputs, - dur_s=10.0, - tokenizer=None, - selected_percentiles=[99.0], - goodput_config_dict={}, - task_type=TaskType.GENERATION, - selected_percentile_metrics=[], - max_concurrency=None, - request_rate=float("inf"), - benchmark_duration=10.0, - ) - - assert metrics.total_input == 7992, ( - "total_input should aggregate from outputs[i].prompt_len to reflect the true multimodal input token count" - ) - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) diff --git a/tests/benchmarks/patch/test_patch.py b/tests/benchmarks/patch/test_patch.py index 35a18aea33c..39b7f84fb49 100644 --- a/tests/benchmarks/patch/test_patch.py +++ b/tests/benchmarks/patch/test_patch.py @@ -574,59 +574,5 @@ async def test_text_latency_value_consistency(self, mocker: MockerFixture): ) -# ============================================================================ -# prompt_len Tests -# ============================================================================ - - -@pytest.mark.asyncio -async def test_prompt_len_assigned_from_usage(mocker: MockerFixture): - # Arrange: request claims prompt_len=100, but server reports 4992 (multimodal). - request_input = RequestFuncInput( - model="test-model", - model_name="test-model", - prompt="test prompt", - api_url="http://test.com/v1/chat/completions", - prompt_len=100, - output_len=20, - ) - - chunks = [ - create_sse_chunk( - { - "choices": [{"delta": {"content": "Hello"}}], - "modality": "text", - } - ), - create_sse_chunk( - { - "choices": [{"delta": {"content": " world"}}], - "modality": "text", - } - ), - # Final usage chunk emitted because stream_options.include_usage=True. 
- create_sse_chunk( - { - "choices": [], - "usage": {"prompt_tokens": 4992, "completion_tokens": 2, "total_tokens": 4994}, - } - ), - b"data: [DONE]\n\n", - ] - - mock_response = MockResponse(200, chunks) - mock_session = mocker.AsyncMock() - mock_session.post = mocker.MagicMock(return_value=mock_response) - - # Act - output = await async_request_openai_chat_omni_completions(request_input, mock_session) - - # Assert - assert output.success is True - assert output.prompt_len == 4992, ( - "prompt_len should be overridden by usage.prompt_tokens to reflect the true multimodal input token count" - ) - - if __name__ == "__main__": pytest.main([__file__, "-v", "-s"]) diff --git a/tests/benchmarks/test_accuracy_bench_utils.py b/tests/benchmarks/test_accuracy_bench_utils.py index 6ceebb11b79..a0479fb1bad 100644 --- a/tests/benchmarks/test_accuracy_bench_utils.py +++ b/tests/benchmarks/test_accuracy_bench_utils.py @@ -1,17 +1,11 @@ # ruff: noqa: E402, I001 -import argparse import math -import os import sys -import types from pathlib import Path import pytest from PIL import Image -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] - - REPO_ROOT = Path(__file__).resolve().parents[2] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) @@ -42,64 +36,8 @@ summarize_generated_records as summarize_gebench_generated_records, summarize_gebench_results, ) -from tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core import seed_tts_bench_argv -from tests.e2e.accuracy.qwen3_omni.run_qwen_omni_acc_benchmark import sync_dataset_env_from_ns -from vllm_omni.benchmarks.data_modules.seed_tts_dataset import resolve_seed_tts_root - - -def test_seed_tts_bench_argv_preserves_hf_repo_id_from_env(monkeypatch): - monkeypatch.setenv("VLLM_SEED_TTS_DATASET_PATH", "zhaochenyang20/seed-tts-eval") - monkeypatch.delenv("VLLM_SEED_TTS_REPO", raising=False) - - argv = seed_tts_bench_argv(locale="en") - - dataset_idx = argv.index("--dataset-path") - assert 
argv[dataset_idx + 1] == "zhaochenyang20/seed-tts-eval" - - -def test_sync_dataset_env_preserves_seed_tts_hf_repo_id(monkeypatch): - ns = argparse.Namespace( - daily_omni_repo=None, - daily_omni_qa_json=None, - daily_omni_video_dir=None, - seed_tts_dataset_path="zhaochenyang20/seed-tts-eval", - seed_tts_root=None, - ) - monkeypatch.delenv("VLLM_SEED_TTS_DATASET_PATH", raising=False) - sync_dataset_env_from_ns(ns) - - assert os.environ["VLLM_SEED_TTS_DATASET_PATH"] == "zhaochenyang20/seed-tts-eval" - - -def test_resolve_seed_tts_root_downloads_only_requested_locale(monkeypatch, tmp_path: Path): - downloaded_root = tmp_path / "seed_tts_cache" - (downloaded_root / "zh" / "prompt-wavs").mkdir(parents=True) - (downloaded_root / "zh" / "meta.lst").write_text("", encoding="utf-8") - captured: dict[str, object] = {} - - def fake_snapshot_download(*, repo_id, repo_type, allow_patterns): - captured["repo_id"] = repo_id - captured["repo_type"] = repo_type - captured["allow_patterns"] = allow_patterns - return str(downloaded_root) - - monkeypatch.setitem( - sys.modules, - "huggingface_hub", - types.SimpleNamespace(snapshot_download=fake_snapshot_download), - ) - - resolved = resolve_seed_tts_root( - "zhaochenyang20/seed-tts-eval", - explicit_root=None, - locale="zh", - ) - - assert resolved == downloaded_root.resolve() - assert captured["repo_id"] == "zhaochenyang20/seed-tts-eval" - assert captured["repo_type"] == "dataset" - assert captured["allow_patterns"] == ["zh/**"] +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] def test_summarize_gebench_generated_records_groups_by_type(): diff --git a/tests/benchmarks/test_bench_tts_cli.py b/tests/benchmarks/test_bench_tts_cli.py deleted file mode 100644 index b8a487f80c6..00000000000 --- a/tests/benchmarks/test_bench_tts_cli.py +++ /dev/null @@ -1,139 +0,0 @@ -"""Tests for the universal benchmarks/tts/bench_tts.py CLI.""" - -from __future__ import annotations - -import json -import sys -from pathlib 
import Path - -import pytest -import yaml - -# Add benchmarks/tts to path for import -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "benchmarks" / "tts")) -import bench_tts - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -@pytest.fixture() -def model_configs_path(tmp_path: Path) -> Path: - cfg = { - "models": { - "test/ModelA": { - "stage_config": "model_a.yaml", - "supported_tasks": ["voice_clone", "default_voice"], - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "task_extra_body": { - "voice_clone": {"task_type": "Base"}, - "default_voice": {"voice": "Vivian", "task_type": "CustomVoice"}, - }, - }, - "test/ModelB": { - "stage_config": "model_b.yaml", - "supported_tasks": ["voice_clone"], - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "task_extra_body": {"voice_clone": {}}, - }, - } - } - p = tmp_path / "model_configs.yaml" - p.write_text(yaml.dump(cfg), encoding="utf-8") - return p - - -def test_load_model_configs(model_configs_path: Path) -> None: - configs = bench_tts.load_model_configs(model_configs_path) - assert "test/ModelA" in configs - assert "test/ModelB" in configs - assert configs["test/ModelA"]["supported_tasks"] == ["voice_clone", "default_voice"] - - -def test_build_bench_args_voice_clone(model_configs_path: Path) -> None: - configs = bench_tts.load_model_configs(model_configs_path) - cmd = bench_tts.build_bench_args( - host="localhost", - port=8000, - model="test/ModelA", - task="voice_clone", - model_cfg=configs["test/ModelA"], - locale="en", - num_prompts=10, - concurrency=1, - dataset_path="/data/seed-tts", - wer_eval=False, - output_dir=None, - result_filename=None, - extra_cli_args=[], - ) - assert "--dataset-name" in cmd - idx = cmd.index("--dataset-name") - assert cmd[idx + 1] == "seed-tts" - assert "--max-concurrency" in cmd - assert "--extra-body" in cmd - extra_body = json.loads(cmd[cmd.index("--extra-body") + 1]) - assert extra_body.get("task_type") == "Base" - 
- -def test_build_bench_args_default_voice_has_voice_param(model_configs_path: Path) -> None: - configs = bench_tts.load_model_configs(model_configs_path) - cmd = bench_tts.build_bench_args( - host="localhost", - port=8000, - model="test/ModelA", - task="default_voice", - model_cfg=configs["test/ModelA"], - locale="en", - num_prompts=10, - concurrency=1, - dataset_path="/data/seed-tts", - wer_eval=False, - output_dir=None, - result_filename=None, - extra_cli_args=[], - ) - idx = cmd.index("--dataset-name") - assert cmd[idx + 1] == "seed-tts-text" - extra_body = json.loads(cmd[cmd.index("--extra-body") + 1]) - assert extra_body.get("voice") == "Vivian" - - -def test_build_bench_args_wer_eval_adds_flag(model_configs_path: Path) -> None: - configs = bench_tts.load_model_configs(model_configs_path) - cmd = bench_tts.build_bench_args( - host="localhost", - port=8000, - model="test/ModelA", - task="voice_clone", - model_cfg=configs["test/ModelA"], - locale="en", - num_prompts=10, - concurrency=1, - dataset_path="/data/seed-tts", - wer_eval=True, - output_dir=None, - result_filename=None, - extra_cli_args=[], - ) - assert "--seed-tts-wer-eval" in cmd - - -def test_unsupported_task_exits(model_configs_path: Path, capsys: pytest.CaptureFixture, mocker) -> None: - # ModelB does not support voice_design - mocker.patch.object( - sys, - "argv", - [ - "bench_tts.py", - "--model", - "test/ModelB", - "--task", - "voice_design", - "--model-configs", - str(model_configs_path), - ], - ) - with pytest.raises(SystemExit): - bench_tts.main() diff --git a/tests/benchmarks/test_diffusion_backends_metrics.py b/tests/benchmarks/test_diffusion_backends_metrics.py deleted file mode 100644 index 2d51d0f1d38..00000000000 --- a/tests/benchmarks/test_diffusion_backends_metrics.py +++ /dev/null @@ -1,107 +0,0 @@ -import pytest - -from benchmarks.diffusion.backends import RequestFuncInput, async_request_chat_completions - - -class _MockResponse: - def __init__(self, payload: dict, status: int = 
200): - self._payload = payload - self.status = status - - async def __aenter__(self): - return self - - async def __aexit__(self, exc_type, exc, tb): - return False - - async def json(self): - return self._payload - - async def text(self): - return str(self._payload) - - -class _MockSession: - def __init__(self, payload: dict): - self._payload = payload - - def post(self, *args, **kwargs): - return _MockResponse(self._payload) - - -@pytest.mark.core_model -@pytest.mark.benchmark -@pytest.mark.cpu -@pytest.mark.asyncio -async def test_chat_completions_metrics_fallback_to_top_level(): - payload = { - "choices": [ - { - "message": { - "content": [ - { - "type": "image_url", - "image_url": {"url": "data:image/png;base64,abc"}, - } - ] - } - } - ], - "metrics": { - "stage_durations": {"diffusion": 1.25}, - "peak_memory_mb": 4096.0, - }, - } - - output = await async_request_chat_completions( - RequestFuncInput( - prompt="draw a cat", - api_url="http://test.local/v1/chat/completions", - model="ByteDance-Seed/BAGEL-7B-MoT", - ), - session=_MockSession(payload), - ) - - assert output.success is True - assert output.stage_durations == {"diffusion": 1.25} - assert output.peak_memory_mb == 4096.0 - - -@pytest.mark.core_model -@pytest.mark.benchmark -@pytest.mark.cpu -@pytest.mark.asyncio -async def test_chat_completions_metrics_message_level_takes_precedence(): - payload = { - "choices": [ - { - "message": { - "content": [ - { - "type": "image_url", - "image_url": {"url": "data:image/png;base64,abc"}, - "stage_durations": {"message_stage": 0.7}, - "peak_memory_mb": 1234.0, - } - ] - } - } - ], - "metrics": { - "stage_durations": {"top_level_stage": 9.9}, - "peak_memory_mb": 9999.0, - }, - } - - output = await async_request_chat_completions( - RequestFuncInput( - prompt="draw a dog", - api_url="http://test.local/v1/chat/completions", - model="ByteDance-Seed/BAGEL-7B-MoT", - ), - session=_MockSession(payload), - ) - - assert output.success is True - assert 
output.stage_durations == {"message_stage": 0.7} - assert output.peak_memory_mb == 1234.0 diff --git a/tests/benchmarks/test_seed_tts_dataset_variants.py b/tests/benchmarks/test_seed_tts_dataset_variants.py deleted file mode 100644 index 7fa5747bdfd..00000000000 --- a/tests/benchmarks/test_seed_tts_dataset_variants.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Tests for SeedTTSTextDataset, SeedTTSTextSampleRequest, SeedTTSDesignDataset, -and SeedTTSDesignSampleRequest. - -vllm stubs are installed by tests/benchmarks/conftest.py before collection. -""" - -from __future__ import annotations - -import importlib.util -import sys -from pathlib import Path - -import pytest - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - -# Load the data module directly (bypasses vllm_omni.__init__ heavy imports). -_REPO_ROOT = Path(__file__).resolve().parents[2] -_MODULE_PATH = _REPO_ROOT / "vllm_omni" / "benchmarks" / "data_modules" / "seed_tts_dataset.py" -_MODULE_NAME = "vllm_omni.benchmarks.data_modules.seed_tts_dataset" - -if _MODULE_NAME not in sys.modules: - _spec = importlib.util.spec_from_file_location(_MODULE_NAME, _MODULE_PATH) - _mod = importlib.util.module_from_spec(_spec) - sys.modules[_MODULE_NAME] = _mod - _spec.loader.exec_module(_mod) - -from vllm_omni.benchmarks.data_modules.seed_tts_dataset import ( # noqa: E402 - SeedTTSDesignDataset, - SeedTTSDesignSampleRequest, - SeedTTSTextDataset, - SeedTTSTextSampleRequest, -) - -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - - -@pytest.fixture() -def seed_tts_root(tmp_path: Path) -> Path: - """Minimal seed-tts-style directory with 5 entries.""" - locale_dir = tmp_path / "en" - locale_dir.mkdir() - wav_dir = locale_dir / "prompt-wavs" - wav_dir.mkdir() - for i in range(5): - (wav_dir / f"utt{i:03d}.wav").write_bytes(b"RIFF\x00\x00\x00\x00WAVE") - meta = "\n".join(f"utt{i:03d}|ref text 
{i}|prompt-wavs/utt{i:03d}.wav|target text {i}" for i in range(5)) - (locale_dir / "meta.lst").write_text(meta, encoding="utf-8") - return tmp_path - - -@pytest.fixture() -def mock_tokenizer(mocker): - tokenizer = mocker.MagicMock() - tokenizer.encode = lambda text, **kw: [0] * len(text.split()) - tokenizer.get_vocab.return_value = {"": 0} - tokenizer.all_special_ids = [] - tokenizer.all_special_tokens = [] - tokenizer.vocab_size = 1 - return tokenizer - - -# --------------------------------------------------------------------------- -# Tests -# --------------------------------------------------------------------------- - - -def test_seed_tts_text_dataset_omits_ref_audio(seed_tts_root, mock_tokenizer): - ds = SeedTTSTextDataset( - dataset_path=str(seed_tts_root), - random_seed=0, - locale="en", - disable_shuffle=True, - ) - requests = ds.sample(mock_tokenizer, num_requests=3) - assert len(requests) == 3 - for req in requests: - assert isinstance(req, SeedTTSTextSampleRequest) - assert req.seed_tts_speech_extra is None or "ref_audio" not in (req.seed_tts_speech_extra or {}) - assert req.seed_tts_ref_wav_path == "" - assert "target text" in req.prompt - - -# --------------------------------------------------------------------------- -# SeedTTSDesignDataset tests -# --------------------------------------------------------------------------- - - -@pytest.fixture() -def seed_tts_design_root(tmp_path: Path) -> Path: - """seed-tts-design directory with 5-field meta.lst entries.""" - locale_dir = tmp_path / "en" - locale_dir.mkdir() - meta = "\n".join( - f"des{i:03d}|||target text {i}|A warm {['female', 'male'][i % 2]} voice with neutral accent." 
for i in range(5) - ) - (locale_dir / "meta.lst").write_text(meta, encoding="utf-8") - return tmp_path - - -def test_seed_tts_design_dataset_has_instructions(seed_tts_design_root, mock_tokenizer): - ds = SeedTTSDesignDataset( - dataset_path=str(seed_tts_design_root), - random_seed=0, - locale="en", - disable_shuffle=True, - ) - requests = ds.sample(mock_tokenizer, num_requests=3) - assert len(requests) == 3 - for req in requests: - assert isinstance(req, SeedTTSDesignSampleRequest) - extra = req.seed_tts_speech_extra or {} - assert "instructions" in extra - assert extra["instructions"], "instructions must be non-empty" - assert extra.get("task_type") == "VoiceDesign" - assert "ref_audio" not in extra - assert req.seed_tts_ref_wav_path == "" - - -def test_seed_tts_design_dataset_rejects_missing_description(seed_tts_design_root, mock_tokenizer): - """Lines without a voice_description should be skipped.""" - locale_dir = seed_tts_design_root / "en" - # The bad line has 4 fields, not 5, so will be filtered - meta = "bad|||target text without description\n" + "\n".join( - f"ok|||target text {i}|A clear female voice." 
for i in range(9) - ) - (locale_dir / "meta.lst").write_text(meta, encoding="utf-8") - ds = SeedTTSDesignDataset( - dataset_path=str(seed_tts_design_root), - random_seed=0, - locale="en", - disable_shuffle=True, - ) - requests = ds.sample(mock_tokenizer, num_requests=10, no_oversample=True) - assert len(requests) == 9 # since we filter the bad row out and don't oversample - for req in requests: - assert isinstance(req, SeedTTSDesignSampleRequest) - assert req.seed_tts_utterance_id == "ok" - - -def test_attach_sets_seed_tts_row_even_without_extra_body(): - """seed_tts_row=True must be set for SeedTTSTextSampleRequest (no extra body).""" - from vllm_omni.benchmarks.data_modules.seed_tts_dataset import SeedTTSTextSampleRequest - - req = SeedTTSTextSampleRequest( - prompt="hello world", - prompt_len=2, - expected_output_len=100, - multi_modal_data=None, - request_id="test-0", - seed_tts_speech_extra=None, - seed_tts_ref_wav_path="", - ) - assert req.seed_tts_speech_extra is None - assert req.seed_tts_ref_wav_path == "" - # The fix ensures that even with speech_extra=None, the function - # sets seed_tts_row=True. We verify the source code has the fix. 
- import inspect - - import vllm_omni.benchmarks.patch.patch as patch_mod - - src = inspect.getsource(patch_mod._attach_seed_tts_to_request_func_input) - # seed_tts_row must be set BEFORE the 'if not ex: return' check - row_pos = src.index("seed_tts_row") - not_ex_pos = src.index("if not ex:") - assert row_pos < not_ex_pos, "seed_tts_row must be set before 'if not ex: return'" diff --git a/tests/comfyui/conftest.py b/tests/comfyui/conftest.py index 4280d3506ff..0b4565e9465 100644 --- a/tests/comfyui/conftest.py +++ b/tests/comfyui/conftest.py @@ -9,8 +9,8 @@ import os import sys -from types import ModuleType, SimpleNamespace from typing import BinaryIO, TypedDict +from unittest.mock import MagicMock def pytest_configure(config): @@ -58,15 +58,15 @@ def save_to(self, file: str | BinaryIO): else: file.write(self._data) - mock_comfy_api = ModuleType("comfy_api") - mock_comfy_api_input = ModuleType("comfy_api.input") + mock_comfy_api = MagicMock() + mock_comfy_api_input = MagicMock() mock_comfy_api_input.AudioInput = AudioInput mock_comfy_api_input.VideoInput = VideoInput mock_comfy_api.input = mock_comfy_api_input - mock_comfy_api_latest = ModuleType("comfy_api.latest") - mock_comfy_api_latest.Types = SimpleNamespace(VideoComponents=lambda **kwargs: kwargs) - mock_comfy_api_latest.InputImpl = SimpleNamespace( - VideoFromComponents=lambda _: VideoInput(b"mock_video_from_components") + mock_comfy_api_latest = MagicMock() + mock_comfy_api_latest.Types.VideoComponents = MagicMock(side_effect=lambda **kwargs: kwargs) + mock_comfy_api_latest.InputImpl.VideoFromComponents = MagicMock( + side_effect=lambda _: VideoInput(b"mock_video_from_components") ) mock_comfy_api.latest = mock_comfy_api_latest @@ -76,8 +76,8 @@ def mock_load(_: str | BinaryIO): sample_rate = 24000 return waveform, sample_rate - mock_comfy_extras = ModuleType("comfy_extras") - mock_nodes_audio = ModuleType("comfy_extras.nodes_audio") + mock_comfy_extras = MagicMock() + mock_nodes_audio = MagicMock() 
mock_nodes_audio.load = mock_load mock_comfy_extras.nodes_audio = mock_nodes_audio diff --git a/tests/comfyui/test_comfyui_integration.py b/tests/comfyui/test_comfyui_integration.py index 80e86d82412..f6ce82f9b28 100644 --- a/tests/comfyui/test_comfyui_integration.py +++ b/tests/comfyui/test_comfyui_integration.py @@ -13,6 +13,7 @@ from enum import StrEnum, auto from types import SimpleNamespace from typing import Any, NamedTuple +from unittest.mock import AsyncMock, MagicMock, patch import pytest import requests @@ -27,7 +28,6 @@ ) from comfyui_vllm_omni.utils.types import AutoregressionSamplingParams, DiffusionSamplingParams, WanModelSpecificParams from PIL import Image -from pytest_mock import MockerFixture from vllm import SamplingParams from vllm.outputs import CompletionOutput, RequestOutput from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -217,10 +217,9 @@ def _build_diffusion_video_output() -> OmniRequestOutput: def _build_diffusion_image_output_for_chat_endpoint() -> OmniRequestOutput: - request_output = SimpleNamespace( - images=[_build_image_output(color="blue")], - finished=True, - ) + request_output = MagicMock() + request_output.images = [_build_image_output(color="blue")] + request_output.finished = True return OmniRequestOutput( request_id="test_req_img_chat", finished=True, @@ -390,55 +389,51 @@ def sampling_case(request) -> SamplingCase: @pytest.fixture -def mock_async_omni( - server_case: ServerCase, - sampling_case: SamplingCase, - monkeypatch: pytest.MonkeyPatch, - mocker: MockerFixture, -): +def mock_async_omni(server_case: ServerCase, sampling_case: SamplingCase): async def _mock_preprocess_chat(self, *args, **kwargs): return ([{"role": "user", "content": "test"}], [{"prompt": "test prompt"}]) # Need to mock AsyncOmni itself (not only its generate method) because # 1. The API layer uses its stage_list and stage_configs attributes # 2. Its __init__ method has slow side effects (model & config loading). 
- mock_async_omni_cls = mocker.patch("vllm_omni.entrypoints.openai.api_server.AsyncOmni") - monkeypatch.setattr( - "vllm_omni.entrypoints.openai.serving_chat.OmniOpenAIServingChat._preprocess_chat", - _mock_preprocess_chat, - ) - - mock_instance = mocker.AsyncMock(spec=RealAsyncOmni) - mock_instance.generate = _build_mock_outputs(server_case.outputs, sampling_case, server_case) - - mock_instance.stage_list = server_case.stage_list - mock_instance.stage_configs = server_case.stage_configs - mock_instance.output_modalities = _build_output_modalities(server_case.stage_configs) - mock_instance.default_sampling_params_list = [ - SamplingParams() if _stage_type(stage) != "diffusion" else mocker.MagicMock() - for stage in server_case.stage_configs - ] - mock_instance.errored = False - mock_instance.dead_error = RuntimeError("Mock engine error") - mock_instance.model_config = mocker.MagicMock( - max_model_len=4096, - io_processor_plugin=None, - allowed_local_media_path=None, - allowed_media_domains=None, - ) - # Mimic Qwen3-TTS talker speaker config so CustomVoice validation passes. 
- mock_instance.model_config.hf_config = mocker.MagicMock() - mock_instance.model_config.hf_config.talker_config = mocker.MagicMock() - mock_instance.model_config.hf_config.talker_config.speaker_id = {"Vivian": 0} - mock_instance.io_processor = mocker.MagicMock() - mock_instance.input_processor = mocker.MagicMock() - mock_instance.shutdown = mocker.MagicMock() - mock_instance.get_vllm_config = mocker.AsyncMock(return_value=None) - mock_instance.get_supported_tasks = mocker.AsyncMock(return_value=["generate"]) - mock_instance.get_tokenizer = mocker.AsyncMock(return_value=None) + with ( + patch("vllm_omni.entrypoints.openai.api_server.AsyncOmni") as MockAsyncOmni, + patch( + "vllm_omni.entrypoints.openai.serving_chat.OmniOpenAIServingChat._preprocess_chat", + new=_mock_preprocess_chat, + ), + ): + mock_instance = AsyncMock(spec=RealAsyncOmni) + mock_instance.generate = _build_mock_outputs(server_case.outputs, sampling_case, server_case) + + mock_instance.stage_list = server_case.stage_list + mock_instance.stage_configs = server_case.stage_configs + mock_instance.output_modalities = _build_output_modalities(server_case.stage_configs) + mock_instance.default_sampling_params_list = [ + SamplingParams() if _stage_type(stage) != "diffusion" else MagicMock() + for stage in server_case.stage_configs + ] + mock_instance.errored = False + mock_instance.dead_error = RuntimeError("Mock engine error") + mock_instance.model_config = MagicMock( + max_model_len=4096, + io_processor_plugin=None, + allowed_local_media_path=None, + allowed_media_domains=None, + ) + # Mimic Qwen3-TTS talker speaker config so CustomVoice validation passes. 
+ mock_instance.model_config.hf_config = MagicMock() + mock_instance.model_config.hf_config.talker_config = MagicMock() + mock_instance.model_config.hf_config.talker_config.speaker_id = {"Vivian": 0} + mock_instance.io_processor = MagicMock() + mock_instance.input_processor = MagicMock() + mock_instance.shutdown = MagicMock() + mock_instance.get_vllm_config = AsyncMock(return_value=None) + mock_instance.get_supported_tasks = AsyncMock(return_value=["generate"]) + mock_instance.get_tokenizer = AsyncMock(return_value=None) - mock_async_omni_cls.return_value = mock_instance - yield mock_async_omni_cls + MockAsyncOmni.return_value = mock_instance + yield MockAsyncOmni @pytest.fixture @@ -588,9 +583,9 @@ async def test_image_generation_node(api_server: str, model: str, image_input: b ServerCase( served_model="Qwen/Qwen2.5-Omni-7B", stage_list=[ - SimpleNamespace(is_comprehension=True, model_stage="llm"), - SimpleNamespace(is_comprehension=False, model_stage="llm"), - SimpleNamespace(is_comprehension=False, model_stage="llm"), + MagicMock(is_comprehension=True, model_stage="llm"), + MagicMock(is_comprehension=False, model_stage="llm"), + MagicMock(is_comprehension=False, model_stage="llm"), ], stage_configs=[ _make_stage_config("llm", is_comprehension=True, model_stage="thinker"), diff --git a/tests/config/__init__.py b/tests/config/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/config/test_pipeline_registry.py b/tests/config/test_pipeline_registry.py deleted file mode 100644 index 6cc7c9258ed..00000000000 --- a/tests/config/test_pipeline_registry.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for the central pipeline registry (2.5/N).""" - -from __future__ import annotations - -import pytest - -from vllm_omni.config.pipeline_registry import _OMNI_PIPELINES -from vllm_omni.config.stage_config import ( - _PIPELINE_REGISTRY, - 
PipelineConfig, - StageExecutionType, - StagePipelineConfig, - register_pipeline, -) - - -class TestCentralRegistryDeclarations: - """Every in-tree pipeline must be declared exactly once in the central registry.""" - - def test_omni_entries_visible_in_registry(self): - for key in _OMNI_PIPELINES: - assert key in _PIPELINE_REGISTRY - - def test_expected_omni_pipelines_present(self): - # Guard against accidental removal during future refactors. - assert "qwen2_5_omni" in _OMNI_PIPELINES - assert "qwen2_5_omni_thinker_only" in _OMNI_PIPELINES - assert "qwen3_omni_moe" in _OMNI_PIPELINES - assert "qwen3_tts" in _OMNI_PIPELINES - - -class TestLazyLoading: - """Pipelines are imported only on first access.""" - - def test_contains_without_import(self): - # ``in`` hits the lazy map, not the loaded cache. - assert "qwen3_omni_moe" in _PIPELINE_REGISTRY - - def test_getitem_loads_correct_pipeline(self): - pipeline = _PIPELINE_REGISTRY["qwen3_omni_moe"] - assert pipeline.model_type == "qwen3_omni_moe" - assert pipeline.model_arch == "Qwen3OmniMoeForConditionalGeneration" - - def test_unknown_model_type_returns_none_via_get(self): - assert _PIPELINE_REGISTRY.get("not_a_real_pipeline") is None - - def test_unknown_model_type_raises_keyerror_via_getitem(self): - with pytest.raises(KeyError): - _PIPELINE_REGISTRY["not_a_real_pipeline"] - - def test_iteration_yields_registered_pipelines(self): - keys = set(_PIPELINE_REGISTRY) - assert "qwen2_5_omni" in keys - assert "qwen3_omni_moe" in keys - - -class TestDynamicRegistration: - """``register_pipeline()`` still works for plugins and tests.""" - - def test_register_adds_to_registry(self): - custom = PipelineConfig( - model_type="_test_dynamic_registration", - model_arch="DynamicTestModel", - stages=( - StagePipelineConfig( - stage_id=0, - model_stage="test", - execution_type=StageExecutionType.LLM_AR, - input_sources=(), - final_output=True, - ), - ), - ) - register_pipeline(custom) - try: - assert "_test_dynamic_registration" in 
_PIPELINE_REGISTRY - assert _PIPELINE_REGISTRY["_test_dynamic_registration"] is custom - finally: - # Don't leak the test registration into other tests. - if "_test_dynamic_registration" in _PIPELINE_REGISTRY: - del _PIPELINE_REGISTRY["_test_dynamic_registration"] - - def test_dynamic_registration_overrides_lazy_entry(self): - # Build a substitute for qwen3_omni_moe that we can distinguish. - original = _PIPELINE_REGISTRY["qwen3_omni_moe"] - override = PipelineConfig( - model_type="qwen3_omni_moe", - model_arch="OverriddenArch", - stages=original.stages, - ) - register_pipeline(override) - try: - assert _PIPELINE_REGISTRY["qwen3_omni_moe"].model_arch == "OverriddenArch" - finally: - # Remove the dynamic override so later tests see the original. - if "qwen3_omni_moe" in _PIPELINE_REGISTRY._loaded: - del _PIPELINE_REGISTRY["qwen3_omni_moe"] diff --git a/tests/conftest.py b/tests/conftest.py index 77075f9525a..8e9a7bf9280 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,62 +1,3180 @@ -""" -Root pytest entrypoint for the vLLM-Omni test suite. - -- `tests/conftest.py` stays thin: plugin registration + compatibility re-exports. -- Importable utilities live under `tests/helpers/`. -- Fixtures live under `tests/helpers/fixtures/` and are loaded via `pytest_plugins`. -""" - -from __future__ import annotations - -pytest_plugins = ( - "tests.helpers.fixtures.env", - "tests.helpers.fixtures.log", - "tests.helpers.fixtures.run_args", - "tests.helpers.fixtures.runtime", -) - - -def pytest_terminal_summary(terminalreporter, exitstatus, config): - # Marker for Buildkite log folding before pytest summary lines. - terminalreporter.write_line("--- Running Summary") - - -# Backward-compatible lazy re-exports. -# (Many tests still import from `tests.conftest`; migrate these imports to `tests.helpers.*` over time.) -# Keep these lazy so conftest import does not trigger heavy helper dependencies. 
-_ASSERTION_EXPORT_NAMES = ( - "assert_audio_speech_response", - "assert_diffusion_response", - "assert_image_diffusion_response", - "assert_image_valid", - "assert_omni_response", - "assert_video_diffusion_response", - "assert_video_valid", -) -_MEDIA_EXPORT_NAMES = ( - "convert_audio_bytes_to_text", - "convert_audio_file_to_text", - "cosine_similarity_text", - "decode_b64_image", - "generate_synthetic_audio", - "generate_synthetic_image", - "generate_synthetic_video", -) -_STAGE_CONFIG_EXPORT_NAMES = ("modify_stage_config",) -_RUNTIME_EXPORT_NAMES = ( - "DiffusionResponse", - "OmniResponse", - "OmniRunner", - "OmniRunnerHandler", - "OmniServer", - "OmniServerParams", - "OmniServerStageCli", - "OpenAIClientHandler", - "dummy_messages_from_mix_data", -) -_LAZY_EXPORT_MODULES = { - **{name: "tests.helpers.assertions" for name in _ASSERTION_EXPORT_NAMES}, - **{name: "tests.helpers.media" for name in _MEDIA_EXPORT_NAMES}, - **{name: "tests.helpers.stage_config" for name in _STAGE_CONFIG_EXPORT_NAMES}, - **{name: "tests.helpers.runtime" for name in _RUNTIME_EXPORT_NAMES}, +import base64 +import datetime +import io +import json +import math +import os +import random +import re +import tempfile + +import requests + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +# Set CPU device for CI environments without GPU +if "VLLM_TARGET_DEVICE" not in os.environ: + os.environ["VLLM_TARGET_DEVICE"] = "cpu" + +import concurrent.futures +import gc +import multiprocessing +import socket +import subprocess +import sys +import threading +import time +import uuid +from collections.abc import Generator +from dataclasses import dataclass +from io import BytesIO +from pathlib import Path +from typing import Any, NamedTuple + +import cv2 +import numpy as np +import psutil +import pytest +import soundfile as sf +import torch +import yaml +from openai import OpenAI, omit +from PIL import Image +from transformers import pipeline +from vllm import TextPrompt +from 
vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from vllm.logger import init_logger +from vllm.utils.network_utils import get_open_port + +from vllm_omni.entrypoints.omni import Omni +from vllm_omni.inputs.data import OmniSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + +logger = init_logger(__name__) + +PromptAudioInput = list[tuple[Any, int]] | tuple[Any, int] | None +PromptImageInput = list[Any] | Any | None +PromptVideoInput = list[Any] | Any | None + +_GENDER_PIPELINE = None +# transformers.Pipeline is not thread-safe; concurrent e2e requests must serialize inference. +_GENDER_PIPELINE_LOCK = threading.Lock() + +# int16 mono PCM from /v1/audio/speech when response_format=pcm (Qwen3-TTS code2wav output rate). +_PCM_SPEECH_SAMPLE_RATE_HZ = 24_000 + + +class OmniServerParams(NamedTuple): + model: str + port: int | None = None + stage_config_path: str | None = None + server_args: list[str] | None = None + env_dict: dict[str, str] | None = None + use_omni: bool = True + + +def assert_image_diffusion_response( + response, + request_config: dict[str, Any], + run_level: str = None, +) -> None: + """ + Validate image diffusion response. + + Expected request_config schema: + { + "request_type": "image", + "extra_body": { + "num_outputs_per_prompt": 1, + "width": ..., + "height": ..., + ... 
+ } + } + """ + assert response.images is not None, "Image response is None" + assert len(response.images) > 0, "No images in response" + + extra_body = request_config.get("extra_body") or {} + + num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt") + if num_outputs_per_prompt is not None: + assert len(response.images) == num_outputs_per_prompt, ( + f"Expected {num_outputs_per_prompt} images, got {len(response.images)}" + ) + + if run_level == "advanced_model": + width = extra_body.get("width") + height = extra_body.get("height") + + if width is not None or height is not None: + for img in response.images: + assert_image_valid(img, width=width, height=height) + + +def assert_video_diffusion_response( + response, + request_config: dict[str, Any], + run_level: str = None, +) -> None: + """ + Validate video diffusion response. + + Expected request_config schema: + { + "request_type": "video", + "form_data": { + "prompt": "...", + "num_frames": ..., + "width": ..., + "height": ..., + "fps": ..., + ... + } + } + """ + form_data = request_config.get("form_data", {}) + + assert response.videos is not None, "Video response is None" + assert len(response.videos) > 0, "No videos in response" + + expected_frames = _maybe_int(form_data.get("num_frames")) + expected_width = _maybe_int(form_data.get("width")) + expected_height = _maybe_int(form_data.get("height")) + expected_fps = _maybe_int(form_data.get("fps")) + + for vid_bytes in response.videos: + assert_video_valid( + vid_bytes, + num_frames=expected_frames, + width=expected_width, + height=expected_height, + fps=expected_fps, + ) + + +def assert_audio_diffusion_response( + response, + request_config: dict[str, Any], + run_level: str = None, +) -> None: + """ + Validate audio diffusion response. 
+ """ + raise NotImplementedError("Audio validation is not implemented yet") + # consider using assert_audio_valid defined above + + +def _maybe_int(value: Any) -> int | None: + if value is None: + return None + return int(value) + + +def assert_image_valid(image: Path | Image.Image, *, width: int | None = None, height: int | None = None): + """Assert the file is a loadable image with optional exact dimensions.""" + if isinstance(image, Path): + assert image.exists(), f"Image not found: {image}" + image = Image.open(image) + image.load() + assert image.width > 0 and image.height > 0 + if width is not None: + assert image.width == width, f"Expected width={width}, got {image.width}" + if height is not None: + assert image.height == height, f"Expected height={height}, got {image.height}" + return image + + +def assert_video_valid( + video: Path | bytes | BytesIO, + *, + num_frames: int | None = None, + width: int | None = None, + height: int | None = None, + fps: float | None = None, +) -> dict[str, int | float]: + """Assert the MP4 has the expected resolution and exact frame count.""" + temp_path = None + cap = None + try: + # Normalize input to file path + if isinstance(video, Path): + if not video.exists(): + raise AssertionError(f"Video file not found: {video}") + video_path = str(video) + else: + # Create temp file for bytes/BytesIO + suffix = ".mp4" + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, mode="wb") as tmp: + if isinstance(video, bytes): + tmp.write(video) + elif isinstance(video, BytesIO): + tmp.write(video.getvalue()) + else: + raise TypeError(f"Unsupported video type: {type(video)}") + temp_path = Path(tmp.name) + video_path = str(temp_path) + + # Open video capture + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise AssertionError(f"Failed to open video: {video_path}") + + # Extract properties + actual_num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + actual_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + 
actual_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + actual_fps = cap.get(cv2.CAP_PROP_FPS) + + actual_num_frames = 0 + while True: + ok, _frame = cap.read() + if not ok: + break + actual_num_frames += 1 + + # Basic validity checks + if actual_num_frames <= 0: + raise AssertionError(f"Invalid frame count: {actual_num_frames} (must be > 0)") + if actual_width <= 0 or actual_height <= 0: + raise AssertionError(f"Invalid dimensions: {actual_width}x{actual_height} (must be > 0)") + if actual_fps <= 0: + raise AssertionError(f"Invalid FPS: {actual_fps} (must be > 0)") + + # Validate against expectations + if num_frames is not None: + expected_num_frames = (num_frames // 4) * 4 + 1 + assert actual_num_frames == expected_num_frames, ( + f"Frame count mismatch: expected {num_frames}, got {actual_num_frames}" + ) + if width is not None: + assert actual_width == width, f"Width mismatch: expected {width}px, got {actual_width}px" + if height is not None: + assert actual_height == height, f"Height mismatch: expected {height}px, got {actual_height}px" + if fps is not None: + # Use tolerance for float comparison (codec rounding) + assert abs(actual_fps - fps) < 0.5, f"FPS mismatch: expected {fps}, got {actual_fps:.2f}" + + return {"num_frames": actual_num_frames, "width": actual_width, "height": actual_height, "fps": actual_fps} + + except Exception as e: + print(f"ERROR: {type(e).__name__}: {e}", flush=True) + raise + + finally: + # Cleanup resources + if cap is not None: + cap.release() + if temp_path and temp_path.exists(): + try: + temp_path.unlink() + except OSError: + pass + + +def assert_audio_valid(path: Path, *, sample_rate: int, channels: int, duration_s: float) -> None: + """Assert the WAV has the expected sample rate, channel count, and duration.""" + assert path.exists(), f"Audio not found: {path}" + info = sf.info(str(path)) + assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}" + assert info.channels == channels, 
f"Expected {channels} channel(s), got {info.channels}" + expected_frames = int(duration_s * sample_rate) + assert info.frames == expected_frames, ( + f"Expected {expected_frames} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" + ) + + +def decode_b64_image(b64: str): + img = Image.open(BytesIO(base64.b64decode(b64))) + img.load() + return img + + +@pytest.fixture(scope="session") +def model_prefix() -> str: + """Optional model-path prefix from MODEL_PREFIX env var. + Useful if models are downloaded to non-default local directories. + """ + prefix = os.environ.get("MODEL_PREFIX", "") + return f"{prefix.rstrip('/')}/" if prefix else "" + + +@pytest.fixture(autouse=True) +def default_vllm_config(): + """Set a default VllmConfig for all tests. + + This fixture is auto-used for all tests to ensure that any test + that directly instantiates vLLM CustomOps (e.g., RMSNorm, LayerNorm) + or model components has the required VllmConfig context. + + This fixture is required for vLLM 0.14.0+ where CustomOp initialization + requires a VllmConfig context set via set_current_vllm_config(). 
+ """ + from vllm.config import DeviceConfig, VllmConfig, set_current_vllm_config + + # Use CPU device if no GPU is available (e.g., in CI environments) + has_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = "cuda" if has_gpu else "cpu" + device_config = DeviceConfig(device=device) + + with set_current_vllm_config(VllmConfig(device_config=device_config)): + yield + + +@pytest.fixture(autouse=True) +def clean_gpu_memory_between_tests(): + print("\n=== PRE-TEST GPU CLEANUP ===") + _run_pre_test_cleanup() + yield + _run_post_test_cleanup() + + +@pytest.fixture(autouse=True) +def log_test_name_before_test(request): + print(f"--- Running test: {request.node.name}") + yield + + +def _run_pre_test_cleanup(enable_force=False): + if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force: + print("GPU cleanup disabled") + return + + print("Pre-test GPU status:") + + num_gpus = torch.cuda.device_count() + if num_gpus > 0: + try: + from tests.utils import wait_for_gpu_memory_to_clear + + wait_for_gpu_memory_to_clear( + devices=list(range(num_gpus)), + threshold_ratio=0.05, + ) + except Exception as e: + print(f"Pre-test cleanup note: {e}") + + +def _run_post_test_cleanup(enable_force=False): + if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force: + print("GPU cleanup disabled") + return + + if torch.cuda.is_available(): + gc.collect() + torch.cuda.empty_cache() + + print("Post-test GPU status:") + _print_gpu_processes() + + +def _print_gpu_processes(): + """Print GPU information including nvidia-smi and system processes""" + + print("\n" + "=" * 80) + print("NVIDIA GPU Information (nvidia-smi)") + print("=" * 80) + + try: + nvidia_result = subprocess.run( + ["nvidia-smi"], + capture_output=True, + text=True, + timeout=5, + ) + + if nvidia_result.returncode == 0: + lines = nvidia_result.stdout.strip().split("\n") + for line in lines[:20]: + print(line) + + if len(lines) > 20: + print(f"... 
(showing first 20 of {len(lines)} lines)") + else: + print("nvidia-smi command failed") + + except (subprocess.TimeoutExpired, FileNotFoundError): + print("nvidia-smi not available or timed out") + except Exception as e: + print(f"Error running nvidia-smi: {e}") + + print("\n" + "=" * 80) + print("Detailed GPU Processes (nvidia-smi pmon)") + print("=" * 80) + + try: + pmon_result = subprocess.run( + ["nvidia-smi", "pmon", "-c", "1"], + capture_output=True, + text=True, + timeout=3, + ) + + if pmon_result.returncode == 0 and pmon_result.stdout.strip(): + print(pmon_result.stdout) + else: + print("No active GPU processes found via nvidia-smi pmon") + + except Exception: + print("nvidia-smi pmon not available") + + print("\n" + "=" * 80) + print("System Processes with GPU keywords") + print("=" * 80) + + +def dummy_messages_from_mix_data( + system_prompt: dict[str, Any] = None, + video_data_url: Any = None, + audio_data_url: Any = None, + image_data_url: Any = None, + content_text: str = None, +): + """Create messages with video、image、audio data URL for OpenAI API.""" + + if content_text is not None: + content = [{"type": "text", "text": content_text}] + else: + content = [] + + media_items = [] + if isinstance(video_data_url, list): + for video_url in video_data_url: + media_items.append((video_url, "video")) + else: + media_items.append((video_data_url, "video")) + + if isinstance(image_data_url, list): + for url in image_data_url: + media_items.append((url, "image")) + else: + media_items.append((image_data_url, "image")) + + if isinstance(audio_data_url, list): + for url in audio_data_url: + media_items.append((url, "audio")) + else: + media_items.append((audio_data_url, "audio")) + + content.extend( + {"type": f"{media_type}_url", f"{media_type}_url": {"url": url}} + for url, media_type in media_items + if url is not None + ) + messages = [{"role": "user", "content": content}] + if system_prompt is not None: + messages = [system_prompt] + messages + return 
messages + + +def generate_synthetic_audio( + duration: int, # seconds + num_channels: int, # 1:Mono,2:Stereo 5:5.1 surround sound + sample_rate: int = 48000, # Default use 48000Hz. + save_to_file: bool = False, +) -> dict[str, Any]: + """ + Generate TTS speech with pyttsx3 and return base64 string. + """ + + import pyttsx3 + import soundfile as sf + + def _pick_voice(engine: pyttsx3.Engine) -> str | None: + voices = engine.getProperty("voices") + if not voices: + return None + + preferred_tokens = ( + "natural", + "jenny", + "sonia", + "susan", + "zira", + "aria", + "hazel", + "samantha", + "ava", + "allison", + "female", + "woman", + "english-us", + "en-us", + "english", + ) + discouraged_tokens = ( + "espeak", + "robot", + "mbrola", + "microsoft david", + "male", + "man", + ) + + best_voice = voices[0] + best_score = float("-inf") + for voice in voices: + voice_text = f"{getattr(voice, 'id', '')} {getattr(voice, 'name', '')}".lower() + voice_languages = " ".join( + lang.decode(errors="ignore") if isinstance(lang, bytes) else str(lang) + for lang in getattr(voice, "languages", []) + ).lower() + combined_text = f"{voice_text} {voice_languages}" + score = 0 + for idx, token in enumerate(preferred_tokens): + if token in combined_text: + score += 20 - idx + for token in discouraged_tokens: + if token in combined_text: + score -= 10 + if "english" in combined_text or "en_" in combined_text or "en-" in combined_text: + score += 4 + if "en-us" in combined_text or "english-us" in combined_text: + score += 4 + if score > best_score: + best_score = score + best_voice = voice + + return best_voice.id + + def _resample_audio(audio: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray: + if src_sr == dst_sr or len(audio) == 0: + return audio.astype(np.float32) + + src_len = audio.shape[0] + dst_len = max(1, int(round(src_len * float(dst_sr) / float(src_sr)))) + src_idx = np.arange(src_len, dtype=np.float32) + dst_idx = np.linspace(0, src_len - 1, dst_len, dtype=np.float32) + + 
resampled_channels: list[np.ndarray] = [] + for ch in range(audio.shape[1]): + resampled_channels.append(np.interp(dst_idx, src_idx, audio[:, ch]).astype(np.float32)) + return np.stack(resampled_channels, axis=1) + + def _match_channels(audio: np.ndarray, target_channels: int) -> np.ndarray: + current_channels = audio.shape[1] + if current_channels == target_channels: + return audio.astype(np.float32) + if target_channels == 1: + return np.mean(audio, axis=1, keepdims=True, dtype=np.float32) + if current_channels == 1: + return np.repeat(audio, target_channels, axis=1).astype(np.float32) + + collapsed = np.mean(audio, axis=1, keepdims=True, dtype=np.float32) + return np.repeat(collapsed, target_channels, axis=1).astype(np.float32) + + def _trim_silence(audio: np.ndarray, threshold: float = 0.01) -> np.ndarray: + if len(audio) == 0: + return audio + energy = np.max(np.abs(audio), axis=1) + voiced = np.where(energy > threshold)[0] + if len(voiced) == 0: + return audio + start = max(0, int(voiced[0]) - int(sample_rate * 0.02)) + end = min(len(audio), int(voiced[-1]) + int(sample_rate * 0.04) + 1) + return audio[start:end] + + def _enhance_speech(audio: np.ndarray) -> np.ndarray: + if len(audio) == 0: + return audio.astype(np.float32) + enhanced = audio.astype(np.float32).copy() + enhanced -= np.mean(enhanced, axis=0, keepdims=True, dtype=np.float32) + if len(enhanced) > 1: + preemphasis = enhanced.copy() + preemphasis[1:] = enhanced[1:] - 0.94 * enhanced[:-1] + enhanced = 0.7 * enhanced + 0.3 * preemphasis + # Mild dynamic-range compression for ASR/TTS robustness. + enhanced = np.sign(enhanced) * np.sqrt(np.abs(enhanced)) + # Light fade to avoid clicks after trimming/repeating. 
+ fade = min(len(enhanced) // 4, max(1, int(sample_rate * 0.01))) + if fade > 1: + ramp_in = np.linspace(0.0, 1.0, fade, dtype=np.float32) + ramp_out = np.linspace(1.0, 0.0, fade, dtype=np.float32) + enhanced[:fade] *= ramp_in[:, None] + enhanced[-fade:] *= ramp_out[:, None] + peak = float(np.max(np.abs(enhanced))) + if peak > 1e-8: + enhanced = enhanced / peak * 0.95 + return enhanced.astype(np.float32) + + phrase_text = "test" + num_samples = int(sample_rate * max(1, duration)) + audio_data = np.zeros((num_samples, num_channels), dtype=np.float32) + + engine = pyttsx3.init() + engine.setProperty("rate", 112) + engine.setProperty("volume", 1.0) + selected_voice = _pick_voice(engine) + if selected_voice is not None: + engine.setProperty("voice", selected_voice) + + temp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + temp_wav.close() + + try: + engine.save_to_file(phrase_text, temp_wav.name) + engine.runAndWait() + engine.stop() + + ready = False + for _ in range(50): + if os.path.exists(temp_wav.name) and os.path.getsize(temp_wav.name) > 44: + ready = True + break + time.sleep(0.1) + + if not ready: + raise RuntimeError("pyttsx3 did not produce a WAV file in time.") + + tts_audio, tts_sr = sf.read(temp_wav.name, dtype="float32", always_2d=True) + finally: + if os.path.exists(temp_wav.name): + os.unlink(temp_wav.name) + + if len(tts_audio) == 0: + raise RuntimeError("pyttsx3 produced an empty WAV file.") + + tts_audio = _resample_audio(tts_audio, tts_sr, sample_rate) + tts_audio = _match_channels(tts_audio, num_channels) + tts_audio = _trim_silence(tts_audio, threshold=0.012) + tts_audio = _enhance_speech(tts_audio) + + lead_silence = min(int(sample_rate * 0.02), num_samples // 8) + pause_samples = int(sample_rate * 0.18) + start = lead_silence + phrase_len = tts_audio.shape[0] + + while start < num_samples: + take = min(phrase_len, num_samples - start) + audio_data[start : start + take] = tts_audio[:take] + start += phrase_len + pause_samples + + 
max_amp = float(np.max(np.abs(audio_data))) + if max_amp > 0: + audio_data = audio_data / max_amp * 0.95 + + audio_bytes: bytes | None = None + output_path: str | None = None + result: dict[str, Any] = { + "np_array": audio_data.copy(), + } + + if save_to_file: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = f"audio_{num_channels}ch_{timestamp}.wav" + + try: + sf.write(output_path, audio_data, sample_rate, format="WAV", subtype="PCM_16") + print(f"Audio saved: {output_path}") + + with open(output_path, "rb") as f: + audio_bytes = f.read() + except Exception as e: + print(f"Save failed: {e}") + save_to_file = False + + # If not saving or save failed, create in memory + if not save_to_file or audio_bytes is None: + buffer = io.BytesIO() + sf.write(buffer, audio_data, sample_rate, format="WAV", subtype="PCM_16") + buffer.seek(0) + audio_bytes = buffer.read() + + # Return result + base64_audio = base64.b64encode(audio_bytes).decode("utf-8") + result["base64"] = base64_audio + # Always include file_path to avoid KeyError in callers. + result["file_path"] = output_path if save_to_file and output_path else None + + return result + + +def _mux_mp4_bytes_with_synthetic_audio( + video_mp4_bytes: bytes, + *, + num_frames: int, + fps: float = 30.0, + sample_rate: int = 48000, +) -> bytes: + """ + Mux a video-only MP4 with mono TTS audio from :func:`generate_synthetic_audio` (AAC). + + Audio length is at least the video duration in whole seconds (rounded up); ffmpeg + ``-shortest`` trims to the video when the WAV is longer. + + Uses ffmpeg from ``imageio_ffmpeg`` when available, else ``ffmpeg`` on PATH. + If TTS or mux fails, returns ``video_mp4_bytes`` unchanged. + + Mux subprocess does **not** use ``capture_output=True``: ffmpeg can block writing + to a full stderr pipe while :func:`subprocess.run` waits for exit (classic deadlock). 
+ """ + duration_sec = num_frames / fps if fps > 0 else 0.0 + # generate_synthetic_audio(duration=int) uses at least 1s of buffer internally + duration_int = max(1, int(math.ceil(duration_sec))) + + try: + audio_result = generate_synthetic_audio( + duration=duration_int, + num_channels=1, + sample_rate=sample_rate, + save_to_file=False, + ) + audio_pcm = audio_result["np_array"] + except Exception as e: + logger.warning("Synthetic video: generate_synthetic_audio failed (%s); using video-only MP4.", e) + return video_mp4_bytes + + try: + import imageio_ffmpeg + + ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe() + except Exception: + ffmpeg_exe = "ffmpeg" + + import tempfile + + try: + with tempfile.TemporaryDirectory(prefix="syn_vid_mux_") as tmp: + vid_path = os.path.join(tmp, "video.mp4") + wav_path = os.path.join(tmp, "audio.wav") + out_path = os.path.join(tmp, "out.mp4") + with open(vid_path, "wb") as f: + f.write(video_mp4_bytes) + sf.write(wav_path, audio_pcm, sample_rate, format="WAV", subtype="PCM_16") + cmd = [ + ffmpeg_exe, + "-y", + "-nostdin", + "-hide_banner", + "-loglevel", + "error", + "-i", + vid_path, + "-i", + wav_path, + "-c:v", + "copy", + "-c:a", + "aac", + "-b:a", + "128k", + "-shortest", + "-movflags", + "+faststart", + out_path, + ] + subprocess.run( + cmd, + check=True, + stdin=subprocess.DEVNULL, + timeout=300, + ) + with open(out_path, "rb") as f: + return f.read() + except ( + FileNotFoundError, + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + OSError, + ) as e: + logger.warning("Synthetic video: audio mux failed (%s); using video-only MP4.", e) + return video_mp4_bytes + + +def generate_synthetic_video( + width: int, + height: int, + num_frames: int, + save_to_file: bool = False, + *, + embed_audio: bool = False, +) -> dict[str, Any]: + """Generate synthetic video with bouncing balls and base64 MP4. 
+ + When ``embed_audio`` is True, muxes mono AAC from :func:`generate_synthetic_audio` + (TTS + ffmpeg) into the MP4; otherwise returns video-only MP4 (faster when tests do + not need an audio track). + """ + + import cv2 + import imageio + + # Create random balls + num_balls = random.randint(3, 8) + balls = [] + + for _ in range(num_balls): + radius = min(width, height) // 8 + if radius < 1: + raise ValueError(f"Video dimensions ({width}x{height}) are too small for synthetic video generation") + x = random.randint(radius, width - radius) + y = random.randint(radius, height - radius) + + speed = random.uniform(3.0, 8.0) + angle = random.uniform(0, 2 * math.pi) + vx = speed * math.cos(angle) + vy = speed * math.sin(angle) + + # OpenCV uses BGR format, but imageio expects RGB + # We'll create in BGR first, then convert to RGB later + color_bgr = (random.randint(50, 255), random.randint(50, 255), random.randint(50, 255)) + + balls.append({"x": x, "y": y, "vx": vx, "vy": vy, "radius": radius, "color_bgr": color_bgr}) + + # Generate video frames + video_frames = [] + + for frame_idx in range(num_frames): + # Create black background (BGR format) + frame_bgr = np.zeros((height, width, 3), dtype=np.uint8) + + for ball in balls: + # Update position + ball["x"] += ball["vx"] + ball["y"] += ball["vy"] + + # Boundary collision detection + if ball["x"] - ball["radius"] <= 0 or ball["x"] + ball["radius"] >= width: + ball["vx"] = -ball["vx"] + ball["x"] = max(ball["radius"], min(width - ball["radius"], ball["x"])) + + if ball["y"] - ball["radius"] <= 0 or ball["y"] + ball["radius"] >= height: + ball["vy"] = -ball["vy"] + ball["y"] = max(ball["radius"], min(height - ball["radius"], ball["y"])) + + # Use cv2 to draw circle + x, y = int(ball["x"]), int(ball["y"]) + radius = ball["radius"] + + # Draw solid circle (main circle) + cv2.circle(frame_bgr, (x, y), radius, ball["color_bgr"], -1) + + # Add simple 3D effect: draw a brighter center + if radius > 3: # Only add highlight when 
radius is large enough + highlight_radius = max(1, radius // 2) + highlight_x = max(highlight_radius, min(x - radius // 4, width - highlight_radius)) + highlight_y = max(highlight_radius, min(y - radius // 4, height - highlight_radius)) + + # Create highlight color (brighter) + highlight_color = tuple(min(c + 40, 255) for c in ball["color_bgr"]) + cv2.circle(frame_bgr, (highlight_x, highlight_y), highlight_radius, highlight_color, -1) + + # Convert BGR to RGB for imageio + frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB) + video_frames.append(frame_rgb) + + video_array = np.array(video_frames) + result = { + "np_array": video_array, + } + saved_file_path = None + + fps = 30 + buffer = io.BytesIO() + writer_kwargs = { + "format": "mp4", + "fps": fps, + "codec": "libx264", + "quality": 7, + "pixelformat": "yuv420p", + "macro_block_size": 16, + "ffmpeg_params": [ + "-preset", + "medium", + "-crf", + "23", + "-movflags", + "+faststart", + "-pix_fmt", + "yuv420p", + "-vf", + f"scale={width}:{height}", + ], + } + + try: + with imageio.get_writer(buffer, **writer_kwargs) as writer: + for frame in video_frames: + writer.append_data(frame) + buffer.seek(0) + video_only_bytes = buffer.read() + except Exception as e: + print(f"Warning: Failed to encode synthetic video: {e}") + raise + + if embed_audio: + video_bytes = _mux_mp4_bytes_with_synthetic_audio(video_only_bytes, num_frames=num_frames, fps=float(fps)) + else: + video_bytes = video_only_bytes + + if save_to_file: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = f"video_{width}x{height}_{timestamp}.mp4" + try: + with open(output_path, "wb") as f: + f.write(video_bytes) + saved_file_path = output_path + print(f"Video saved to: {saved_file_path}") + except Exception as e: + print(f"Warning: Failed to save video to file {output_path}: {e}") + + base64_video = base64.b64encode(video_bytes).decode("utf-8") + + result["base64"] = base64_video + if save_to_file and saved_file_path: + 
result["file_path"] = saved_file_path + + return result + + +def generate_synthetic_image(width: int, height: int, save_to_file: bool = False) -> dict[str, Any]: + """Generate synthetic image with randomly colored squares and return base64 string.""" + from PIL import Image, ImageDraw + + # Create white background + image = Image.new("RGB", (width, height), (255, 255, 255)) + draw = ImageDraw.Draw(image) + + # Generate random number of squares + num_squares = random.randint(3, 8) + + for _ in range(num_squares): + # Random square size + square_size = random.randint(min(width, height) // 8, min(width, height) // 4) + + # Random position + x = random.randint(0, width - square_size - 1) + y = random.randint(0, height - square_size - 1) + + # Random color + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + + # Random border width + border_width = random.randint(1, 5) + + # Draw square + draw.rectangle([x, y, x + square_size, y + square_size], fill=color, outline=(0, 0, 0), width=border_width) + + image_array = np.array(image) + result = {"np_array": image_array.copy()} + + # Handle file saving + image_bytes = None + saved_file_path = None + + if save_to_file: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = f"image_{width}x{height}_{timestamp}.jpg" + + try: + # Save image to file + image.save(output_path, format="JPEG", quality=85, optimize=True) + saved_file_path = output_path + print(f"Image saved to: {saved_file_path}") + + # Read file for base64 encoding + with open(output_path, "rb") as f: + image_bytes = f.read() + + except Exception as e: + print(f"Warning: Failed to save image to file {output_path}: {e}") + save_to_file = False + + # If not saving or save failed, create in memory + if not save_to_file or image_bytes is None: + buffer = io.BytesIO() + image.save(buffer, format="JPEG", quality=85, optimize=True) + buffer.seek(0) + image_bytes = buffer.read() + + # Generate base64 + base64_image = 
base64.b64encode(image_bytes).decode("utf-8") + + # Return result + result["base64"] = base64_image + if save_to_file and saved_file_path: + result["file_path"] = saved_file_path + + return result + + +def preprocess_text(text): + import opencc + + word_to_num = { + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + + for word, num in word_to_num.items(): + pattern = r"\b" + re.escape(word) + r"\b" + text = re.sub(pattern, num, text, flags=re.IGNORECASE) + + text = re.sub(r"[^\w\s]", "", text) + text = re.sub(r"\s+", " ", text) + cc = opencc.OpenCC("t2s") + text = cc.convert(text) + + # Special handling for spaces between Chinese characters: + # - Keep single spaces between English words/numbers + # - Remove spaces only when surrounded by Chinese characters on both sides to prevent incorrect word segmentation + text = re.sub(r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", text) + + return text.lower().strip() + + +def cosine_similarity_text(text1, text2, n: int = 3): + from collections import Counter + + if not text1 or not text2: + return 0.0 + + text1 = preprocess_text(text1) + text2 = preprocess_text(text2) + print(f"cosine similarity text1 is: {text1}, text2 is: {text2}") + + ngrams1 = [text1[i : i + n] for i in range(len(text1) - n + 1)] + ngrams2 = [text2[i : i + n] for i in range(len(text2) - n + 1)] + + counter1 = Counter(ngrams1) + counter2 = Counter(ngrams2) + + all_ngrams = set(counter1.keys()) | set(counter2.keys()) + vec1 = [counter1.get(ng, 0) for ng in all_ngrams] + vec2 = [counter2.get(ng, 0) for ng in all_ngrams] + + dot_product = sum(a * b for a, b in zip(vec1, vec2)) + norm1 = sum(a * a for a in vec1) ** 0.5 + norm2 = sum(b * b for b in vec2) ** 0.5 + + if norm1 == 0 or norm2 == 0: + return 0.0 + return dot_product / (norm1 * norm2) + + +def convert_audio_to_text(audio_data): + """ + Convert base64 encoded audio data to text 
def _merge_base64_audio_to_segment(base64_list: list[str]):
    """Concatenate base64-encoded audio chunks into a single pydub AudioSegment.

    Each entry may carry a prefix terminated by a comma (everything up to and
    including the first comma is discarded before decoding). Returns ``None``
    when ``base64_list`` is empty.
    """
    from pydub import AudioSegment

    combined = None
    for encoded in base64_list:
        # Keep only what follows the first comma; entries without one are used whole.
        head, sep, tail = encoded.partition(",")
        payload = tail if sep else head
        chunk = AudioSegment.from_file(io.BytesIO(base64.b64decode(payload)))
        if combined is None:
            combined = chunk
        else:
            combined = combined + chunk
    return combined
def modify_stage_config(
    yaml_path: str,
    updates: dict[str, Any] | None = None,
    deletes: dict[str, Any] | None = None,
) -> str:
    """
    Write a modified copy of a YAML config, leaving the original file untouched.

    Deletions are applied first, then updates. Both support top-level keys,
    dot-separated paths (numeric components index into lists), and per-stage
    edits addressed by ``stage_id`` under the ``stage_args`` key.

    Args:
        yaml_path: Path to the YAML configuration file.
        updates: Top-level and stage-specific values to add or overwrite.
            Format: {
                'async_chunk': True,
                'stage_args': {
                    0: {'engine_args.max_model_len': 5800},
                    1: {'engine_args.max_num_seqs': 2}
                }
            }
        deletes: Configurations to delete.
            Format: {
                'old_config': None,  # Delete entire key
                'stage_args': {
                    0: ['engine_args.old_param'],
                    1: ['runtime.unused_setting']
                }
            }

    Returns:
        str: Path of the newly written file, ``<yaml_path without extension>_<time_ns>.yaml``.

    Raises:
        FileNotFoundError: ``yaml_path`` does not exist.
        ValueError: The file cannot be read/parsed, or ``stage_args`` edits were
            requested but the config has no ``stage_args``.
        KeyError: A stage ID in ``updates`` is not present, or a delete path
            traverses a missing key / out-of-range list index.
        TypeError: A dot-separated path traverses a value that is neither dict nor list.
    """
    source = Path(yaml_path)
    if not source.exists():
        raise FileNotFoundError(f"yaml does not exist: {source}")

    try:
        with open(yaml_path, encoding="utf-8") as f:
            config = yaml.safe_load(f) or {}
    except Exception as e:
        # Keep ValueError as the public error type, but preserve the cause.
        raise ValueError(f"Cannot parse YAML file: {e}") from e

    def new_container(next_key: str) -> Any:
        """Return the container a missing path component needs: list for a numeric next key, else dict."""
        return [] if next_key.isdigit() else {}

    def apply_update(config_dict: dict, key_path: str, value: Any) -> None:
        """Set ``value`` at a dot-separated path, creating intermediate containers as needed."""
        if "." not in key_path:
            # Simple key (also used for whole-list values such as engine_input_source: [1, 2]).
            config_dict[key_path] = value
            return

        current = config_dict
        keys = key_path.split(".")

        for i, key in enumerate(keys[:-1]):
            if key.isdigit() and isinstance(current, list):
                index = int(key)
                # Every component handled here has a successor, so new slots must be
                # traversable containers (a None slot could never receive the final key).
                while len(current) <= index:
                    current.append({})
                current = current[index]
            elif isinstance(current, dict):
                # Create the component if missing; replace it if it is a scalar we must descend into.
                if key not in current or not isinstance(current[key], (dict, list)):
                    current[key] = new_container(keys[i + 1])
                current = current[key]
            else:
                raise TypeError(
                    f"Cannot access {'.'.join(keys[: i + 1])} as a dict/list. It's a {type(current).__name__}"
                )

        last_key = keys[-1]
        if isinstance(current, list) and last_key.isdigit():
            index = int(last_key)
            # Pad with None so the target index exists.
            while len(current) <= index:
                current.append(None)
            current[index] = value
        elif isinstance(current, dict):
            current[last_key] = value
        else:
            raise TypeError(f"Cannot set value at {key_path}. Current type is {type(current).__name__}, expected dict.")

    def delete_by_path(config_dict: dict, key_path: str) -> None:
        """Delete the entry at a dot-separated path; a missing final key is only reported."""
        if not key_path:
            return

        current = config_dict
        keys = key_path.split(".")

        # Walk to the parent of the entry to delete.
        for i, key in enumerate(keys[:-1]):
            if key.isdigit() and isinstance(current, list):
                index = int(key)
                if index >= len(current):
                    raise KeyError(f"List index {index} out of bounds")
                current = current[index]
            elif isinstance(current, dict):
                if key not in current:
                    raise KeyError(f"Path {'.'.join(keys[: i + 1])} does not exist")
                current = current[key]
            else:
                raise TypeError(
                    f"Cannot access {'.'.join(keys[: i + 1])} as a dict/list. It's a {type(current).__name__}"
                )

        last_key = keys[-1]
        if isinstance(current, list) and last_key.isdigit():
            index = int(last_key)
            if index >= len(current):
                raise KeyError(f"List index {index} out of bounds")
            del current[index]
        elif isinstance(current, dict) and last_key in current:
            del current[last_key]
        else:
            print(f"Path {key_path} does not exist")

    def require_stage_args() -> list:
        """Return the config's non-empty ``stage_args`` list or raise ValueError."""
        stage_args = config.get("stage_args", [])
        if not stage_args:
            raise ValueError("stage_args does not exist in config")
        return stage_args

    def find_stage(stage_args: list, stage_id: Any) -> Any:
        """Return the stage whose ``stage_id`` equals ``int(stage_id)``, or None."""
        wanted = int(stage_id)
        for stage in stage_args:
            if stage.get("stage_id") == wanted:
                return stage
        return None

    # Apply deletions first
    if deletes:
        for key, value in deletes.items():
            if key == "stage_args":
                if value and isinstance(value, dict):
                    stage_args = require_stage_args()
                    for stage_id, delete_paths in value.items():
                        if not delete_paths:
                            continue
                        target_stage = find_stage(stage_args, stage_id)
                        if target_stage is None:
                            # Deleting from a stage that does not exist is a no-op.
                            continue
                        for delete_path in delete_paths:
                            if delete_path:  # Skip empty paths
                                delete_by_path(target_stage, delete_path)
            elif "." in key:
                delete_by_path(config, key)
            elif value is None and key in config:
                # Delete entire key
                del config[key]

    # Apply updates
    if updates:
        for key, value in updates.items():
            if key == "stage_args":
                if value and isinstance(value, dict):
                    stage_args = require_stage_args()
                    for stage_id, stage_updates in value.items():
                        target_stage = find_stage(stage_args, stage_id)
                        if target_stage is None:
                            available_ids = [s.get("stage_id") for s in stage_args if "stage_id" in s]
                            raise KeyError(f"Stage ID {stage_id} not found, available: {available_ids}")
                        # apply_update handles both plain keys and dot-separated paths.
                        for update_path, val in stage_updates.items():
                            apply_update(target_stage, update_path, val)
            else:
                apply_update(config, key, value)

    # Unique suffix: multiple modify_stage_config calls in one process often run
    # within the same second, so a second-resolution timestamp would collide and
    # the later write would overwrite the earlier YAML on disk.
    # splitext only strips an extension from the final path component, so a dot in
    # a directory name (or a leading "./") no longer truncates the output path.
    base_name = os.path.splitext(yaml_path)[0]
    output_path = f"{base_name}_{time.time_ns()}.yaml"

    with open(output_path, "w", encoding="utf-8") as f:
        yaml.dump(config, f, default_flow_style=None, sort_keys=False, allow_unicode=True, indent=2)

    return output_path
def _kill_process_tree(self, pid):
    """Terminate ``pid`` and all of its descendants, then verify they are gone.

    Children are asked to terminate first and killed if still running after
    10 seconds; the parent is then terminated and, on timeout, killed. Any PID
    that still exists afterwards is reported and sent ``kill -9`` as a last
    resort. A process that has already exited is not treated as an error.

    Args:
        pid: PID of the root process of the tree to stop.
    """
    try:
        parent = psutil.Process(pid)
        children = parent.children(recursive=True)

        # Snapshot every PID up front so the final verification covers the whole tree.
        all_pids = [pid] + [child.pid for child in children]

        for child in children:
            try:
                child.terminate()
            except psutil.NoSuchProcess:
                pass

        _, still_alive = psutil.wait_procs(children, timeout=10)

        for child in still_alive:
            try:
                child.kill()
            except psutil.NoSuchProcess:
                pass

        try:
            parent.terminate()
            parent.wait(timeout=10)
        except (psutil.NoSuchProcess, psutil.TimeoutExpired):
            try:
                parent.kill()
                # Wait after the kill as well: an un-waited child stays around as a
                # zombie, and pid_exists() below would then report it as alive.
                parent.wait(timeout=5)
            except (psutil.NoSuchProcess, psutil.TimeoutExpired):
                pass

        # VERIFICATION: give the system a moment, then check every PID of the tree.
        time.sleep(1)
        alive_processes = [check_pid for check_pid in all_pids if psutil.pid_exists(check_pid)]

        if alive_processes:
            print(f"Warning: Processes still alive: {alive_processes}")
            for alive_pid in alive_processes:
                try:
                    subprocess.run(["kill", "-9", str(alive_pid)], timeout=2)
                except Exception as e:
                    # Best-effort cleanup: report and carry on with the remaining PIDs.
                    print(f"Cleanup failed: {e}")

    except psutil.NoSuchProcess:
        pass
@pytest.fixture(scope="module")
def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: str) -> Generator[OmniServer, Any, None]:
    """Start a vLLM-Omni server subprocess with actual model weights.

    The fixture is module-scoped: one server is started per test module (and
    per parametrisation) and torn down when the module finishes. Startup is
    serialised through ``_omni_server_lock``, which is held for the lifetime
    of the server. Multi-stage initialization can take 10-20+ minutes.

    ``request.param`` must be an ``OmniServerParams``. For the
    ``advanced_model`` run level, ``engine_args.load_format`` is removed from
    every stage of the stage config before the server is launched.
    """
    with _omni_server_lock:
        params: OmniServerParams = request.param
        model = model_prefix + params.model
        stage_config_path = params.stage_config_path
        if run_level == "advanced_model" and stage_config_path is not None:
            with open(stage_config_path, encoding="utf-8") as f:
                cfg = yaml.safe_load(f) or {}
            stage_ids = [stage["stage_id"] for stage in cfg.get("stage_args", []) if "stage_id" in stage]
            stage_config_path = modify_stage_config(
                stage_config_path,
                deletes={"stage_args": {stage_id: ["engine_args.load_format"] for stage_id in stage_ids}},
            )

        # Copy the list: the in-place "+=" below must never mutate the shared,
        # parametrised params.server_args (it would accumulate flags across modules).
        server_args = list(params.server_args or [])
        if params.use_omni:
            server_args = ["--stage-init-timeout", "120", *server_args]
        if stage_config_path is not None:
            server_args += ["--stage-configs-path", stage_config_path]

        # A falsy port (None or 0) lets OmniServer pick an open port itself.
        with OmniServer(
            model,
            server_args,
            port=params.port or None,
            env_dict=params.env_dict,
            use_omni=params.use_omni,
        ) as server:
            print("OmniServer started successfully")
            yield server
            print("OmniServer stopping...")

        print("OmniServer stopped")
audio_format: str | None = None + audio_bytes: bytes | None = None + similarity: float | None = None + e2e_latency: float | None = None + success: bool = False + error_message: str | None = None + + +@dataclass +class DiffusionResponse: + text_content: str | None = None + images: list[Image.Image] | None = None + audios: list[Any] | None = None + videos: list[Any] | None = None + e2e_latency: float | None = None + success: bool = False + error_message: str | None = None + + +def _load_gender_pipeline(): + """ + Lazy-load a cached audio-classification pipeline for gender. + + We prefer the pipeline wrapper because it encapsulates processor/model loading + and avoids direct AutoProcessor.from_pretrained call sites in this file. + """ + global _GENDER_PIPELINE + if _GENDER_PIPELINE is not None: + return _GENDER_PIPELINE + + model_name = "7wolf/wav2vec2-base-gender-classification" + try: + # device=-1 forces CPU for pipeline. + _GENDER_PIPELINE = pipeline( + task="audio-classification", + model=model_name, + device=-1, + ) + return _GENDER_PIPELINE + except Exception as exc: # pragma: no cover - best-effort fallback + print(f"Warning: failed to create gender pipeline '{model_name}': {exc}") + _GENDER_PIPELINE = None + return None + + +def _median_pitch_hz_from_autocorr(mono: np.ndarray, sr: int) -> float | None: + """ + Rough median F0 (Hz) over short-time frames. Used to debias wav2vec2 gender head on TTS, + which often labels lower-pitched synthetic speech as female under load or on clean signals. + Returns None if the clip is too short or mostly unvoiced. 
+ """ + x = np.asarray(mono, dtype=np.float64) + x = x - np.mean(x) + if x.size < int(0.15 * sr): + return None + frame_len = int(0.04 * sr) + hop = max(frame_len // 2, 1) + f0_min_hz, f0_max_hz = 70.0, 400.0 + lag_min = max(1, int(sr / f0_max_hz)) + lag_max = min(frame_len - 2, int(sr / f0_min_hz)) + if lag_max <= lag_min: + return None + win = np.hamming(frame_len) + pitches: list[float] = [] + for start in range(0, int(x.shape[0]) - frame_len, hop): + frame = x[start : start + frame_len] * win + frame = frame - np.mean(frame) + if float(np.sqrt(np.mean(frame**2))) < 1e-4: + continue + ac = np.correlate(frame, frame, mode="full")[frame_len - 1 :] + ac = ac / (float(ac[0]) + 1e-12) + region = ac[lag_min : lag_max + 1] + peak_rel = int(np.argmax(region)) + peak_lag = peak_rel + lag_min + if peak_lag <= 0: + continue + f0 = float(sr) / float(peak_lag) + if f0_min_hz <= f0 <= f0_max_hz: + pitches.append(f0) + if len(pitches) < 4: + return None + return float(np.median(np.asarray(pitches, dtype=np.float64))) + + +def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str: + """ + Estimate voice gender from audio using a small pre-trained classification model. + + Uses a cached `audio-classification` pipeline to classify the clip. + Returns 'male' / 'female' when the model confidence is >= 0.9 and the label + maps to one of these; otherwise returns 'unknown'. If the model is unavailable + or inference fails, returns 'unknown' to keep tests stable. + + Under concurrent tests, a global lock serializes pipeline calls (the HF pipeline is not + thread-safe). A coarse F0 median can correct systematic "male -> female" errors on TTS audio. 
+ """ + data, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=True) + if data.size == 0: + raise ValueError("Empty audio") + mono = np.mean(data, axis=1) + + try: + target_sr = 16000 + if int(sr) != target_sr and mono.size > 1: + src_len = int(mono.shape[0]) + dst_len = max(1, int(round(src_len * float(target_sr) / float(sr)))) + src_idx = np.arange(src_len, dtype=np.float32) + dst_idx = np.linspace(0, src_len - 1, dst_len, dtype=np.float32) + mono = np.interp(dst_idx, src_idx, mono.astype(np.float32, copy=False)).astype(np.float32) + sr = target_sr + + median_f0 = _median_pitch_hz_from_autocorr(mono, sr) + + clf = _load_gender_pipeline() + if clf is None: + print("gender model not available, returning 'unknown'") + return "unknown" + + # transformers pipeline returns a list of {label, score} (highest score first). + with _GENDER_PIPELINE_LOCK: + outputs = clf(mono, sampling_rate=sr) + if not outputs: + return "unknown" + + top = outputs[0] + label = str(top.get("label", "")).lower() + conf = float(top.get("score", 0.0)) + + if conf < 0.5: + gender = "unknown" + # Some models use non-English labels (e.g., Russian). Normalize to 'male'/'female'. + elif ("female" in label) or ("жен" in label): + gender = "female" + elif ("male" in label) or ("муж" in label): + gender = "male" + else: + gender = "unknown" + + # Debias: wav2vec2 gender heads often call TTS / band-limited male speech "female". + # Low median F0 (~speech male range) + female label -> trust pitch when score is not overwhelming. 
+ if gender == "female" and median_f0 is not None and median_f0 < 165.0 and conf < 0.88: + print(f"gender pitch assist: reclassifying female->male (median_f0={median_f0:.1f} Hz, conf={conf:.3f})") + gender = "male" + elif gender == "male" and median_f0 is not None and median_f0 > 230.0 and conf < 0.88: + print(f"gender pitch assist: reclassifying male->female (median_f0={median_f0:.1f} Hz, conf={conf:.3f})") + gender = "female" + + print( + f"gender classifier: label={label}, conf={conf:.3f}, gender={gender}" + + (f", median_f0={median_f0:.1f}Hz" if median_f0 is not None else "") + ) + return gender + except Exception as exc: # pragma: no cover - best-effort fallback + print(f"Warning: gender classification failed, returning 'unknown': {exc}") + return "unknown" + + +_PRESET_VOICE_GENDER_MAP: dict[str, str] = { + "serena": "female", + "uncle_fu": "male", + "chelsie": "female", + "clone": "female", + "ethan": "male", } + + +def _assert_preset_voice_gender_from_audio( + audio_bytes: bytes | None, + voice_name: str | None, +) -> None: + """If ``voice_name`` matches a known preset, assert classifier gender matches (skip when unknown).""" + if not voice_name or not audio_bytes: + return + key = str(voice_name).lower() + expected_gender = _PRESET_VOICE_GENDER_MAP.get(key) + if expected_gender is None: + return + estimated_gender = _estimate_voice_gender_from_audio(audio_bytes) + print(f"Preset voice gender check: preset={key!r}, estimated={estimated_gender!r}, expected={expected_gender!r}") + if estimated_gender != "unknown": + assert estimated_gender == expected_gender, ( + f"{voice_name!r} is expected {expected_gender}, but estimated gender is {estimated_gender!r}" + ) + + +# Threshold aligned with _compute_pcm_hnr_db docstring (clean clone vs distorted). +_MIN_PCM_SPEECH_HNR_DB = 1.0 + + +def _compute_pcm_hnr_db(pcm_samples: np.ndarray, sr: int = _PCM_SPEECH_SAMPLE_RATE_HZ) -> float: + """Compute mean Harmonic-to-Noise Ratio (dB) for speech quality. 
def _assert_pcm_int16_speech_hnr(audio_bytes: bytes) -> None:
    """Assert that raw int16 PCM from /v1/audio/speech reaches the minimum harmonic-to-noise ratio.

    The bytes are interpreted as int16 samples, scaled by 1/32768, and passed
    to ``_compute_pcm_hnr_db``; the result must be at least
    ``_MIN_PCM_SPEECH_HNR_DB``.

    Raises:
        AssertionError: The payload is missing, shorter than one sample, not a
            whole number of int16 samples, or its HNR is below the threshold.
    """
    assert audio_bytes is not None and len(audio_bytes) >= 2, "missing PCM bytes"
    assert len(audio_bytes) % 2 == 0, "PCM byte length must be aligned to int16"

    raw_int16 = np.frombuffer(audio_bytes, dtype=np.int16)
    normalized = raw_int16.astype(np.float32) / 32768.0
    hnr_db = _compute_pcm_hnr_db(normalized)

    print(f"PCM speech HNR: {hnr_db:.2f} dB (threshold: {_MIN_PCM_SPEECH_HNR_DB} dB)")
    assert hnr_db >= _MIN_PCM_SPEECH_HNR_DB, (
        f"Audio distortion detected: HNR={hnr_db:.2f} dB < {_MIN_PCM_SPEECH_HNR_DB} dB. "
        "Voice clone decoder may be losing ref_code speaker context on later chunks."
    )
def assert_audio_speech_response(
    response: OmniResponse,
    request_config: dict[str, Any],
    run_level: str,
) -> None:
    """
    Validate a /v1/audio/speech response.

    Always checks that the request succeeded. For ``response_format == "pcm"``
    with audio bytes present, checks the int16 PCM HNR and, when a content type
    was captured, that it mentions "pcm". For ``"wav"`` with a captured content
    type, checks that it contains "wav". For the ``advanced_model`` run level
    and any non-PCM format, additionally checks transcript/input similarity
    (when the request has ``input``) and preset-voice gender.

    Args:
        response: Processed speech response.
        request_config: Request parameters (``response_format``, ``input``, ``voice``).
        run_level: Test run level.

    Raises:
        AssertionError: When any of the checks above fails.
    """
    assert response.success, "The request failed."

    requested = request_config.get("response_format")
    wants_pcm = requested == "pcm"

    if wants_pcm and response.audio_bytes:
        _assert_pcm_int16_speech_hnr(response.audio_bytes)
        if response.audio_format:
            assert "pcm" in response.audio_format.lower(), (
                f"Expected audio/pcm content-type, got {response.audio_format!r}"
            )
    elif requested == "wav" and response.audio_format:
        assert requested in response.audio_format, (
            f"The response audio format {response.audio_format} don't match the request audio format {requested}"
        )

    if response.e2e_latency is not None:
        print(f"the avg e2e latency is: {response.e2e_latency}")

    # The remaining checks need a Whisper transcript, which raw PCM does not have,
    # and only run at the advanced level.
    if run_level != "advanced_model" or wants_pcm:
        return

    expected_text = request_config.get("input")
    if expected_text:
        transcript = (response.audio_content or "").strip()
        print(f"audio content is: {transcript}")
        print(f"input text is: {expected_text}")
        similarity = cosine_similarity_text(transcript.lower(), expected_text.lower())
        print(f"Cosine similarity: {similarity:.3f}")
        assert similarity > 0.9, (
            f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'"
        )

    # Preset names come from ``_PRESET_VOICE_GENDER_MAP``; an 'unknown' estimate
    # is treated as inconclusive by the helper and does not fail the test.
    _assert_preset_voice_gender_from_audio(
        response.audio_bytes,
        request_config.get("voice"),
    )
def __init__(
    self, host: str = "127.0.0.1", port: int | None = None, api_key: str = "EMPTY", run_level: str | None = None
):
    """
    Initialize the OpenAI client.

    Args:
        host: vLLM-Omni server host address
        port: vLLM-Omni server port. When omitted, ``get_open_port()`` is
            called at construction time. (It is deliberately not used as the
            parameter default: a default expression is evaluated once when the
            class is defined, which would probe for a port at import time and
            share that single value across all instances.)
        api_key: API key (defaults to "EMPTY")
        run_level: Test run level, stored on the handler as ``self.run_level``.
    """
    if port is None:
        port = get_open_port()
    self.base_url = f"http://{host}:{port}"
    self.client = OpenAI(base_url=f"{self.base_url}/v1", api_key=api_key)
    self.run_level = run_level
+ + Args: + chat_completion: OpenAI streaming response object + request_config: Request configuration dictionary + + Returns: + OmniResponse: Processed response object + """ + result = OmniResponse() + start_time = time.perf_counter() + + try: + text_content = "" + audio_data = [] + + for chunk in chat_completion: + for choice in chunk.choices: + # Get content data + if hasattr(choice, "delta"): + content = getattr(choice.delta, "content", None) + else: + content = None + + # Get modality type + modality = getattr(chunk, "modality", None) + + # Process content based on modality type + if modality == "audio" and content: + audio_data.append(content) + elif modality == "text" and content: + text_content += content if content else "" + + # Calculate end-to-end latency + result.e2e_latency = time.perf_counter() - start_time + + # Process audio and text content + audio_content = None + similarity = None + + if audio_data or text_content: + if audio_data: + merged_seg = _merge_base64_audio_to_segment(audio_data) + wav_buf = BytesIO() + merged_seg.export(wav_buf, format="wav") + result.audio_bytes = wav_buf.getvalue() + audio_content = convert_audio_bytes_to_text(result.audio_bytes) + if audio_content and text_content: + similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) + + # Populate result object + result.text_content = text_content + result.audio_data = audio_data + result.audio_content = audio_content + result.similarity = similarity + result.success = True + + except Exception as e: + result.error_message = f"Stream processing error: {str(e)}" + print(f"Error: {result.error_message}") + + return result + + def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse: + """ + Process non-streaming responses. 
def _process_diffusion_response(self, chat_completion) -> DiffusionResponse:
    """
    Process diffusion responses (image generation/editing).

    Collects every ``data:image`` URL found in list-valued message content
    (items may be dicts or objects exposing ``image_url.url``) and decodes it
    with ``decode_b64_image``.

    Args:
        chat_completion: OpenAI response object

    Returns:
        DiffusionResponse: ``images`` holds the decoded images (None when no
        image was found) and ``e2e_latency`` the processing time. On any
        exception ``success`` stays False and ``error_message`` is set.
    """
    result = DiffusionResponse()
    start_time = time.perf_counter()

    try:
        images = []
        # [TODO] reading video and audio output from API response for later validation

        for choice in chat_completion.choices:
            content = getattr(choice.message, "content", None)
            # Only multi-part (list) content can carry image parts.
            if not isinstance(content, list):
                continue
            for item in content:
                if isinstance(item, dict):
                    # "or {}" also covers a present-but-null "image_url" entry.
                    image_url = (item.get("image_url") or {}).get("url")
                else:
                    image_url_obj = getattr(item, "image_url", None)
                    # getattr, not hasattr: hasattr accepts only two arguments and
                    # would raise TypeError here (and return a bool, not the URL).
                    image_url = getattr(image_url_obj, "url", None) if image_url_obj else None
                if isinstance(image_url, str) and image_url.startswith("data:image"):
                    b64_data = image_url.split(",", 1)[1]
                    images.append(decode_b64_image(b64_data))

        result.e2e_latency = time.perf_counter() - start_time
        result.images = images if images else None
        result.success = True

    except Exception as e:
        # Failures are reported through the result object rather than raised.
        result.error_message = f"Diffusion response processing error: {str(e)}"
        print(f"Error: {result.error_message}")

    return result
+ try: + iterator = iter(response) + except TypeError: + iterator = None + + if iterator is not None: + for chunk in iterator: + if not chunk: + continue + if isinstance(chunk, (bytes, bytearray)): + data.extend(chunk) + elif hasattr(chunk, "data"): + data.extend(chunk.data) # type: ignore[arg-type] + elif hasattr(chunk, "content"): + data.extend(chunk.content) # type: ignore[arg-type] + else: + raise TypeError(f"Unsupported stream chunk type: {type(chunk)}") + else: + raise TypeError(f"Unsupported audio speech streaming response type: {type(response)}") + + raw_bytes = bytes(data) + if response_format == "pcm": + transcript = None + else: + transcript = convert_audio_bytes_to_text(raw_bytes) + + # Populate OmniResponse. + result.audio_bytes = raw_bytes + result.audio_content = transcript + result.e2e_latency = time.perf_counter() - start_time + result.success = True + result.audio_format = getattr(response, "response", None) + if result.audio_format is not None: + result.audio_format = result.audio_format.headers.get("content-type", "") + + except Exception as e: + result.error_message = f"Audio speech stream processing error: {str(e)}" + print(f"Error: {result.error_message}") + + return result + + def _process_non_stream_audio_speech_response( + self, response, *, response_format: str | None = None + ) -> OmniResponse: + """ + Process non-streaming /v1/audio/speech responses into an OmniResponse. + + This mirrors _process_non_stream_omni_response but for the binary + audio payload returned by audio.speech.create. 
+ """ + result = OmniResponse() + start_time = time.perf_counter() + + try: + # OpenAI non-streaming audio.speech.create returns HttpxBinaryResponseContent (.read() or .content) + if hasattr(response, "read") and callable(getattr(response, "read")): + raw_bytes = response.read() + elif hasattr(response, "content"): + raw_bytes = response.content # type: ignore[assignment] + else: + raise TypeError(f"Unsupported audio speech response type: {type(response)}") + + if response_format == "pcm": + transcript = None + else: + transcript = convert_audio_bytes_to_text(raw_bytes) + + result.audio_bytes = raw_bytes + result.audio_content = transcript + result.e2e_latency = time.perf_counter() - start_time + result.success = True + result.audio_format = getattr(response, "response", None) + if result.audio_format is not None: + result.audio_format = result.audio_format.headers.get("content-type", "") + + except Exception as e: + result.error_message = f"Audio speech non-stream processing error: {str(e)}" + print(f"Error: {result.error_message}") + + return result + + def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: + """ + Send OpenAI requests. + + Args: + request_config: Request configuration dictionary containing parameters like model, messages, stream. + Optional ``use_audio_in_video`` (bool): when true, sets + ``extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}`` for Qwen-Omni video+audio + extraction. + Optional top-level ``speaker`` (str): Qwen3-Omni preset TTS speaker name; sent as + ``extra_body["speaker"]`` to ``chat.completions.create``. 
+ request_num: Number of requests, defaults to 1 (single request) + + Returns: + List[OmniResponse]: List of response objects + """ + + responses = [] + stream = request_config.get("stream", False) + modalities = request_config.get("modalities", ["text", "audio"]) + + extra_body: dict[str, Any] = {} + if "speaker" in request_config: + extra_body["speaker"] = request_config["speaker"] + if request_config.get("use_audio_in_video"): + mm = dict(extra_body.get("mm_processor_kwargs") or {}) + mm["use_audio_in_video"] = True + extra_body["mm_processor_kwargs"] = mm + extra_body_arg: dict[str, Any] | None = extra_body if extra_body else None + + create_kwargs: dict[str, Any] = { + "model": request_config.get("model"), + "messages": request_config.get("messages"), + "stream": stream, + "modalities": modalities, + } + if extra_body_arg is not None: + create_kwargs["extra_body"] = extra_body_arg + + if request_num == 1: + # Send single request + chat_completion = self.client.chat.completions.create(**create_kwargs) + + if stream: + response = self._process_stream_omni_response(chat_completion) + else: + response = self._process_non_stream_omni_response(chat_completion) + + assert_omni_response(response, request_config, run_level=self.run_level) + responses.append(response) + + else: + # Send concurrent requests: run create + process in worker so e2e_latency includes full round-trip. 
+ def _one_omni_request(): + start = time.perf_counter() + worker_kwargs: dict[str, Any] = { + "model": request_config.get("model"), + "messages": request_config.get("messages"), + "modalities": modalities, + "stream": stream, + } + if extra_body_arg is not None: + worker_kwargs["extra_body"] = extra_body_arg + chat_completion = self.client.chat.completions.create(**worker_kwargs) + if stream: + response = self._process_stream_omni_response(chat_completion) + else: + response = self._process_non_stream_omni_response(chat_completion) + response.e2e_latency = time.perf_counter() - start + return response + + with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: + futures = [executor.submit(_one_omni_request) for _ in range(request_num)] + for future in concurrent.futures.as_completed(futures): + response = future.result() + assert_omni_response(response, request_config, run_level=self.run_level) + responses.append(response) + + return responses + + def send_audio_speech_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: + """ + Call the /v1/audio/speech endpoint using the same configuration-dict + style as send_omni_request, but via the OpenAI Python client's + audio.speech APIs. + + Expected keys in request_config: + - model: model name/path (required) + - input: text to synthesize (required) + - response_format: audio format such as "wav" or "pcm" (optional) + - task_type, ref_text, ref_audio: TTS-specific extras (optional, passed via extra_body) + - timeout: request timeout in seconds (float, optional, default 120.0) + - stream: whether to use streaming API (bool, optional, default False) + """ + timeout = float(request_config.get("timeout", 120.0)) + + model = request_config["model"] + text_input = request_config["input"] + stream = bool(request_config.get("stream", False)) + voice = request_config.get("voice", None) + + # Standard OpenAI param: use omit when not provided to keep default behavior. 
+ response_format = request_config.get("response_format", omit) + + # Qwen3-TTS custom fields, forwarded via extra_body. + extra_body: dict[str, Any] = {} + # Keep this list aligned with vllm_omni.entrypoints.openai.protocol.audio params. + for key in ("task_type", "ref_text", "ref_audio", "language", "max_new_tokens"): + if key in request_config: + extra_body[key] = request_config[key] + + responses: list[OmniResponse] = [] + + speech_fmt: str | None = None if response_format is omit else str(response_format).lower() + + if request_num == 1: + if stream: + # Use streaming response helper. + with self.client.audio.speech.with_streaming_response.create( + model=model, + input=text_input, + response_format=response_format, + extra_body=extra_body or None, + timeout=timeout, + voice=voice, + ) as resp: + omni_resp = self._process_stream_audio_speech_response(resp, response_format=speech_fmt) + else: + # Non-streaming response. + resp = self.client.audio.speech.create( + model=model, + input=text_input, + response_format=response_format, + extra_body=extra_body or None, + timeout=timeout, + voice=voice, + ) + omni_resp = self._process_non_stream_audio_speech_response(resp, response_format=speech_fmt) + + assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) + responses.append(omni_resp) + return responses + else: + # request_num > 1: concurrent requests (use same params as single-request path) + + if stream: + + def _stream_task(): + with self.client.audio.speech.with_streaming_response.create( + model=model, + input=text_input, + response_format=response_format, + extra_body=extra_body or None, + timeout=timeout, + voice=voice, + ) as resp: + return self._process_stream_audio_speech_response(resp, response_format=speech_fmt) + + with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: + futures = [executor.submit(_stream_task) for _ in range(request_num)] + for future in concurrent.futures.as_completed(futures): + 
omni_resp = future.result() + assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) + responses.append(omni_resp) + else: + with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: + futures = [] + for _ in range(request_num): + future = executor.submit( + self.client.audio.speech.create, + model=model, + input=text_input, + response_format=response_format, + extra_body=extra_body or None, + timeout=timeout, + voice=voice, + ) + futures.append(future) + + for future in concurrent.futures.as_completed(futures): + resp = future.result() + omni_resp = self._process_non_stream_audio_speech_response(resp, response_format=speech_fmt) + assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) + responses.append(omni_resp) + + return responses + + def send_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: + """ + Send OpenAI requests for diffusion models. + + Args: + request_config: Request configuration dictionary containing parameters like model, messages + request_num: Number of requests to send concurrently, defaults to 1 (single request) + Returns: + List[OmniResponse]: List of response objects + """ + responses = [] + stream = request_config.get("stream", False) + modalities = request_config.get("modalities", omit) # Most diffusion models don't require modalities param + extra_body = request_config.get("extra_body", None) + + if stream: + raise NotImplementedError("Streaming is not currently implemented for diffusion model e2e test") + + if request_num == 1: + # Send single request + chat_completion = self.client.chat.completions.create( + model=request_config.get("model"), + messages=request_config.get("messages"), + extra_body=extra_body, + modalities=modalities, + ) + + response = self._process_diffusion_response(chat_completion) + assert_diffusion_response(response, request_config, run_level=self.run_level) + responses.append(response) + 
+ else: + # Send concurrent requests + with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: + futures = [] + + # Submit all request tasks + for _ in range(request_num): + future = executor.submit( + self.client.chat.completions.create, + model=request_config.get("model"), + messages=request_config.get("messages"), + modalities=modalities, + extra_body=extra_body, + ) + futures.append(future) + + # Process completed tasks + for future in concurrent.futures.as_completed(futures): + chat_completion = future.result() + response = self._process_diffusion_response(chat_completion) + assert_diffusion_response(response, request_config, run_level=self.run_level) + responses.append(response) + + return responses + + def send_video_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: + """ + Send native /v1/videos requests. + """ + if request_num != 1: + raise NotImplementedError("Concurrent video diffusion requests are not currently implemented") + + if request_config.get("stream", False): + raise NotImplementedError("Streaming is not currently implemented for video diffusion e2e test") + + form_data = request_config.get("form_data") + if not isinstance(form_data, dict): + raise ValueError("Video request_config must contain 'form_data'") + + if not form_data.get("prompt"): + raise ValueError("Video request_config['form_data'] must contain 'prompt'") + + normalized_form_data = {key: str(value) for key, value in form_data.items() if value is not None} + + files: dict[str, tuple[str, BytesIO, str]] = {} + image_reference = request_config.get("image_reference") + if image_reference: + if image_reference.startswith("data:image"): + header, encoded = image_reference.split(",", 1) + content_type = header.split(";")[0].removeprefix("data:") + extension = content_type.split("/")[-1] + file_data = base64.b64decode(encoded) + + files["input_reference"] = ( + f"reference.{extension}", + BytesIO(file_data), + 
content_type, + ) + else: + normalized_form_data["image_reference"] = json.dumps({"image_url": image_reference}) + + result = DiffusionResponse() + start_time = time.perf_counter() + + try: + create_url = self._build_url("/v1/videos") + response = requests.post( + create_url, + data=normalized_form_data, + files=files, + headers={"Accept": "application/json"}, + timeout=60, + ) + response.raise_for_status() + + job_data = response.json() + video_id = job_data["id"] + + self._wait_until_video_completed(video_id) + + video_content = self._download_video_content(video_id) + + result.success = True + result.videos = [video_content] + result.e2e_latency = time.perf_counter() - start_time + + assert_diffusion_response(result, request_config, run_level=self.run_level) + + except Exception as e: + result.success = False + result.error_message = f"Diffusion response processing error: {e}" + assert False, result.error_message + + return [result] + + def _wait_until_video_completed( + self, + video_id: str, + poll_interval_seconds: int = 2, + timeout_seconds: int = 300, + ) -> None: + status_url = self._build_url(f"/v1/videos/{video_id}") + deadline = time.monotonic() + timeout_seconds + + while time.monotonic() < deadline: + status_resp = requests.get( + status_url, + headers={"Accept": "application/json"}, + timeout=30, + ) + status_resp.raise_for_status() + + status_data = status_resp.json() + current_status = status_data["status"] + + if current_status == "completed": + return + + if current_status == "failed": + error_msg = status_data.get("last_error", "Unknown error") + raise RuntimeError(f"Job failed: {error_msg}") + + time.sleep(poll_interval_seconds) + + raise TimeoutError(f"Video job {video_id} did not complete within {timeout_seconds}s") + + def _download_video_content(self, video_id: str) -> bytes: + download_url = self._build_url(f"/v1/videos/{video_id}/content") + video_resp = requests.get(download_url, stream=True, timeout=60) + video_resp.raise_for_status() + 
+ video_bytes = BytesIO() + for chunk in video_resp.iter_content(chunk_size=8192): + if chunk: + video_bytes.write(chunk) + + return video_bytes.getvalue() + + def _build_url(self, path: str) -> str: + return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" + + +@pytest.fixture +def openai_client(omni_server: OmniServer, run_level: str): + """Create OpenAIClientHandler fixture to facilitate communication with OmniServer + with encapsulated request sending, concurrent requests, response handling, and validation.""" + return OpenAIClientHandler(host=omni_server.host, port=omni_server.port, api_key="EMPTY", run_level=run_level) + + +class OmniRunner: + """ + Offline test runner for Omni models. + """ + + def __init__( + self, + model_name: str, + seed: int = 42, + stage_init_timeout: int = 300, + batch_timeout: int = 10, + init_timeout: int = 300, + shm_threshold_bytes: int = 65536, + log_stats: bool = False, + stage_configs_path: str | None = None, + **kwargs, + ) -> None: + """ + Initialize an OmniRunner for testing. 
+ + Args: + model_name: The model name or path + seed: Random seed for reproducibility + stage_init_timeout: Timeout for initializing a single stage in seconds + batch_timeout: Timeout for batching in seconds + init_timeout: Timeout for initializing stages in seconds + shm_threshold_bytes: Threshold for using shared memory + log_stats: Enable detailed statistics logging + stage_configs_path: Optional path to YAML stage config file + **kwargs: Additional arguments passed to Omni + """ + cleanup_dist_env_and_memory() + _run_pre_test_cleanup(enable_force=True) + _run_post_test_cleanup(enable_force=True) + self.model_name = model_name + self.seed = seed + + self.omni = Omni( + model=model_name, + log_stats=log_stats, + stage_init_timeout=stage_init_timeout, + batch_timeout=batch_timeout, + init_timeout=init_timeout, + shm_threshold_bytes=shm_threshold_bytes, + stage_configs_path=stage_configs_path, + **kwargs, + ) + + def _estimate_prompt_len( + self, + additional_information: dict[str, Any], + model_name: str, + _cache: dict[str, Any] = {}, + ) -> int: + """Estimate prompt_token_ids placeholder length for the Talker stage. + + The AR Talker replaces all input embeddings via ``preprocess``, so the + placeholder values are irrelevant but the **length** must match the + embeddings that ``preprocess`` will produce. 
+ """ + try: + from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import Qwen3TTSConfig + from vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_talker import ( + Qwen3TTSTalkerForConditionalGeneration, + ) + + if model_name not in _cache: + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left") + cfg = Qwen3TTSConfig.from_pretrained(model_name, trust_remote_code=True) + _cache[model_name] = (tok, getattr(cfg, "talker_config", None)) + + tok, tcfg = _cache[model_name] + task_type = (additional_information.get("task_type") or ["CustomVoice"])[0] + return Qwen3TTSTalkerForConditionalGeneration.estimate_prompt_len_from_additional_information( + additional_information=additional_information, + task_type=task_type, + tokenize_prompt=lambda t: tok(t, padding=False)["input_ids"], + codec_language_id=getattr(tcfg, "codec_language_id", None), + spk_is_dialect=getattr(tcfg, "spk_is_dialect", None), + ) + except Exception as exc: + logger.warning("Failed to estimate prompt length, using fallback 2048: %s", exc) + return 2048 + + def get_default_sampling_params_list(self) -> list[OmniSamplingParams]: + """ + Get a list of default sampling parameters for all stages. + + Returns: + List of SamplingParams with default decoding for each stage + """ + if not hasattr(self.omni, "default_sampling_params_list"): + raise AttributeError("Omni.default_sampling_params_list is not available") + return list(self.omni.default_sampling_params_list) + + def get_omni_inputs( + self, + prompts: list[str] | str, + system_prompt: str | None = None, + audios: PromptAudioInput = None, + images: PromptImageInput = None, + videos: PromptVideoInput = None, + mm_processor_kwargs: dict[str, Any] | None = None, + modalities: list[str] | None = None, + ) -> list[TextPrompt]: + """ + Construct Omni input format from prompts and multimodal data. 
+ + Args: + prompts: Text prompt(s) - either a single string or list of strings + system_prompt: Optional system prompt (defaults to Qwen system prompt) + audios: Audio input(s) - tuple of (audio_array, sample_rate) or list of tuples + images: Image input(s) - PIL Image or list of PIL Images + videos: Video input(s) - numpy array or list of numpy arrays + mm_processor_kwargs: Optional processor kwargs (e.g., use_audio_in_video) + + Returns: + List of prompt dictionaries suitable for Omni.generate() + """ + if system_prompt is None: + system_prompt = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech." + ) + + video_padding_token = "<|VIDEO|>" + image_padding_token = "<|IMAGE|>" + audio_padding_token = "<|AUDIO|>" + + if "Qwen3-Omni-30B-A3B-Instruct" in self.model_name: + video_padding_token = "<|video_pad|>" + image_padding_token = "<|image_pad|>" + audio_padding_token = "<|audio_pad|>" + + if isinstance(prompts, str): + prompts = [prompts] + + # Qwen-TTS: follow examples/offline_inference/qwen3_tts/end2end.py style. + # Stage 0 expects token placeholders + additional_information (text/speaker/task_type/...), + # and Talker replaces embeddings in preprocess based on additional_information only. 
+ is_tts_model = "Qwen3-TTS" in self.model_name or "qwen3_tts" in self.model_name.lower() + if is_tts_model and modalities == ["audio"]: + tts_kw = mm_processor_kwargs or {} + task_type = tts_kw.get("task_type", "CustomVoice") + speaker = tts_kw.get("speaker", "Vivian") + language = tts_kw.get("language", "Auto") + max_new_tokens = int(tts_kw.get("max_new_tokens", 2048)) + ref_audio = tts_kw.get("ref_audio", None) + ref_text = tts_kw.get("ref_text", None) + + omni_inputs: list[TextPrompt] = [] + for prompt_text in prompts: + text_str = str(prompt_text).strip() or " " + additional_information: dict[str, Any] = { + "task_type": [task_type], + "text": [text_str], + "language": [language], + "speaker": [speaker], + "max_new_tokens": [max_new_tokens], + } + if ref_audio is not None: + additional_information["ref_audio"] = [ref_audio] + if ref_text is not None: + additional_information["ref_text"] = [ref_text] + # Use official helper to get correct placeholder length + plen = self._estimate_prompt_len(additional_information, self.model_name) + input_dict: TextPrompt = { + "prompt_token_ids": [0] * plen, + "additional_information": additional_information, + } + omni_inputs.append(input_dict) + return omni_inputs + + def _normalize_mm_input(mm_input, num_prompts): + if mm_input is None: + return [None] * num_prompts + if isinstance(mm_input, list): + if len(mm_input) != num_prompts: + raise ValueError( + f"Multimodal input list length ({len(mm_input)}) must match prompts length ({num_prompts})" + ) + return mm_input + return [mm_input] * num_prompts + + num_prompts = len(prompts) + audios_list = _normalize_mm_input(audios, num_prompts) + images_list = _normalize_mm_input(images, num_prompts) + videos_list = _normalize_mm_input(videos, num_prompts) + + omni_inputs = [] + for i, prompt_text in enumerate(prompts): + user_content = "" + multi_modal_data = {} + + audio = audios_list[i] + if audio is not None: + if isinstance(audio, list): + for _ in audio: + user_content += 
f"<|audio_bos|>{audio_padding_token}<|audio_eos|>" + multi_modal_data["audio"] = audio + else: + user_content += f"<|audio_bos|>{audio_padding_token}<|audio_eos|>" + multi_modal_data["audio"] = audio + + image = images_list[i] + if image is not None: + if isinstance(image, list): + for _ in image: + user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>" + multi_modal_data["image"] = image + else: + user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>" + multi_modal_data["image"] = image + + video = videos_list[i] + if video is not None: + if isinstance(video, list): + for _ in video: + user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>" + multi_modal_data["video"] = video + else: + user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>" + multi_modal_data["video"] = video + + user_content += prompt_text + + full_prompt = ( + f"<|im_start|>system\n{system_prompt}<|im_end|>\n" + f"<|im_start|>user\n{user_content}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) + + input_dict: TextPrompt = {"prompt": full_prompt} + if multi_modal_data: + input_dict["multi_modal_data"] = multi_modal_data + if modalities: + input_dict["modalities"] = modalities + if mm_processor_kwargs: + input_dict["mm_processor_kwargs"] = mm_processor_kwargs + + omni_inputs.append(input_dict) + + return omni_inputs + + def generate( + self, + prompts: list[TextPrompt], + sampling_params_list: list[OmniSamplingParams] | None = None, + ) -> list[OmniRequestOutput]: + """ + Generate outputs for the given prompts. + + Args: + prompts: List of prompt dictionaries with 'prompt' and optionally + 'multi_modal_data' keys + sampling_params_list: List of sampling parameters for each stage. + If None, uses default parameters. 
+ + Returns: + List of OmniRequestOutput objects from stages with final_output=True + """ + if sampling_params_list is None: + sampling_params_list = self.get_default_sampling_params_list() + + return self.omni.generate(prompts, sampling_params_list) + + def generate_multimodal( + self, + prompts: list[str] | str, + sampling_params_list: list[OmniSamplingParams] | None = None, + system_prompt: str | None = None, + audios: PromptAudioInput = None, + images: PromptImageInput = None, + videos: PromptVideoInput = None, + mm_processor_kwargs: dict[str, Any] | None = None, + modalities: list[str] | None = None, + ) -> list[OmniRequestOutput]: + """ + Convenience method to generate with multimodal inputs. + + Args: + prompts: Text prompt(s) + sampling_params_list: List of sampling parameters for each stage + system_prompt: Optional system prompt + audios: Audio input(s) + images: Image input(s) + videos: Video input(s) + mm_processor_kwargs: Optional processor kwargs + + Returns: + List of OmniRequestOutput objects from stages with final_output=True + """ + omni_inputs = self.get_omni_inputs( + prompts=prompts, + system_prompt=system_prompt, + audios=audios, + images=images, + videos=videos, + mm_processor_kwargs=mm_processor_kwargs, + modalities=modalities, + ) + return self.generate(omni_inputs, sampling_params_list) + + def start_profile( + self, + profile_prefix: str | None = None, + stages: list[int] | None = None, + ) -> list[Any]: + """Start profiling specified stages. + + Args: + profile_prefix: Optional prefix for the trace file names. + stages: List of stage IDs to profile. If None, profiles all stages. + + Returns: + List of results from each stage. + """ + return self.omni.start_profile(profile_prefix=profile_prefix, stages=stages) + + def stop_profile(self, stages: list[int] | None = None) -> list[Any]: + """Stop profiling specified stages. + + Args: + stages: List of stage IDs to profile. If None, stops all stages. 
+ + Returns: + List of results from each stage. + """ + return self.omni.stop_profile(stages=stages) + + def _cleanup_process(self): + try: + keywords = ["enginecore"] + matched = [] + + for proc in psutil.process_iter(["pid", "name", "cmdline", "username"]): + try: + cmdline = " ".join(proc.cmdline()).lower() if proc.cmdline() else "" + name = proc.name().lower() + + is_process = any(keyword in cmdline for keyword in keywords) or any( + keyword in name for keyword in keywords + ) + + if is_process: + print(f"Found vllm process: PID={proc.pid}, cmd={cmdline[:100]}") + matched.append(proc) + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + for proc in matched: + try: + proc.terminate() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + _, still_alive = psutil.wait_procs(matched, timeout=5) + for proc in still_alive: + try: + proc.kill() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + if still_alive: + _, stubborn = psutil.wait_procs(still_alive, timeout=3) + if stubborn: + print(f"Warning: failed to kill residual vllm pids: {[p.pid for p in stubborn]}") + else: + print(f"Force-killed residual vllm pids: {[p.pid for p in still_alive]}") + elif matched: + print(f"Terminated vllm pids: {[p.pid for p in matched]}") + + except Exception as e: + print(f"Error in psutil vllm cleanup: {e}") + + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - cleanup resources.""" + if hasattr(self.omni, "close"): + self.omni.close() + self._cleanup_process() + _run_pre_test_cleanup(enable_force=True) + _run_post_test_cleanup(enable_force=True) + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="module") +def omni_runner(request, model_prefix): + with _omni_server_lock: + model, stage_config_path = request.param + model = model_prefix + model + with OmniRunner(model, seed=42, stage_configs_path=stage_config_path, stage_init_timeout=300) as 
runner: + print("OmniRunner started successfully") + yield runner + print("OmniRunner stopping...") + + print("OmniRunner stopped") + + +class OmniRunnerHandler: + def __init__(self, omni_runner): + self.runner = omni_runner + + def _process_output(self, outputs: list[Any]) -> OmniResponse: + result = OmniResponse() + try: + text_content = None + audio_content = None + for stage_output in outputs: + if getattr(stage_output, "final_output_type", None) == "text": + text_content = stage_output.request_output.outputs[0].text + if getattr(stage_output, "final_output_type", None) == "audio": + audio_content = stage_output.request_output.outputs[0].multimodal_output["audio"] + + result.audio_content = audio_content + result.text_content = text_content + result.success = True + + except Exception as e: + result.error_message = f"Output processing error: {str(e)}" + result.success = False + print(f"Error: {result.error_message}") + + return result + + def send_request(self, request_config: dict[str, Any] | None = None) -> OmniResponse: + if request_config is None: + request_config = {} + prompts = request_config.get("prompts") + videos = request_config.get("videos") + images = request_config.get("images") + audios = request_config.get("audios") + modalities = request_config.get("modalities", ["text", "audio"]) + outputs = self.runner.generate_multimodal( + prompts=prompts, videos=videos, images=images, audios=audios, modalities=modalities + ) + response = self._process_output(outputs) + assert_omni_response(response, request_config, run_level="core_model") + return response + + def send_audio_speech_request( + self, + request_config: dict[str, Any], + ) -> OmniResponse: + """ + Offline TTS: text -> audio via generate_multimodal, then validate with assert_audio_speech_response. + + request_config must contain: + - 'input' or 'prompts': text to synthesize. 
+ Optional keys: + - 'voice' -> speaker (CustomVoice) + - 'task_type' -> task_type in additional_information (default: "CustomVoice") + - 'language' -> language in additional_information (default: "Auto") + - 'max_new_tokens' -> max_new_tokens in additional_information (default: 2048) + - 'response_format' -> desired audio format (used only for assertion) + """ + input_text = request_config.get("input") or request_config.get("prompts") + if input_text is None: + raise ValueError("request_config must contain 'input' or 'prompts' for TTS") + if isinstance(input_text, list): + input_text = input_text[0] if input_text else "" + + # Build TTS-specific kwargs passed through to get_omni_inputs for Qwen3-TTS, + # matching examples/offline_inference/qwen3_tts/end2end.py. + mm_processor_kwargs: dict[str, Any] = {} + if "voice" in request_config: + mm_processor_kwargs["speaker"] = request_config["voice"] + if "task_type" in request_config: + mm_processor_kwargs["task_type"] = request_config["task_type"] + if "ref_audio" in request_config: + mm_processor_kwargs["ref_audio"] = request_config["ref_audio"] + if "ref_text" in request_config: + mm_processor_kwargs["ref_text"] = request_config["ref_text"] + if "language" in request_config: + mm_processor_kwargs["language"] = request_config["language"] + if "max_new_tokens" in request_config: + mm_processor_kwargs["max_new_tokens"] = request_config["max_new_tokens"] + + outputs = self.runner.generate_multimodal( + prompts=input_text, + modalities=["audio"], + mm_processor_kwargs=mm_processor_kwargs or None, + ) + mm_out: dict[str, Any] | None = None + for stage_out in outputs: + if getattr(stage_out, "final_output_type", None) == "audio": + mm_out = stage_out.request_output.outputs[0].multimodal_output + break + if mm_out is None: + result = OmniResponse(success=False, error_message="No audio output from pipeline") + assert result.success, result.error_message + return result + + audio_data = mm_out.get("audio") + if audio_data is 
None: + result = OmniResponse(success=False, error_message="No audio tensor in multimodal output") + assert result.success, result.error_message + return result + + sr_raw = mm_out.get("sr") + sr_val = sr_raw[-1] if isinstance(sr_raw, list) and sr_raw else sr_raw + sr = int(sr_val.item() if hasattr(sr_val, "item") else sr_val) + wav_tensor = torch.cat(audio_data, dim=-1) if isinstance(audio_data, list) else audio_data + wav_buf = io.BytesIO() + sf.write( + wav_buf, + wav_tensor.float().cpu().numpy().reshape(-1), + samplerate=sr, + format="WAV", + subtype="PCM_16", + ) + result = OmniResponse(success=True, audio_bytes=wav_buf.getvalue(), audio_format="audio/wav") + assert_audio_speech_response(result, request_config, run_level="core_model") + return result + + def start_profile( + self, + profile_prefix: str | None = None, + stages: list[int] | None = None, + ) -> list[Any]: + """Start profiling specified stages.""" + return self.runner.start_profile(profile_prefix=profile_prefix, stages=stages) + + def stop_profile(self, stages: list[int] | None = None) -> list[Any]: + """Stop profiling specified stages.""" + return self.runner.stop_profile(stages=stages) + + +@pytest.fixture +def omni_runner_handler(omni_runner): + return OmniRunnerHandler(omni_runner) diff --git a/tests/core/sched/test_chunk_scheduling_coordinator.py b/tests/core/sched/test_chunk_scheduling_coordinator.py deleted file mode 100644 index 5e19465e224..00000000000 --- a/tests/core/sched/test_chunk_scheduling_coordinator.py +++ /dev/null @@ -1,690 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for OmniSchedulingCoordinator (formerly ChunkSchedulingCoordinator). - -These tests use mock request objects and mock queues. They do not require -GPU, vLLM runtime, or any connector. 
-""" - -from __future__ import annotations - -import unittest -from types import SimpleNamespace - -import vllm_omni.core.sched.omni_scheduling_coordinator as coord_mod -from vllm_omni.core.sched.omni_scheduling_coordinator import ( - ChunkSchedulingCoordinator, - OmniSchedulingCoordinator, -) - -# ------------------------------------------------------------------ # -# Mock helpers -# ------------------------------------------------------------------ # - - -class _RequestStatus: - WAITING = "waiting" - RUNNING = "running" - WAITING_FOR_CHUNK = "waiting_for_chunk" - WAITING_FOR_INPUT = "waiting_for_input" - FINISHED_STOPPED = "finished_stopped" - - -# Patch RequestStatus for tests that don't import vllm -try: - from vllm.v1.request import RequestStatus -except ImportError: - RequestStatus = _RequestStatus # type: ignore[misc,assignment] - -if not hasattr(RequestStatus, "WAITING_FOR_INPUT"): - coord_mod.RequestStatus = _RequestStatus # type: ignore[assignment] - RequestStatus = _RequestStatus # type: ignore[misc,assignment] - - -def _make_request(req_id: str, status: str = "waiting") -> SimpleNamespace: - return SimpleNamespace( - request_id=req_id, - external_req_id=req_id, - status=status, - additional_information=None, - prompt_token_ids=[], - num_prompt_tokens=0, - num_computed_tokens=0, - _all_token_ids=[], - _output_token_ids=[], - ) - - -class MockQueue: - """Simplified queue that mimics the Scheduler waiting queue interface.""" - - def __init__(self, items: list | None = None): - self._items: list = list(items or []) - - def __iter__(self): - return iter(self._items) - - def __len__(self): - return len(self._items) - - def __contains__(self, item): - return item in self._items - - def add_request(self, request): - self._items.append(request) - - def prepend_requests(self, requests): - self._items = list(requests) + self._items - - def remove(self, request): - self._items.remove(request) - - def remove_requests(self, requests): - remove_set = set(id(r) for r 
in requests) - self._items = [r for r in self._items if id(r) not in remove_set] - - -# ------------------------------------------------------------------ # -# Tests -# ------------------------------------------------------------------ # - - -class TestChunkCoordinatorStateTransition(unittest.TestCase): - """Test 5: process_pending_chunks transitions WAITING_FOR_CHUNK → target.""" - - def test_ready_request_transitions_to_waiting(self): - coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) - - req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_chunks( - waiting, - running, - chunk_ready_req_ids={"r1"}, - chunk_finished_req_ids=set(), - ) - - self.assertEqual(req.status, RequestStatus.WAITING) - self.assertIn("r1", coord.requests_with_ready_chunks) - - def test_non_ready_stays_waiting_for_chunk(self): - coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) - - req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_chunks( - waiting, - running, - chunk_ready_req_ids=set(), - chunk_finished_req_ids=set(), - ) - - self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) - - def test_stage_0_is_noop(self): - coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=0) - req = _make_request("r1") - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_chunks( - waiting, - running, - chunk_ready_req_ids={"r1"}, - chunk_finished_req_ids=set(), - ) - self.assertNotEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) - - -class TestChunkCoordinatorRestoreQueues(unittest.TestCase): - """Test 6: restore_queues returns waiting-for-chunk requests.""" - - def test_restore(self): - coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) - - r1 = _make_request("r1") - r2 = 
_make_request("r2") - coord._waiting_for_chunk_waiting.append(r1) - coord._waiting_for_chunk_running.append(r2) - - waiting = MockQueue() - running: list = [] - - coord.restore_queues(waiting, running) - - self.assertIn(r1, waiting) - self.assertIn(r2, running) - self.assertEqual(len(coord._waiting_for_chunk_waiting), 0) - self.assertEqual(len(coord._waiting_for_chunk_running), 0) - - -class TestChunkCoordinatorFinishedSignal(unittest.TestCase): - """Test 8: chunk_finished_req_ids → finished_requests.""" - - def test_finished_signal(self): - coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) - - req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_chunks( - waiting, - running, - chunk_ready_req_ids={"r1"}, - chunk_finished_req_ids={"r1"}, - ) - - self.assertIn("r1", coord.finished_requests) - - -class TestChunkCoordinatorUpdateRequestMetadata(unittest.TestCase): - """Test update_request_metadata applies scheduling metadata to requests.""" - - def test_ar_mode_no_longer_sets_additional_information(self): - """AR mode only processes scheduling metadata, not full payloads.""" - coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) - - req = _make_request("r1") - requests = {"r1": req} - - # Only scheduling metadata is passed now (full payload stays in model runner) - request_metadata = {"r1": {"next_stage_prompt_len": 50}} - - coord.update_request_metadata(requests, request_metadata, model_mode="ar") - - # next_stage_prompt_len should update prompt_token_ids - self.assertEqual(len(req.prompt_token_ids), 50) - self.assertEqual(req.num_prompt_tokens, 50) - # additional_information should NOT be set - self.assertIsNone(getattr(req, "additional_information", None)) - - def test_generation_mode(self): - coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) - - req = _make_request("r1") - 
req.prompt_token_ids = [0, 0, 0] - requests = {"r1": req} - - request_metadata = { - "r1": { - "code_predictor_codes": [10, 20, 30], - "left_context_size": 25, - } - } - - coord.update_request_metadata(requests, request_metadata, model_mode="generation") - - self.assertEqual(req.prompt_token_ids, [10, 20, 30]) - self.assertEqual(req.num_computed_tokens, 0) - self.assertIsNone(req.additional_information) - self.assertEqual(req._omni_initial_model_buffer, {"left_context_size": 25}) - - -class TestChunkCoordinatorPostprocess(unittest.TestCase): - """Test postprocess_scheduler_output clears ready chunks.""" - - def test_clear_ready(self): - coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) - coord.requests_with_ready_chunks = {"r1", "r2"} - - new_req = SimpleNamespace(req_id="r1") - cached_reqs = SimpleNamespace(req_ids=["r2"]) - scheduler_output = SimpleNamespace( - scheduled_new_reqs=[new_req], - scheduled_cached_reqs=cached_reqs, - ) - - coord.postprocess_scheduler_output(scheduler_output) - - self.assertEqual(coord.requests_with_ready_chunks, set()) - - -class TestWaitingForInputTransition(unittest.TestCase): - """Test B8: process_pending_full_payload_inputs transitions WAITING_FOR_INPUT.""" - - def test_transition_on_recv(self): - coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) - - req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_full_payload_inputs( - waiting, - running, - stage_recv_req_ids={"r1"}, - ) - - self.assertEqual(req.status, RequestStatus.WAITING) - - def test_stays_waiting_for_input_if_not_received(self): - coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) - - req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_full_payload_inputs( - waiting, - running, - stage_recv_req_ids=set(), - ) - - 
self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) - self.assertEqual(len(coord._waiting_for_input), 1) - - def test_stage_0_is_noop(self): - coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=0) - - req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_full_payload_inputs( - waiting, - running, - stage_recv_req_ids={"r1"}, - ) - self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) - - def test_restore_queues_includes_waiting_for_input(self): - coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) - - r1 = _make_request("r1") - coord._waiting_for_input.append(r1) - - waiting = MockQueue() - running: list = [] - - coord.restore_queues(waiting, running) - - self.assertIn(r1, waiting) - self.assertEqual(len(coord._waiting_for_input), 0) - - def test_full_payload_mode_auto_transitions_waiting_to_waiting_for_input(self): - """In full_payload_mode (async_chunk=False), fresh WAITING requests on - non-Stage-0 should be transitioned to WAITING_FOR_INPUT.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - async_chunk=False, - ) - - req = _make_request("r1", status=RequestStatus.WAITING) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_full_payload_inputs( - waiting, - running, - stage_recv_req_ids=set(), - ) - - self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) - self.assertEqual(len(coord._waiting_for_input), 1) - self.assertEqual(len(coord.pending_input_registrations), 1) - - def test_async_chunk_mode_does_not_auto_transition(self): - """In async_chunk mode, fresh WAITING requests should NOT be - transitioned to WAITING_FOR_INPUT.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - async_chunk=True, - ) - - req = _make_request("r1", status=RequestStatus.WAITING) - waiting = MockQueue([req]) - running: list = [] - - 
coord.process_pending_full_payload_inputs( - waiting, - running, - stage_recv_req_ids=set(), - ) - - self.assertEqual(req.status, RequestStatus.WAITING) - - def test_pending_input_registrations(self): - coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) - - req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_full_payload_inputs( - waiting, - running, - stage_recv_req_ids=set(), - ) - - self.assertEqual(len(coord.pending_input_registrations), 1) - self.assertEqual(coord.pending_input_registrations[0].request_id, "r1") - - -class TestTimeoutDetection(unittest.TestCase): - """Regression tests for orphaned pending-recv timeout detection. - - Covers the full lifecycle: - 1. Request enters WAITING_FOR_CHUNK from either waiting or running queue - 2. restore_queues() moves it back to the scheduler queue - 3. Timeout fires via collect_timed_out_request_ids() - 4. Scheduler removes from both queues and calls _free_request() - """ - - def test_waiting_since_recorded_on_chunk_wait(self): - """_waiting_since is set when a request enters WAITING_FOR_CHUNK.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - async_chunk=True, - ) - req = _make_request("r1", status=RequestStatus.WAITING) - waiting = MockQueue([req]) - - coord.process_pending_chunks( - waiting, - [], - chunk_ready_req_ids=set(), - chunk_finished_req_ids=set(), - ) - - self.assertIn("r1", coord._waiting_since) - self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) - - def test_waiting_since_cleared_on_chunk_arrival(self): - """_waiting_since is cleared when a chunk arrives.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - async_chunk=True, - ) - req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) - waiting = MockQueue([req]) - - coord.process_pending_chunks( - waiting, - [], - chunk_ready_req_ids={"r1"}, - 
chunk_finished_req_ids=set(), - ) - - self.assertNotIn("r1", coord._waiting_since) - - def test_waiting_since_recorded_on_input_wait(self): - """_waiting_since is set when a request enters WAITING_FOR_INPUT.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - async_chunk=False, - ) - req = _make_request("r1", status=RequestStatus.WAITING) - waiting = MockQueue([req]) - - coord.process_pending_full_payload_inputs( - waiting, - [], - stage_recv_req_ids=set(), - ) - - self.assertIn("r1", coord._waiting_since) - - def test_waiting_since_cleared_on_input_arrival(self): - """_waiting_since is cleared when input data arrives.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - async_chunk=False, - ) - req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) - coord._waiting_for_input.append(req) - coord._waiting_since["r1"] = 0.0 - - waiting = MockQueue() - coord.process_pending_full_payload_inputs( - waiting, - [], - stage_recv_req_ids={"r1"}, - ) - - self.assertNotIn("r1", coord._waiting_since) - self.assertEqual(req.status, RequestStatus.WAITING) - - def test_collect_timed_out_request_ids_no_timeout(self): - """No IDs returned when nothing has timed out.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - ) - import time - - coord._waiting_since["r1"] = time.monotonic() - - result = coord.collect_timed_out_request_ids(timeout_s=300.0) - self.assertEqual(result, set()) - - def test_collect_timed_out_request_ids_expired(self): - """Timed-out IDs are returned and _waiting_since is cleared.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - ) - coord._waiting_since["r1"] = 0.0 # epoch → definitely expired - coord._waiting_since["r2"] = 0.0 - - import time - - coord._waiting_since["r3"] = time.monotonic() + 9999 # far future - - result = coord.collect_timed_out_request_ids(timeout_s=1.0) - - self.assertEqual(result, {"r1", "r2"}) - 
self.assertNotIn("r1", coord._waiting_since) - self.assertNotIn("r2", coord._waiting_since) - self.assertIn("r3", coord._waiting_since) - - def test_collect_removes_from_coordinator_queues(self): - """Timed-out requests are defensively removed from internal queues.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - ) - r1 = _make_request("r1") - r2 = _make_request("r2") - coord._waiting_for_chunk_waiting.append(r1) - coord._waiting_for_input.append(r2) - coord._waiting_since["r1"] = 0.0 - coord._waiting_since["r2"] = 0.0 - - result = coord.collect_timed_out_request_ids(timeout_s=1.0) - - self.assertEqual(result, {"r1", "r2"}) - self.assertEqual(len(coord._waiting_for_chunk_waiting), 0) - self.assertEqual(len(coord._waiting_for_input), 0) - - def test_free_finished_request_clears_waiting_since(self): - """free_finished_request clears _waiting_since.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - ) - coord._waiting_since["r1"] = 0.0 - coord.free_finished_request("r1") - self.assertNotIn("r1", coord._waiting_since) - - def test_timeout_from_running_queue_full_lifecycle(self): - """End-to-end: request from running → WAITING_FOR_CHUNK → restore → - timeout → removed from running list. - - This is the critical regression case: WAITING_FOR_CHUNK requests - that originated from self.running are placed back into self.running - by restore_queues(), but their status remains WAITING_FOR_CHUNK. - The scheduler must remove from BOTH queues unconditionally. 
- """ - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - async_chunk=True, - ) - - # 1) Request starts in running queue with WAITING status - req = _make_request("r1", status=RequestStatus.WAITING) - running = [req] - waiting = MockQueue() - - # 2) process_pending_chunks: moves to WAITING_FOR_CHUNK - coord.process_pending_chunks( - waiting, - running, - chunk_ready_req_ids=set(), - chunk_finished_req_ids=set(), - ) - self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) - self.assertIn("r1", coord._waiting_since) - self.assertEqual(len(coord._waiting_for_chunk_running), 1) - - # 3) restore_queues: back to running (status stays WAITING_FOR_CHUNK) - coord.restore_queues(waiting, running) - self.assertIn(req, running) - self.assertEqual(len(coord._waiting_for_chunk_running), 0) - self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) - - # 4) Force timeout by setting _waiting_since to epoch - coord._waiting_since["r1"] = 0.0 - - timed_out_ids = coord.collect_timed_out_request_ids(timeout_s=1.0) - self.assertEqual(timed_out_ids, {"r1"}) - - # 5) Scheduler removes from both queues (simulating the scheduler path) - timed_out_id_set = {id(req)} - running = [r for r in running if id(r) not in timed_out_id_set] - waiting.remove_requests([req]) - - self.assertNotIn(req, running) - self.assertEqual(len(waiting), 0) - - def test_timeout_from_waiting_queue_full_lifecycle(self): - """End-to-end: request from waiting → WAITING_FOR_CHUNK → restore → - timeout → removed from waiting queue.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=10, - stage_id=1, - async_chunk=True, - ) - - req = _make_request("r1", status=RequestStatus.WAITING) - waiting = MockQueue([req]) - running: list = [] - - coord.process_pending_chunks( - waiting, - running, - chunk_ready_req_ids=set(), - chunk_finished_req_ids=set(), - ) - self.assertEqual(len(coord._waiting_for_chunk_waiting), 1) - - coord.restore_queues(waiting, running) - 
self.assertIn(req, waiting) - - coord._waiting_since["r1"] = 0.0 - timed_out_ids = coord.collect_timed_out_request_ids(timeout_s=1.0) - self.assertEqual(timed_out_ids, {"r1"}) - - waiting.remove_requests([req]) - self.assertEqual(len(waiting), 0) - - -class TestOverflowPreemption(unittest.TestCase): - """Tests for P1-1: overflow requests must get WAITING status. - - Overflow happens when multiple WAITING_FOR_CHUNK requests in - ``_waiting_for_chunk_running`` receive their chunk in the same cycle. - ``_process_chunk_queue`` restores them to RUNNING (``continue`` - path) while RUNNING requests without chunks are moved out. If the - net result exceeds ``scheduler_max_num_seqs``, the tail is pushed - to ``waiting_queue`` and must have status == WAITING. - """ - - def test_overflow_sets_waiting_status(self): - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=1, - stage_id=1, - async_chunk=True, - ) - - # r1 is currently RUNNING in the queue. - # r2, r3 were previously moved to _waiting_for_chunk_running. 
- r1 = _make_request("r1", status=RequestStatus.RUNNING) - r2 = _make_request("r2", status=RequestStatus.WAITING_FOR_CHUNK) - r3 = _make_request("r3", status=RequestStatus.WAITING_FOR_CHUNK) - - running = [r1] - waiting = MockQueue([]) - coord._waiting_for_chunk_running.extend([r2, r3]) - - # restore_queues puts r2, r3 back into running - coord.restore_queues(waiting, running) - self.assertEqual(len(running), 3) - - # Now process_pending_chunks with r2, r3 chunks ready: - # _process_chunk_queue will: - # r1 (RUNNING) → no chunk → move to _waiting_for_chunk_running - # r2 (WAITING_FOR_CHUNK, chunk ready) → set RUNNING, stay in running - # r3 (WAITING_FOR_CHUNK, chunk ready) → set RUNNING, stay in running - # running = [r2, r3], len=2 > max=1 → overflow - coord.process_pending_chunks( - waiting, - running, - chunk_ready_req_ids={"r2", "r3"}, - chunk_finished_req_ids=set(), - ) - - self.assertEqual(len(running), 1) - self.assertEqual(len(waiting), 1) - overflow_req = list(waiting)[0] - self.assertEqual( - overflow_req.status, - RequestStatus.WAITING, - f"Overflowed request should have WAITING status, got {overflow_req.status}", - ) - - def test_overflow_does_not_strand_request(self): - """Without the fix, the overflowed request would keep its - RUNNING status in the waiting queue and never be re-scheduled.""" - coord = OmniSchedulingCoordinator( - scheduler_max_num_seqs=1, - stage_id=1, - async_chunk=True, - ) - - r1 = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) - r2 = _make_request("r2", status=RequestStatus.WAITING_FOR_CHUNK) - coord._waiting_for_chunk_running.extend([r1, r2]) - - running: list = [] - waiting = MockQueue([]) - - coord.restore_queues(waiting, running) - self.assertEqual(len(running), 2) - - coord.process_pending_chunks( - waiting, - running, - chunk_ready_req_ids={"r1", "r2"}, - chunk_finished_req_ids=set(), - ) - - self.assertEqual(len(running), 1) - self.assertEqual(len(waiting), 1) - for req in waiting: - 
self.assertNotEqual(req.status, RequestStatus.RUNNING, "Overflowed request must not keep RUNNING status") - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/core/sched/test_generation_scheduler_restore.py b/tests/core/sched/test_generation_scheduler_restore.py index 5cc1cab7025..0eae3c4db91 100644 --- a/tests/core/sched/test_generation_scheduler_restore.py +++ b/tests/core/sched/test_generation_scheduler_restore.py @@ -6,12 +6,9 @@ those requests are permanently orphaned. """ +import unittest from collections import deque -import pytest - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - class FakeAdapter: """Minimal mock of OmniChunkTransferAdapter tracking restore calls.""" @@ -38,7 +35,7 @@ def postprocess_scheduler_output(self, output): pass -class TestRestoreQueuesOnError: +class TestRestoreQueuesOnError(unittest.TestCase): """Verify that restore_queues is called even when rewrapping raises.""" def test_requests_not_lost_on_exception(self): @@ -51,8 +48,8 @@ def test_requests_not_lost_on_exception(self): # Step 1: process_pending_chunks moves req-B out adapter.process_pending_chunks(waiting=[], running=running) - assert running == ["req-A"] - assert len(adapter.waiting_for_chunk_running_requests) == 1 + self.assertEqual(running, ["req-A"]) + self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 1) # Step 2: simulate the try/except/finally pattern try: @@ -64,9 +61,9 @@ def test_requests_not_lost_on_exception(self): adapter.restore_queues(waiting=[], running=running) # Step 3: verify request is restored - assert adapter.restore_called is True - assert "req-B" in running - assert len(adapter.waiting_for_chunk_running_requests) == 0 + self.assertTrue(adapter.restore_called) + self.assertIn("req-B", running) + self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 0) def test_requests_lost_without_fix(self): """Demonstrate the bug: without restore in except, request is lost.""" @@ -75,7 +72,7 @@ def 
test_requests_lost_without_fix(self): running = ["req-A", "req-B"] adapter.process_pending_chunks(waiting=[], running=running) - assert running == ["req-A"] + self.assertEqual(running, ["req-A"]) # Simulate the BUGGY code: except without restore try: @@ -84,8 +81,8 @@ def test_requests_lost_without_fix(self): pass # Bug: no restore_queues call # Request is lost! - assert "req-B" not in running - assert len(adapter.waiting_for_chunk_running_requests) == 1 + self.assertNotIn("req-B", running) + self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 1) def test_happy_path_restores_via_finally(self): """When no exception, restore_queues is still called via finally.""" @@ -101,5 +98,9 @@ def test_happy_path_restores_via_finally(self): finally: adapter.restore_queues(waiting=[], running=running) - assert adapter.restore_called is True - assert "req-B" in running + self.assertTrue(adapter.restore_called) + self.assertIn("req-B", running) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/core/sched/test_omni_scheduler_mixin.py b/tests/core/sched/test_omni_scheduler_mixin.py deleted file mode 100644 index e04a9c39fbc..00000000000 --- a/tests/core/sched/test_omni_scheduler_mixin.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Unit tests for OmniSchedulerMixin streaming session replacement. - -These tests pin the behavior of `_replace_session_with_streaming_update` against -current vLLM `Request` / `StreamingUpdate` (and Omni patches). When upgrading -vLLM, failures here should highlight incompatible changes to request state or -update payloads early. -""" - -from __future__ import annotations - -from dataclasses import replace - -import pytest - -# Imports must run in this order: vllm_omni applies patches to vllm.v1.request before -# Request / StreamingUpdate are bound in this module. Ruff isort would reorder them. 
-# isort: off -import vllm_omni # noqa: F401 - import for side effects (patch vLLM) -from vllm.sampling_params import SamplingParams -from vllm.v1.engine import EngineCoreEventType -from vllm.v1.request import Request, RequestStatus, StreamingUpdate -from vllm_omni.core.sched.omni_scheduler_mixin import OmniSchedulerMixin - -# isort: on - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -class _SchedulerStub(OmniSchedulerMixin): - """Minimal scheduler surface required by OmniSchedulerMixin.""" - - def __init__(self, *, log_stats: bool = False) -> None: - self.num_waiting_for_streaming_input = 0 - self.log_stats = log_stats - - -def _make_request(**kwargs) -> Request: - sp = SamplingParams(max_tokens=8) - defaults = dict( - request_id="req-mixin-test", - prompt_token_ids=[1, 2, 3], - sampling_params=sp, - pooling_params=None, - arrival_time=100.0, - block_hasher=None, - ) - defaults.update(kwargs) - return Request(**defaults) - - -def _make_update(**kwargs) -> StreamingUpdate: - sp_new = SamplingParams(max_tokens=16) - defaults = dict( - mm_features=None, - prompt_token_ids=[10, 20], - max_tokens=32, - arrival_time=200.0, - sampling_params=sp_new, - ) - defaults.update(kwargs) - return StreamingUpdate(**defaults) - - -class TestReplaceSessionWithStreamingUpdate: - def test_resets_tokens_and_prompt_from_update(self) -> None: - sched = _SchedulerStub() - session = _make_request() - session.append_output_token_ids([7, 8]) - session.num_computed_tokens = 99 - session.status = RequestStatus.WAITING_FOR_STREAMING_REQ - - update = _make_update(prompt_token_ids=[40, 41, 42]) - sched.num_waiting_for_streaming_input = 3 - sched._replace_session_with_streaming_update(session, update) - - assert session._output_token_ids == [] - assert list(session._all_token_ids) == [40, 41, 42] - assert session.prompt_token_ids == [40, 41, 42] - assert session.num_computed_tokens == 0 - assert session.num_prompt_tokens == 3 - assert session.arrival_time == 200.0 - assert 
session.sampling_params is update.sampling_params - assert session.status == RequestStatus.WAITING - assert sched.num_waiting_for_streaming_input == 2 - - def test_none_prompt_token_ids_becomes_empty(self) -> None: - sched = _SchedulerStub() - session = _make_request() - session.status = RequestStatus.RUNNING - update = _make_update(prompt_token_ids=None) - sched._replace_session_with_streaming_update(session, update) - - assert session.prompt_token_ids == () - assert list(session._all_token_ids) == [] - assert session.num_prompt_tokens == 0 - assert sched.num_waiting_for_streaming_input == 0 - - def test_additional_information_cleared_when_update_omits_it(self) -> None: - sched = _SchedulerStub() - session = _make_request() - if not hasattr(session, "additional_information"): - pytest.skip("Request has no additional_information (Omni patch inactive?)") - session.additional_information = {"keep": True} - session.status = RequestStatus.RUNNING - - base = _make_update() - if not hasattr(base, "additional_information"): - pytest.skip("StreamingUpdate has no additional_information (Omni patch inactive?)") - update = replace(base, additional_information=None) - - sched._replace_session_with_streaming_update(session, update) - assert session.additional_information is None - - def test_does_not_decrement_waiting_when_not_streaming_status(self) -> None: - sched = _SchedulerStub() - session = _make_request() - session.status = RequestStatus.RUNNING - sched.num_waiting_for_streaming_input = 5 - sched._replace_session_with_streaming_update(session, _make_update()) - assert sched.num_waiting_for_streaming_input == 5 - - def test_records_queued_event_when_log_stats_enabled(self) -> None: - sched = _SchedulerStub(log_stats=True) - session = _make_request() - session.status = RequestStatus.WAITING_FOR_STREAMING_REQ - sched._replace_session_with_streaming_update(session, _make_update()) - - assert session.events - assert session.events[-1].type == EngineCoreEventType.QUEUED diff 
--git a/tests/core/test_prefix_cache.py b/tests/core/test_prefix_cache.py deleted file mode 100644 index b5d0e96d305..00000000000 --- a/tests/core/test_prefix_cache.py +++ /dev/null @@ -1,349 +0,0 @@ -import pytest -import torch - -from vllm_omni.core.prefix_cache import OmniTensorPrefixCache - -DEFAULT_SEQ_LEN = 15 -NUM_BLOCKS = 10 -BLOCK_SIZE = 4 -HIDDEN_SIZE = 2 -DTYPE = torch.float32 -OTHER_DTYPE = torch.float16 -DEFAULT_SHAPE = torch.Size([NUM_BLOCKS, BLOCK_SIZE, HIDDEN_SIZE]) - - -class MockInputBatch: - def __init__(self, num_computed_tokens_cpu): - self.req_ids = ["req1", "req2"] - self.req_id_to_index = {req_id: i for i, req_id in enumerate(self.req_ids)} - self.num_computed_tokens_cpu = num_computed_tokens_cpu - - # Block table is only mocked for validation of length; - # we don't actually need to add valid values here since - # we patch the table when testing. - class _DummyBlockTable: - pass - - self.block_table = _DummyBlockTable() - self.block_table.block_tables = [None] - - -def get_omni_pcache_with_mm_tensors(feat_dims, seq_len) -> OmniTensorPrefixCache: - """Build an OmniTensorPrefixCache and init mm tensors.""" - cache = get_omni_pcache() - mm_outputs = get_multimodal_outputs(feat_dims, seq_len) - cache.maybe_init_missing_mm_cache_keys(mm_outputs, seq_len) - return cache - - -def get_omni_pcache() -> OmniTensorPrefixCache: - """Build an OmniTensorPrefixCache, but don't init mm tensors.""" - cache = OmniTensorPrefixCache( - num_blocks=NUM_BLOCKS, - block_size=BLOCK_SIZE, - hidden_size=HIDDEN_SIZE, - hs_dtype=DTYPE, - ) - return cache - - -def get_multimodal_outputs(feat_dims: dict[str, int], seq_len: int) -> dict[str, torch.Tensor]: - fake_mm_inputs = {} - for mm_key, feat_dim in feat_dims.items(): - fake_mm_inputs[mm_key] = torch.rand((seq_len, feat_dim), dtype=DTYPE) - return fake_mm_inputs - - -### Tests for initialization -def test_initialization_simple(): - """Check default initialization only creates the hidden states.""" - cache = 
get_omni_pcache() - assert isinstance(cache.hidden_states_cache, torch.Tensor) - assert cache.hidden_states_cache.shape == DEFAULT_SHAPE - assert len(cache.mm_outputs_cache) == 0 - assert len(cache.mm_cache_keys) == 0 - - -def test_initialization_with_multimodal(): - """Check initialization + registration of multimodal outputs.""" - cache = get_omni_pcache() - feat_dims = {"foo": 100, "bar": 50, "baz": 10} - mm_outputs = get_multimodal_outputs( - feat_dims, - seq_len=DEFAULT_SEQ_LEN, - ) - # Cast one of the keys to a different dtype; the dtype of the tensor - # that is used to initialize the cache dictates the cache dtype. - mm_outputs["foo"] = mm_outputs["foo"].to(OTHER_DTYPE) - - cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) - assert len(cache.mm_cache_keys) == 3 - assert set(cache.mm_cache_keys) == set(feat_dims.keys()) - for mm_key in cache.mm_cache_keys: - cache_tensor = cache.mm_outputs_cache[mm_key] - assert isinstance(cache_tensor, torch.Tensor) - assert cache_tensor.shape[-1] == feat_dims[mm_key] - assert mm_outputs[mm_key].dtype == cache_tensor.dtype - - -def test_init_missing_mm_cache_keys_is_idempotent(): - """Ensure that the cache doesn't reinitialize old keys.""" - cache = get_omni_pcache() - mm_key = "foo" - feat_dims = {mm_key: 100} - mm_outputs = get_multimodal_outputs( - feat_dims, - seq_len=DEFAULT_SEQ_LEN, - ) - cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) - assert len(cache.mm_cache_keys) == 1 - assert mm_key in cache.mm_cache_keys - - # Cache is initialized to 0 - fill it with 1s - cache.mm_outputs_cache[mm_key].fill_(1) - - # Ensure that running another initialization - # doesn't zero out our cache values - cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) - assert len(cache.mm_cache_keys) == 1 - assert mm_key in cache.mm_cache_keys - assert torch.all(cache.mm_outputs_cache[mm_key] == 1) - - -### Tests for Update -def test_update_no_multimodal(): - """Test that slot mappings act 
as row indices hidden states.""" - cache = get_omni_pcache() - - num_tokens_unpadded = 8 - slot_offset = 8 - slot_mapping = torch.arange(slot_offset, slot_offset + num_tokens_unpadded) - new_hidden_states = torch.rand((num_tokens_unpadded, HIDDEN_SIZE), dtype=DTYPE) - - cache.update_omni_tensor_prefix_cache( - hidden_states=new_hidden_states, - multimodal_outputs=None, - num_tokens_unpadded=num_tokens_unpadded, - slot_mapping=slot_mapping, - ) - - # Ensure that if we reshape our 3D cache back to 2D, we can use the - # indices in our slot mappings to access the hidden states as expected - hs_rows = cache.hidden_states_cache.view(NUM_BLOCKS * BLOCK_SIZE, HIDDEN_SIZE) - for slot_idx, new_states in zip(slot_mapping, new_hidden_states): - slot_states = hs_rows[slot_idx] - assert torch.all(slot_states == new_states) - - -@pytest.mark.parametrize( - "feat_dims", - [ - {"foo": 100, "bar": 100}, - {"foo": 100, "bar": 50, "baz": 10}, - ], -) -def test_update_with_multimodal_outputs(feat_dims): - """Test that slot mappings are correct for multimodal tensors.""" - cache = get_omni_pcache_with_mm_tensors(feat_dims, seq_len=DEFAULT_SEQ_LEN) - - num_tokens_unpadded = 8 - slot_offset = 8 - slot_mapping = torch.arange(slot_offset, slot_offset + num_tokens_unpadded) - feature_dims = {key: val.shape[-1] for key, val in cache.mm_outputs_cache.items()} - mm_outputs = {key: torch.rand((num_tokens_unpadded, feature_dims[key]), dtype=DTYPE) for key in cache.mm_cache_keys} - cache.update_omni_tensor_prefix_cache( - hidden_states=None, - multimodal_outputs=mm_outputs, - num_tokens_unpadded=num_tokens_unpadded, - slot_mapping=slot_mapping, - ) - - for mm_key in feat_dims.keys(): - assert mm_key in cache.mm_outputs_cache - key_feat_dim = feature_dims[mm_key] - mm_state_rows = cache.mm_outputs_cache[mm_key].view(NUM_BLOCKS * BLOCK_SIZE, key_feat_dim) - - # Similar to hidden states, but for each key in the dict; - # Different tensors may have different feature dims - new_mm_outputs = 
mm_outputs[mm_key] - for slot_idx, new_output in zip(slot_mapping, new_mm_outputs): - slot_states = mm_state_rows[slot_idx] - assert torch.all(slot_states == new_output) - - -### Tests for Merging -def fake_get_cached_block_ids(self, req_idx, *args, **kwargs): - """Fake block table lookup. - - Assumption: - req_idx 0 is a cache hit with slots 8, 9, ..., 15 - req_idx 1 is a cache miss - """ - assert req_idx < 2 - if req_idx == 0: - # With the slot offset we provided (8), the corresponding - # blocks IDs are 2 & 3 because the block size is 4. - return torch.tensor([2, 3], dtype=torch.long) - return torch.tensor([], dtype=torch.long) - - -@pytest.mark.parametrize("num_tokens_padded", [None, 16]) -def test_get_merged_hidden_states(num_tokens_padded, mocker): - """Ensure that hidden states are merged correctly.""" - cache = get_omni_pcache() - - orig_num_tokens_unpadded = 8 - slot_offset = 8 # We'll put our states in slots 8, 9, 10, ..., 15 - orig_slot_mapping = torch.arange(slot_offset, slot_offset + orig_num_tokens_unpadded) - orig_hidden_states = torch.rand((orig_num_tokens_unpadded, HIDDEN_SIZE), dtype=DTYPE) - - cache.update_omni_tensor_prefix_cache( - hidden_states=orig_hidden_states, - multimodal_outputs=None, - num_tokens_unpadded=orig_num_tokens_unpadded, - slot_mapping=orig_slot_mapping, - num_tokens_padded=num_tokens_padded, - ) - - # Say that we have two requests, but only one of them is a cache hit - num_new_toks_req1 = 3 - num_new_toks_req2 = 2 - cache.add_prefix_cached_new_req_id("req1") - - num_scheduled_tokens = { - "req1": num_new_toks_req1, - "req2": num_new_toks_req2, - } - new_hidden_states = torch.rand( - (num_new_toks_req1 + num_new_toks_req2, HIDDEN_SIZE), - dtype=DTYPE, - ) - req1_new_states = new_hidden_states[:num_new_toks_req1] - req2_new_states = new_hidden_states[-num_new_toks_req2:] - - input_batch = MockInputBatch(num_computed_tokens_cpu=torch.Tensor([orig_num_tokens_unpadded, 0])) - - mocker.patch( - 
"vllm_omni.core.prefix_cache.OmniTensorPrefixCache._get_cached_block_ids", - new=fake_get_cached_block_ids, - ) - merged_states = cache.get_merged_hidden_states( - query_start_loc=[0, num_new_toks_req1], - input_batch=input_batch, - hidden_states=new_hidden_states, - num_scheduled_tokens=num_scheduled_tokens, - ) - - assert "req1" in merged_states and "req2" in merged_states - req1_merged_states = merged_states["req1"] - req2_merged_states = merged_states["req2"] - - # First, check the cache hit case - assert req1_merged_states.shape == torch.Size([orig_num_tokens_unpadded + num_new_toks_req1, HIDDEN_SIZE]) - # Ensure that the req1 merged states are the cached states + the new req1 states - assert torch.all(req1_merged_states[:orig_num_tokens_unpadded] == orig_hidden_states) - assert torch.all(req1_merged_states[-num_new_toks_req1:] == req1_new_states) - - # Next, ensure that the cache miss case only has the new states - assert req2_merged_states.shape == torch.Size([num_new_toks_req2, HIDDEN_SIZE]) - assert torch.all(req2_merged_states == req2_new_states) - - -@pytest.mark.parametrize("num_tokens_padded", [None, 16]) -@pytest.mark.parametrize( - "feat_dims", - [ - {"foo": 100, "bar": 100}, - {"foo": 100, "bar": 50, "baz": 10}, - ], -) -def test_get_merged_multimodal_outputs(feat_dims, num_tokens_padded, mocker): - cache = get_omni_pcache_with_mm_tensors(feat_dims, seq_len=DEFAULT_SEQ_LEN) - - orig_num_tokens_unpadded = 8 - slot_offset = 8 # We'll put our states in slots 8, 9, 10, ..., 15 - orig_slot_mapping = torch.arange(slot_offset, slot_offset + orig_num_tokens_unpadded) - feature_dims = {key: val.shape[-1] for key, val in cache.mm_outputs_cache.items()} - orig_mm_outputs = { - key: torch.rand((orig_num_tokens_unpadded, feature_dims[key]), dtype=DTYPE) for key in cache.mm_cache_keys - } - - cache.update_omni_tensor_prefix_cache( - hidden_states=None, - multimodal_outputs=orig_mm_outputs, - num_tokens_unpadded=orig_num_tokens_unpadded, - 
slot_mapping=orig_slot_mapping, - num_tokens_padded=num_tokens_padded, - ) - - # Similar to hs test- say that we have two requests, but only one of them is a cache hit - num_new_toks_req1 = 3 - num_new_toks_req2 = 2 - cache.add_prefix_cached_new_req_id("req1") - - num_scheduled_tokens = { - "req1": num_new_toks_req1, - "req2": num_new_toks_req2, - } - - new_mm_outputs = {} - for mm_key in cache.mm_cache_keys: - new_mm_outputs[mm_key] = torch.rand( - (num_new_toks_req1 + num_new_toks_req2, feature_dims[mm_key]), - dtype=DTYPE, - ) - # We also want to make sure passthrough data (outside of our keys) isn't dropped - new_mm_outputs["passthrough_data"] = "Something else" - # Lists are a special case because we can't split them yet if we want to match - # the nonprefix cache behavior, because this runs before post process. - new_mm_outputs["passthrough_list"] = ["should", "not", "split"] - - input_batch = MockInputBatch(num_computed_tokens_cpu=torch.Tensor([orig_num_tokens_unpadded, 0])) - - mocker.patch( - "vllm_omni.core.prefix_cache.OmniTensorPrefixCache._get_cached_block_ids", - new=fake_get_cached_block_ids, - ) - merged_mm_outputs = cache.get_merged_multimodal_states( - query_start_loc=[0, num_new_toks_req1], - input_batch=input_batch, - multimodal_outputs=new_mm_outputs, - num_scheduled_tokens=num_scheduled_tokens, - ) - - # Ensure the passthrough data wasn't dropped - assert "passthrough_data" in merged_mm_outputs - assert "passthrough_list" in merged_mm_outputs - - for mm_key, mm_output in merged_mm_outputs.items(): - # Ensure passthrough data is just forwarded normally and not duplicated - assert isinstance(mm_output, dict) - assert "req1" in mm_output and "req2" in mm_output - if mm_key == "passthrough_data": - assert mm_key not in cache.mm_cache_keys - assert new_mm_outputs[mm_key] == mm_output["req1"] - assert new_mm_outputs[mm_key] == mm_output["req2"] - elif mm_key == "passthrough_list": - assert mm_key not in cache.mm_cache_keys - assert 
new_mm_outputs[mm_key] == mm_output["req1"] - assert new_mm_outputs[mm_key] == mm_output["req2"] - else: - assert mm_key in cache.mm_cache_keys - curr_feat_dim = feature_dims[mm_key] - # Ensure that req1 (cache hit) merged the mm data - req1_merged_mm_outputs = mm_output["req1"] - req1_new_mm_outputs = new_mm_outputs[mm_key][:num_new_toks_req1] - - assert req1_merged_mm_outputs.shape == torch.Size( - [orig_num_tokens_unpadded + num_new_toks_req1, curr_feat_dim] - ) - # Ensure that the req1 merged mm data are the cached data + the new data - assert torch.all(req1_merged_mm_outputs[:orig_num_tokens_unpadded] == orig_mm_outputs[mm_key]) - assert torch.all(req1_merged_mm_outputs[-num_new_toks_req1:] == req1_new_mm_outputs) - - # Ensure that req2 (cache miss) only has the new mm data - req2_merged_mm_outputs = mm_output["req2"] - req2_new_mm_outputs = new_mm_outputs[mm_key][-num_new_toks_req2:] - - assert req2_merged_mm_outputs.shape == torch.Size([num_new_toks_req2, curr_feat_dim]) - assert torch.all(req2_merged_mm_outputs == req2_new_mm_outputs) diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py index 12eb8e6f1b5..e54141b3442 100644 --- a/tests/dfx/conftest.py +++ b/tests/dfx/conftest.py @@ -1,13 +1,8 @@ import json -import os -import subprocess -from datetime import datetime from pathlib import Path from typing import Any -import pytest - -from tests.helpers.stage_config import modify_stage_config +from tests.conftest import modify_stage_config def load_configs(config_path: str) -> list[dict[str, Any]]: @@ -40,70 +35,25 @@ def modify_stage(default_path, updates, deletes): return path -def _build_serve_args(serve_args: Any) -> list[str]: - """Convert server_params.serve_args to a flat CLI args list.""" - if serve_args is None: - return [] - if isinstance(serve_args, list): - return [str(item) for item in serve_args] - if not isinstance(serve_args, dict): - raise TypeError(f"serve_args must be dict/list/None, got {type(serve_args).__name__}") - - args: 
list[str] = [] - for key, value in serve_args.items(): - flag = f"--{str(key).replace('_', '-')}" - if isinstance(value, bool): - if value: - args.append(flag) - continue - if value is None: - continue - if isinstance(value, (dict, list)): - args.extend([flag, json.dumps(value, ensure_ascii=False, separators=(",", ":"))]) - continue - args.extend([flag, str(value)]) - return args - - def create_unique_server_params( configs: list[dict[str, Any]], stage_configs_dir: Path, -) -> list[tuple[str, str, str | None, str | None, tuple[str, ...]]]: - """Return one row per unique server configuration (same 5-tuple shape as upstream). - - ``(test_name, model, deploy_yaml_path, stage_overrides_json, extra_cli_args)``. - - JSON ``server_params.serve_args`` (dict/list) is expanded via ``_build_serve_args`` - and **prepended** to ``extra_cli_args`` so perf / stability ``omni_server`` fixtures - stay identical to main while still honoring ``serve_args`` in benchmark JSON. - """ - unique_params: list[tuple[str, str, str | None, str | None, tuple[str, ...]]] = [] - seen: set[tuple[str, str, str | None, str | None, tuple[str, ...]]] = set() +) -> list[tuple[str, str, str]]: + unique_params = [] + seen = set() for config in configs: test_name = config["test_name"] - server_params = config["server_params"] - model = server_params["model"] - stage_config_name = server_params.get("stage_config_name") + model = config["server_params"]["model"] + stage_config_name = config["server_params"].get("stage_config_name") if stage_config_name: stage_config_path = str(stage_configs_dir / stage_config_name) - delete = server_params.get("delete", None) - update = server_params.get("update", None) + delete = config["server_params"].get("delete", None) + update = config["server_params"].get("update", None) stage_config_path = modify_stage(stage_config_path, update, delete) else: stage_config_path = None - stage_overrides = server_params.get("stage_overrides") - stage_overrides_json = 
json.dumps(stage_overrides) if stage_overrides else None - - # ``extra_cli_args`` passes raw CLI flags straight through to - # ``vllm_omni.entrypoints.cli.main serve`` — used for flags that - # don't map to stage-level overrides, e.g. ``--async-chunk`` / - # ``--no-async-chunk`` toggling the deploy-level async_chunk bool. - serve_flat = _build_serve_args(server_params.get("serve_args")) - raw_extra = tuple(server_params.get("extra_cli_args") or ()) - extra_cli_args = tuple(serve_flat) + raw_extra - - server_param = (test_name, model, stage_config_path, stage_overrides_json, extra_cli_args) + server_param = (test_name, model, stage_config_path) if server_param not in seen: seen.add(server_param) unique_params.append(server_param) @@ -120,11 +70,7 @@ def create_test_parameter_mapping(configs: list[dict[str, Any]]) -> dict[str, di "test_name": test_name, "benchmark_params": [], } - for entry in config["benchmark_params"]: - # Skip disabled entries - if not entry.get("enabled", True): - continue - mapping[test_name]["benchmark_params"].append(entry) + mapping[test_name]["benchmark_params"].extend(config["benchmark_params"]) return mapping @@ -149,146 +95,3 @@ def create_benchmark_indices( indices.append((test_name, idx)) return indices - - -def _safe_filename_token(value: Any | None, *, default: str = "na") -> str: - """Make a single path segment safe for result filenames on common filesystems.""" - if value is None: - return default - s = str(value).strip() - for bad in ("/", "\\", ":", "*", "?", '"', "<", ">", "|"): - s = s.replace(bad, "_") - return s if s else default - - -def _resolve_baseline_value( - baseline_raw: Any, - *, - sweep_index: int | None, - max_concurrency: Any = None, - request_rate: Any = None, -) -> Any: - """Pick the baseline threshold for this sweep step.""" - if baseline_raw is None: - return 100000 - if isinstance(baseline_raw, dict): - if max_concurrency is not None: - for key in (max_concurrency, str(max_concurrency)): - if key in 
baseline_raw: - return baseline_raw[key] - if request_rate is not None: - for key in (request_rate, str(request_rate)): - if key in baseline_raw: - return baseline_raw[key] - raise KeyError( - f"baseline dict has no key for max_concurrency={max_concurrency!r} " - f"or request_rate={request_rate!r}; keys={list(baseline_raw.keys())!r}" - ) - if isinstance(baseline_raw, (list, tuple)): - if sweep_index is None: - raise ValueError("list baseline requires sweep_index") - if not (0 <= sweep_index < len(baseline_raw)): - raise IndexError(f"baseline list len={len(baseline_raw)} has no index {sweep_index}") - return baseline_raw[sweep_index] - return baseline_raw - - -def _baseline_thresholds_for_step( - baseline_data: dict[str, Any], - *, - sweep_index: int | None = None, - max_concurrency: Any = None, - request_rate: Any = None, -) -> dict[str, Any]: - """Resolve baseline config to one threshold per metric for this iteration.""" - return { - metric_name: _resolve_baseline_value( - baseline_raw, - sweep_index=sweep_index, - max_concurrency=max_concurrency, - request_rate=request_rate, - ) - for metric_name, baseline_raw in baseline_data.items() - } - - -def run_benchmark( - args: list[str], - test_name: str, - flow: Any, - dataset_name: str, - num_prompt: int, - *, - baseline_config: dict[str, Any] | None = None, - sweep_index: int | None = None, - request_rate: Any | None = None, - max_concurrency: Any | None = None, - random_input_len: Any | None = None, - random_output_len: Any | None = None, -) -> dict[str, Any]: - """Run one ``vllm bench serve --omni`` iteration and return parsed metrics.""" - current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - ri = _safe_filename_token(random_input_len) - ro = _safe_filename_token(random_output_len) - result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_in{ri}_out{ro}_{current_dt}.json" - if "--result-filename" in args: - print(f"The result file will be overwritten by {result_filename}") - command = ( - 
["vllm", "bench", "serve", "--omni"] - + args - + [ - "--num-warmups", - "2", - "--save-result", - "--result-dir", - os.environ.get("BENCHMARK_DIR", "tests"), - "--result-filename", - result_filename, - ] - ) - process = subprocess.Popen( - command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1, universal_newlines=True - ) - - for line in iter(process.stdout.readline, ""): - print(line, end=" ") - - for line in iter(process.stderr.readline, ""): - print(line, end=" ") - - if "--result-dir" in command: - index = command.index("--result-dir") - result_dir = command[index + 1] - else: - result_dir = "./" - - result_path = os.path.join(result_dir, result_filename) - with open(result_path, encoding="utf-8") as f: - result = json.load(f) - - if baseline_config: - result["baseline"] = _baseline_thresholds_for_step( - baseline_config, - sweep_index=sweep_index, - request_rate=request_rate, - max_concurrency=max_concurrency, - ) - else: - result["baseline"] = {} - if random_input_len is not None: - result["random_input_len"] = random_input_len - if random_output_len is not None: - result["random_output_len"] = random_output_len - with open(result_path, "w", encoding="utf-8") as f: - json.dump(result, f, ensure_ascii=False, indent=2) - return result - - -def pytest_addoption(parser: pytest.Parser) -> None: - """Register shared CLI options for DFX benchmark suites.""" - parser.addoption( - "--test-config-file", - action="store", - default=None, - help=("Path to benchmark config JSON. 
Example: --test-config-file tests/dfx/perf/tests/test_tts.json"), - ) diff --git a/tests/dfx/perf/scripts/diffusion_result_template.json b/tests/dfx/perf/scripts/diffusion_result_template.json deleted file mode 100644 index 86bdf1bc7aa..00000000000 --- a/tests/dfx/perf/scripts/diffusion_result_template.json +++ /dev/null @@ -1,86 +0,0 @@ -[ - { - "test_name": null, - "backend": null, - "timestamp": null, - "server_params": { - "model": null, - "serve_args": { - "enable-diffusion-pipeline-profiler": false - } - }, - "benchmark_params": { - "name": null, - "dataset": null, - "task": null, - "width": 0, - "height": 0, - "num-inference-steps": 0, - "num-prompts": 0, - "max-concurrency": 0, - "num-input-images": 0, - "enable-negative-prompt": false, - "baseline": { - "throughput_qps": 0, - "latency_mean": 0, - "peak_memory_mb_max": 0, - "peak_memory_mb_mean": 0 - } - }, - "result": { - "duration": 0, - "completed_requests": 0, - "failed_requests": 0, - "throughput_qps": 0, - "latency_mean": 0, - "latency_median": 0, - "latency_p99": 0, - "latency_p95": 0, - "latency_p50": 0, - "peak_memory_mb_max": 0, - "peak_memory_mb_mean": 0, - "peak_memory_mb_median": 0, - "stage_durations_mean": {}, - "stage_durations_p50": {}, - "stage_durations_p99": {}, - "backend": null, - "model": null, - "dataset": null, - "task": null - }, - "log_file": null, - "Model": null, - "Framework": null, - "Hardware": null, - "Deployment": null, - "Task": null, - "Dataset": null, - "resolution": null, - "Parallelism": null, - "max_concurrency": 0, - "Cache": null, - "Quantization": null, - "offload": null, - "compile": null, - "Attn_backend": null, - "num_inference_steps": 0, - "completed": 0, - "failed": 0, - "throughput_qps": 0, - "latency_mean": 0, - "latency_median": 0, - "latency_p99": 0, - "latency_p95": 0, - "latency_p50": 0, - "peak_memory_mb_max": 0, - "peak_memory_mb_mean": 0, - "peak_memory_mb_median": 0, - "stage_durations_mean": {}, - "stage_durations_p50": {}, - "stage_durations_p99": 
{}, - "commit_sha": null, - "build_id": null, - "build_url": null, - "source_file": null - } -] diff --git a/tests/dfx/perf/scripts/result_omni_template.json b/tests/dfx/perf/scripts/result_omni_template.json deleted file mode 100644 index 1d61321407e..00000000000 --- a/tests/dfx/perf/scripts/result_omni_template.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "date": null, - "endpoint_type": null, - "backend": null, - "label": null, - "model_id": null, - "tokenizer_id": null, - "num_prompts": 0, - "request_rate": null, - "burstiness": 0, - "max_concurrency": 0, - "duration": 0, - "completed": 0, - "failed": 0, - "total_input_tokens": 0, - "total_output_tokens": 0, - "request_throughput": 0, - "request_goodput": null, - "output_throughput": 0, - "total_token_throughput": 0, - "total_audio_duration_s": 0, - "total_audio_frames": 0, - "audio_throughput": 0, - "max_output_tokens_per_s": 0, - "max_concurrent_requests": 0, - "rtfx": 0, - "mean_ttft_ms": 0, - "median_ttft_ms": 0, - "p99_ttft_ms": 0, - "mean_tpot_ms": 0, - "median_tpot_ms": 0, - "p99_tpot_ms": 0, - "mean_itl_ms": 0, - "median_itl_ms": 0, - "p99_itl_ms": 0, - "mean_e2el_ms": 0, - "median_e2el_ms": 0, - "p99_e2el_ms": 0, - "mean_audio_rtf": 0, - "median_audio_rtf": 0, - "p99_audio_rtf": 0, - "mean_audio_ttfp_ms": 0, - "median_audio_ttfp_ms": 0, - "p99_audio_ttfp_ms": 0, - "mean_audio_duration_s": 0, - "median_audio_duration_s": 0, - "p99_audio_duration_s": 0, - "baseline": { - "mean_ttft_ms": 0, - "mean_audio_ttfp_ms": 0, - "mean_audio_rtf": 0 - }, - "random_input_len": 0, - "random_output_len": 0 -} diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index 9036508cb1c..9e375fa9fec 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -8,6 +8,7 @@ import pytest +from tests.conftest import OmniServer from tests.dfx.conftest import ( create_benchmark_indices, create_test_parameter_mapping, @@ -15,44 +16,17 @@ 
get_benchmark_params_for_server, load_configs, ) -from tests.helpers.runtime import OmniServer - -pytestmark = [pytest.mark.full_model, pytest.mark.omni] - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -def _get_config_file_from_argv() -> str | None: - """Read ``--test-config-file`` from ``sys.argv`` at import time so parametrization can use it.""" - import sys - - for i, arg in enumerate(sys.argv): - if arg == "--test-config-file" and i + 1 < len(sys.argv): - return sys.argv[i + 1] - if arg.startswith("--test-config-file="): - return arg.split("=", 1)[1] - return None - - -_PERF_TESTS_DIR = Path(__file__).resolve().parent.parent / "tests" -_DEFAULT_CONFIG_FILE = str(_PERF_TESTS_DIR / "test_qwen_omni.json") - -CONFIG_FILE_PATH = _get_config_file_from_argv() -if CONFIG_FILE_PATH is None: - print( - "No --test-config-file in argv, using default: tests/dfx/perf/tests/test_qwen_omni.json " - "(override with e.g. --test-config-file tests/dfx/perf/tests/test_tts.json)" - ) - CONFIG_FILE_PATH = _DEFAULT_CONFIG_FILE - +CONFIG_FILE_PATH = str(Path(__file__).parent.parent / "tests" / "test.json") BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) -OMNI_RESULT_TEMPLATE_PATH = Path(__file__).parent / "result_omni_template.json" -DEPLOY_CONFIGS_DIR = Path(__file__).parent.parent / "deploy" -test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) +STAGE_CONFIGS_DIR = Path(__file__).parent.parent / "stage_configs" +test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR) server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) _omni_server_lock = threading.Lock() @@ -65,19 +39,13 @@ def omni_server(request): Multi-stage initialization can take 10-20+ minutes. 
""" with _omni_server_lock: - test_name, model, stage_config_path, stage_overrides, extra_cli_args = request.param + test_name, model, stage_config_path = request.param print(f"Starting OmniServer with test: {test_name}, model: {model}") - server_args = ["--stage-init-timeout", "600", "--init-timeout", "900"] - # --deploy-config and --stage-overrides compose at the CLI (see vllm_omni/entrypoints/utils.py): - # deploy-config sets the base; stage-overrides are applied on top. Both can be set. + server_args = ["--stage-init-timeout", "120"] if stage_config_path: - server_args = ["--deploy-config", stage_config_path] + server_args - if stage_overrides: - server_args = ["--stage-overrides", stage_overrides] + server_args - if extra_cli_args: - server_args = list(extra_cli_args) + server_args + server_args = ["--stage-configs-path", stage_config_path] + server_args with OmniServer(model, server_args) as server: server.test_name = test_name print("OmniServer started successfully") @@ -87,41 +55,16 @@ def omni_server(request): print("OmniServer stopped") -def _safe_filename_token(value: Any | None, *, default: str = "na") -> str: - """Make a single path segment safe for result filenames on common filesystems.""" - if value is None: - return default - s = str(value).strip() - for bad in ("/", "\\", ":", "*", "?", '"', "<", ">", "|"): - s = s.replace(bad, "_") - return s if s else default - - def run_benchmark( args: list, test_name: str, flow, dataset_name: str, num_prompt, - *, - baseline_config: dict[str, Any] | None = None, - sweep_index: int | None = None, - request_rate: Any | None = None, - max_concurrency: Any | None = None, - random_input_len: Any | None = None, - random_output_len: Any | None = None, ) -> Any: - """Run a single benchmark iteration and return the parsed result JSON. - - After ``vllm bench`` writes the JSON, ``result["baseline"]`` holds the same - per-metric resolved thresholds as ``assert_result`` (via ``_baseline_thresholds_for_step``). 
- When ``random_input_len`` / ``random_output_len`` are set, they are also written into the result JSON; - omitted keys when not configured. - """ + """Run a single benchmark iteration and return the parsed result JSON.""" current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - ri = _safe_filename_token(random_input_len) - ro = _safe_filename_token(random_output_len) - result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_in{ri}_out{ro}_{current_dt}.json" + result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_{current_dt}.json" if "--result-filename" in args: print(f"The result file will be overwritten by {result_filename}") command = ( @@ -151,34 +94,8 @@ def run_benchmark( else: result_dir = "./" - result_path = os.path.join(result_dir, result_filename) - if not os.path.exists(result_path): - with open(OMNI_RESULT_TEMPLATE_PATH, encoding="utf-8") as f: - template_result: dict[str, Any] = json.load(f) - Path(result_path).parent.mkdir(parents=True, exist_ok=True) - with open(result_path, "w", encoding="utf-8") as f: - json.dump(template_result, f, ensure_ascii=False, indent=2) - print(f"Benchmark result file not generated, fallback to template: {result_path}") - result = template_result - else: - with open(result_path, encoding="utf-8") as f: - result = json.load(f) - - if baseline_config: - result["baseline"] = _baseline_thresholds_for_step( - baseline_config, - sweep_index=sweep_index, - request_rate=request_rate, - max_concurrency=max_concurrency, - ) - else: - result["baseline"] = {} - if random_input_len is not None: - result["random_input_len"] = random_input_len - if random_output_len is not None: - result["random_output_len"] = random_output_len - with open(result_path, "w", encoding="utf-8") as f: - json.dump(result, f, ensure_ascii=False, indent=2) + with open(os.path.join(result_dir, result_filename), encoding="utf-8") as f: + result = json.load(f) return result @@ -248,25 +165,6 @@ def _resolve_baseline_value( 
return baseline_raw -def _baseline_thresholds_for_step( - baseline_data: dict[str, Any], - *, - sweep_index: int | None = None, - max_concurrency: Any = None, - request_rate: Any = None, -) -> dict[str, Any]: - """Resolve ``test.json`` ``baseline`` block to one threshold per metric (same as ``assert_result``).""" - return { - metric_name: _resolve_baseline_value( - baseline_raw, - sweep_index=sweep_index, - max_concurrency=max_concurrency, - request_rate=request_rate, - ) - for metric_name, baseline_raw in baseline_data.items() - } - - def assert_result( result, params, @@ -296,7 +194,6 @@ def assert_result( print(f"ERROR: Test results exceeded baseline: {metric_name}: {current_value} < {baseline_value}") -@pytest.mark.benchmark @pytest.mark.parametrize("omni_server", test_params, indirect=True) @pytest.mark.parametrize("benchmark_params", benchmark_indices, indirect=True) def test_performance_benchmark(omni_server, benchmark_params): @@ -333,7 +230,7 @@ def to_list(value, default=None): raise ValueError("The number of prompts does not match the QPS or max_concurrency") args = ["--host", host, "--port", str(port)] - exclude_keys = {"request_rate", "baseline", "num_prompts", "max_concurrency", "task", "enabled", "eval_phase"} + exclude_keys = {"request_rate", "baseline", "num_prompts", "max_concurrency"} for key, value in params.items(): if key in exclude_keys or value is None: @@ -358,12 +255,6 @@ def to_list(value, default=None): flow=qps, dataset_name=dataset_name, num_prompt=num_prompt, - baseline_config=params.get("baseline"), - sweep_index=i, - request_rate=qps, - max_concurrency=None, - random_input_len=params.get("random_input_len"), - random_output_len=params.get("random_output_len"), ) assert_result( result, @@ -382,12 +273,6 @@ def to_list(value, default=None): flow=concurrency, dataset_name=dataset_name, num_prompt=num_prompt, - baseline_config=params.get("baseline"), - sweep_index=i, - request_rate=None, - max_concurrency=concurrency, - 
random_input_len=params.get("random_input_len"), - random_output_len=params.get("random_output_len"), ) assert_result( result, diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py index 7513c2d3f98..1bd9bf1a143 100644 --- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py +++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py @@ -1,16 +1,15 @@ """ Performance benchmark CI runner for diffusion models. -This runner separates two concepts: +Supports vLLM-Omni server backend: + - vllm-omni (default): starts DiffusionServer via vllm_omni.entrypoints.cli.main, + benchmarks with diffusion_benchmark_serving.py --backend vllm-omni -1. ``server_type``: how the serving process is started. - Currently only ``vllm-omni`` is supported here. -2. ``benchmark_backend``: which serving API the benchmark client calls. - Examples: ``vllm-omni`` for ``/v1/chat/completions`` and ``v1/videos`` - for async video jobs. +A config JSON file is REQUIRED via --config-file: + pytest run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json -A config JSON file is REQUIRED via --test-config-file: - pytest run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +JSON config entries use a "server_type" field, and this runner executes +the vllm-omni path. All benchmark results for a session are consolidated into a single JSON file under BENCHMARK_RESULT_DIR (override via the DIFFUSION_BENCHMARK_DIR environment variable). 
@@ -28,16 +27,13 @@ import time from datetime import datetime from pathlib import Path -from typing import Any, cast +from typing import Any import psutil import pytest -pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -os.environ.setdefault("DIFFUSION_ATTENTION_BACKEND", "FLASH_ATTN") # --------------------------------------------------------------------------- # Paths @@ -54,21 +50,19 @@ # Populated lazily after CONFIG_FILE_PATH is resolved. _SESSION_TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S") _RESULT_LOCK = threading.Lock() -_BRANCHPOINT_COMMIT_SHA: str | None = None -DIFFUSION_RESULT_TEMPLATE_PATH = Path(__file__).parent / "diffusion_result_template.json" def _get_config_file_from_argv() -> str | None: - """Read --test-config-file from sys.argv at import time so pytest parametrize can use it. + """Read --config-file from sys.argv at import time so pytest parametrize can use it. pytest_addoption (below) registers the same flag so pytest does not reject it. - Supports both ``--test-config-file path`` and ``--test-config-file=path`` forms. + Supports both ``--config-file path`` and ``--config-file=path`` forms. Returns None if the flag is not present; callers must handle the missing case. """ for i, arg in enumerate(sys.argv): - if arg == "--test-config-file" and i + 1 < len(sys.argv): + if arg == "--config-file" and i + 1 < len(sys.argv): return sys.argv[i + 1] - if arg.startswith("--test-config-file="): + if arg.startswith("--config-file="): return arg.split("=", 1)[1] return None @@ -116,7 +110,7 @@ def load_configs(config_path: str) -> list[dict[str, Any]]: BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) _config_stem = Path(CONFIG_FILE_PATH).stem # e.g. 
"test_qwen_image_vllm_omni" -AGGREGATED_RESULT_FILE = BENCHMARK_RESULT_DIR / f"diffusion_result_{_config_stem}_{_SESSION_TIMESTAMP}.json" +AGGREGATED_RESULT_FILE = BENCHMARK_RESULT_DIR / f"benchmark_results_{_config_stem}_{_SESSION_TIMESTAMP}.json" def _append_to_aggregated_file(record: dict[str, Any]) -> None: @@ -137,6 +131,19 @@ def _append_to_aggregated_file(record: dict[str, Any]) -> None: json.dump(records, f, indent=2, ensure_ascii=False) +# Register --config-file with pytest so it does not reject the argument. +def pytest_addoption(parser: pytest.Parser) -> None: + parser.addoption( + "--config-file", + action="store", + default=None, + help=( + "Path to the benchmark config JSON file (required). " + "Example: --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json" + ), + ) + + _server_lock = threading.Lock() # --------------------------------------------------------------------------- @@ -225,13 +232,13 @@ class DiffusionServer: def __init__( self, - server_cfg: dict[str, Any], + model: str, + serve_args: list[str], *, port: int | None = None, ) -> None: - self.server_cfg: dict[str, Any] = server_cfg - self.model = server_cfg["model"] - self.serve_args = server_cfg["serve_args"] + self.model = model + self.serve_args = serve_args self.host = "127.0.0.1" self.port = port if port is not None else _get_open_port() self.proc: subprocess.Popen | None = None @@ -292,95 +299,6 @@ def _build_serve_args(serve_args_dict: dict[str, Any]) -> list[str]: return args -def _get_branchpoint_commit_sha() -> str: - """Return the branch-point commit SHA against main. - - Uses git command: ``git merge-base HEAD origin/main``. 
- """ - global _BRANCHPOINT_COMMIT_SHA - if _BRANCHPOINT_COMMIT_SHA is not None: - return _BRANCHPOINT_COMMIT_SHA - - repo_root = Path(__file__).parent.parent.parent.parent - try: - sha = ( - subprocess.check_output( - ["git", "merge-base", "HEAD", "origin/main"], - cwd=str(repo_root), - stderr=subprocess.STDOUT, - text=True, - ) - .strip() - .splitlines()[0] - ) - _BRANCHPOINT_COMMIT_SHA = sha - except Exception as e: - print(f"Warning: failed to get branch-point commit SHA: {e}") - _BRANCHPOINT_COMMIT_SHA = "" - return _BRANCHPOINT_COMMIT_SHA - - -def _to_resolution_string(params: dict[str, Any]) -> str: - width = params.get("width", "unknown width") - height = params.get("height", "unknown height") - return f"{width}x{height}" - - -def _to_parallelism_string(framework: str, serve_args_dict: dict[str, Any]) -> str: - parts: list[str] = [] - if framework == "vllm-omni": - keys = [ - "num-gpus", - "usp", - "ulysses-degree", - "ring", - "ring-degree", - "cfg-parallel-size", - "vae-patch-parallel-size", - "vae-use-tiling", - "tensor-parallel-size", - ] - for key in keys: - if key in serve_args_dict: - parts.append(f"{key}={serve_args_dict[key]}") - return ",".join(parts) if parts else "none" - - -def _to_cache_string(framework: str, serve_args_dict: dict[str, Any]) -> str: - if framework == "vllm-omni": - if "cache-backend" in serve_args_dict: - return str(serve_args_dict["cache-backend"]) - return "disabled" - - -def _to_offload_string(framework: str, serve_args_dict: dict[str, Any]) -> str: - selected: list[str] = [] - if framework == "vllm-omni": - offload_keys = [ - "enable-cpu-offload", - "enable-layerwise-offload", - ] - for key in offload_keys: - if key in serve_args_dict: - selected.append(key) - return f"enabled({';'.join(selected)})" if selected else "disabled" - - -def _to_compile_value(framework: str, serve_args_dict: dict[str, Any]) -> str: - if framework == "vllm-omni": - if "enforce-eager" in serve_args_dict: - return "disabled" - return "enabled" - 
return "disabled" - - -def _to_quantization_value(framework: str, serve_args_dict: dict[str, Any]) -> str: - if framework == "vllm-omni": - quant = serve_args_dict.get("quantization") - return str(quant) if quant else "disabled" - return "disabled" - - def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]]: """Return one server-config dict per unique test_name.""" seen: set[str] = set() @@ -390,18 +308,15 @@ def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]] if test_name in seen: continue seen.add(test_name) - server_type = cfg.get("server_type", "vllm-omni") - if server_type != "vllm-omni": - raise ValueError(f"Unsupported server_type in config: {server_type}") - serve_args_dict = cfg["server_params"].get("serve_args", {}) + if cfg.get("server_type", "vllm-omni") != "vllm-omni": + raise ValueError(f"Unsupported server_type in config: {cfg.get('server_type')}") result.append( { "test_name": test_name, - "server_type": server_type, + "server_type": "vllm-omni", "model": cfg["server_params"]["model"], - "serve_args_dict": serve_args_dict, - "serve_args": _build_serve_args(serve_args_dict), - "benchmark_backend": cfg.get("benchmark_backend"), + "serve_args": _build_serve_args(cfg["server_params"].get("serve_args", {})), + "benchmark_backend": "vllm-omni", "server_params": cfg["server_params"], } ) @@ -419,7 +334,9 @@ def _test_param_mapping(configs: list[dict[str, Any]]) -> dict[str, list[dict]]: def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer: """Factory: return a vLLM-Omni diffusion server instance for the config.""" - return DiffusionServer(server_cfg=server_cfg) + model = server_cfg["model"] + serve_args = server_cfg["serve_args"] + return DiffusionServer(model=model, serve_args=serve_args) # --------------------------------------------------------------------------- @@ -447,6 +364,7 @@ def diffusion_server(request): print(f"\nStarting {server_type} server for test: {test_name}") with 
_make_server(server_cfg) as server: server.test_name = test_name + server.server_params = server_cfg["server_params"] print(f"{server_type} server started successfully") yield server print(f"{server_type} server stopping…") @@ -484,25 +402,22 @@ def run_benchmark( params: dict[str, Any], test_name: str, backend: str = "vllm-omni", - server_cfg: dict[str, Any] | None = None, - source_file: str = "", + server_params: dict[str, Any] | None = None, ) -> dict[str, Any]: """Run diffusion_benchmark_serving.py as a subprocess and return parsed metrics. The raw metrics are written to a temporary file by the subprocess. After the run completes the metrics are merged with full metadata (test_name, - backend, benchmark_params, timestamp, flat reporting fields) and appended - to the session-wide aggregated JSON file (AGGREGATED_RESULT_FILE). The - temporary file is removed afterwards. Subprocess stdout/stderr are tee'd - to a .log file under BENCHMARK_RESULT_DIR/logs/; its path is stored in - the record. + backend, benchmark_params, timestamp) and appended to the session-wide + aggregated JSON file (AGGREGATED_RESULT_FILE). The temporary file is + removed afterwards. Subprocess stdout/stderr are tee'd to a .log file + under BENCHMARK_RESULT_DIR/logs/; its path is stored in the record. 
""" timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") log_dir = BENCHMARK_RESULT_DIR / "logs" log_dir.mkdir(parents=True, exist_ok=True) - backend_label = backend.replace("/", "_") - log_file = log_dir / f"{test_name}_{backend_label}_{timestamp}.log" + log_file = log_dir / f"{test_name}_{backend}_{timestamp}.log" with tempfile.NamedTemporaryFile(mode="w", suffix=".json", prefix="diffusion_bench_tmp_", delete=False) as tmp: tmp_result_file = Path(tmp.name) @@ -569,17 +484,10 @@ def run_benchmark( if process.returncode != 0: tmp_result_file.unlink(missing_ok=True) - print(f"ERROR:Benchmark script exited with code {process.returncode}") + raise RuntimeError(f"Benchmark script exited with code {process.returncode}") if not tmp_result_file.exists(): - with open(DIFFUSION_RESULT_TEMPLATE_PATH, encoding="utf-8") as f: - template_payload = json.load(f) - # Template schema is fixed and owned by this repo: - # ``diffusion_result_template.json`` is a one-item list and metrics live at [0]["result"]. 
- template_metrics: dict[str, Any] = template_payload[0]["result"] - with open(tmp_result_file, "w", encoding="utf-8") as f: - json.dump(template_metrics, f, ensure_ascii=False, indent=2) - print(f"Benchmark result file not generated, fallback to template: {tmp_result_file}") + raise FileNotFoundError(f"Benchmark result file not found: {tmp_result_file}") try: with open(tmp_result_file, encoding="utf-8") as f: @@ -587,57 +495,14 @@ def run_benchmark( finally: tmp_result_file.unlink(missing_ok=True) - server_cfg = server_cfg or {} - server_type = cast(str, server_cfg.get("server_type", "vllm-omni")) - serve_args_dict = server_cfg.get("serve_args_dict", {}) - if not isinstance(serve_args_dict, dict): - serve_args_dict = {} - - completed = metrics.get("completed_requests", metrics.get("completed", 0)) - failed = metrics.get("failed_requests", metrics.get("failed", 0)) - record: dict[str, Any] = { "test_name": test_name, "backend": backend, "timestamp": timestamp, - "server_params": server_cfg.get("server_params"), + "server_params": server_params, "benchmark_params": params, "result": metrics, "log_file": str(log_file), - "Model": model, - "Framework": server_type, - "API Backend": backend, - "Hardware": "", - "Deployment": "", - "Task": params.get("task", "t2i"), - "Dataset": params.get("dataset", "random"), - "resolution": _to_resolution_string(params), - "Parallelism": _to_parallelism_string(server_type, serve_args_dict), - "max_concurrency": params.get("max-concurrency", ""), - "Cache": _to_cache_string(server_type, serve_args_dict), - "Quantization": _to_quantization_value(server_type, serve_args_dict), - "offload": _to_offload_string(server_type, serve_args_dict), - "compile": _to_compile_value(server_type, serve_args_dict), - "Attn_backend": os.environ.get("DIFFUSION_ATTENTION_BACKEND", ""), - "num_inference_steps": params.get("num-inference-steps", ""), - "completed": completed, - "failed": failed, - "throughput_qps": metrics.get("throughput_qps"), - 
"latency_mean": metrics.get("latency_mean"), - "latency_median": metrics.get("latency_median"), - "latency_p99": metrics.get("latency_p99"), - "latency_p95": metrics.get("latency_p95"), - "latency_p50": metrics.get("latency_p50"), - "peak_memory_mb_max": metrics.get("peak_memory_mb_max"), - "peak_memory_mb_mean": metrics.get("peak_memory_mb_mean"), - "peak_memory_mb_median": metrics.get("peak_memory_mb_median"), - "stage_durations_mean": metrics.get("stage_durations_mean"), - "stage_durations_p50": metrics.get("stage_durations_p50"), - "stage_durations_p99": metrics.get("stage_durations_p99"), - "commit_sha": _get_branchpoint_commit_sha(), - "build_id": os.environ.get("BUILDKITE_BUILD_ID", ""), - "build_url": os.environ.get("BUILDKITE_BUILD_URL", ""), - "source_file": source_file, } _append_to_aggregated_file(record) print(f"\n Result appended to: {AGGREGATED_RESULT_FILE}") @@ -666,27 +531,11 @@ def assert_result(result: dict[str, Any], params: dict[str, Any]) -> None: assert current <= threshold, f"{metric}: {current:.4f} > baseline {threshold}" -def _default_benchmark_backend_for_task(task: str) -> str: - """Return the default client-side benchmark backend for a diffusion task.""" - if task in {"t2v", "i2v", "ti2v"}: - return "v1/videos" - if task in {"t2i", "i2i", "ti2i"}: - return "vllm-omni" - raise ValueError(f"Unsupported task for benchmark backend resolution: {task}") - - -def _resolve_benchmark_backend(server_cfg: dict[str, Any], params: dict[str, Any]) -> str: - """Resolve which serving API the benchmark client should call.""" - configured = server_cfg.get("benchmark_backend") - if configured: - return cast(str, configured) - return _default_benchmark_backend_for_task(cast(str, params.get("task", "t2i"))) - - # --------------------------------------------------------------------------- # Test entry point # --------------------------------------------------------------------------- -@pytest.mark.benchmark + + @pytest.mark.parametrize( "diffusion_server", 
server_params, @@ -707,8 +556,7 @@ def test_diffusion_performance_benchmark(diffusion_server, benchmark_params): """ test_name = benchmark_params["test_name"] params = benchmark_params["params"] - server_cfg = getattr(diffusion_server, "server_cfg", {}) - backend = _resolve_benchmark_backend(server_cfg, params) + backend = diffusion_server.server_type # "vllm-omni" result = run_benchmark( host=diffusion_server.host, @@ -717,8 +565,7 @@ def test_diffusion_performance_benchmark(diffusion_server, benchmark_params): params=params, test_name=test_name, backend=backend, - server_cfg=server_cfg, - source_file=cast(str, CONFIG_FILE_PATH), + server_params=diffusion_server.server_params, ) print(f"\n{'=' * 60}") diff --git a/tests/dfx/perf/stage_configs/qwen3_omni.yaml b/tests/dfx/perf/stage_configs/qwen3_omni.yaml new file mode 100644 index 00000000000..2add22b8732 --- /dev/null +++ b/tests/dfx/perf/stage_configs/qwen3_omni.yaml @@ -0,0 +1,101 @@ +# Stage config for running Qwen3-Omni-MoE with 3-stage architecture +# Stage 0: Thinker (multimodal understanding + text generation) +# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) +# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) + +# The following config has been verified on 2x H100-80G GPUs. 
+async_chunk: false +stage_args: + - stage_id: 0 + stage_type: llm # Use llm stage type for AR stages + runtime: + devices: "0" + engine_args: + model_stage: thinker + max_num_seqs: 64 + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.9 + enforce_eager: false + trust_remote_code: true + engine_output_type: latent # Output hidden states for talker + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + hf_config_name: thinker_config + tensor_parallel_size: 1 + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + + - stage_id: 1 + stage_type: llm # Use llm stage type for AR stages + runtime: + devices: "1" + engine_args: + model_stage: talker + max_num_seqs: 64 + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.6 + enforce_eager: false + trust_remote_code: true + engine_output_type: latent # Output codec codes for code2wav + enable_prefix_caching: false + max_num_batched_tokens: 32768 + distributed_executor_backend: "mp" + hf_config_name: talker_config + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker + # final_output: true + # final_output_type: text + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: False + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 2 + stage_type: llm # Use llm stage type for AR stages + runtime: + devices: "1" + engine_args: + model_stage: code2wav + max_num_seqs: 64 + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: generation + 
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio # Final output: audio waveform + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 100000 + hf_config_name: thinker_config + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: True + repetition_penalty: 1.1 diff --git a/tests/dfx/perf/stage_configs/qwen3_tts.yaml b/tests/dfx/perf/stage_configs/qwen3_tts.yaml new file mode 100644 index 00000000000..dd69b248d1a --- /dev/null +++ b/tests/dfx/perf/stage_configs/qwen3_tts.yaml @@ -0,0 +1,96 @@ +# Stage config for running Qwen3-TTS with 2-stage architecture +# Stage 0: Talker (text -> 8-layer RVQ codec codes) +# Stage 1: Code2Wav (codec codes -> audio waveform) +# +# The following config has been verified on 1x H100-80G GPU. 
+async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + engine_args: + max_num_seqs: 4 + model_stage: qwen3_tts + model_arch: Qwen3TTSTalkerForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + distributed_executor_backend: "mp" + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + engine_args: + max_num_seqs: 4 + model_stage: code2wav + model_arch: Qwen3TTSCode2Wav + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.2 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 32768 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 4 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + 
connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/tests/dfx/perf/tests/test.json b/tests/dfx/perf/tests/test.json new file mode 100644 index 00000000000..fe7e3804698 --- /dev/null +++ b/tests/dfx/perf/tests/test.json @@ -0,0 +1,236 @@ +[ + { + "test_name": "test_qwen3_omni", + "server_params": { + "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", + "stage_config_name": "qwen3_omni.yaml" + }, + "benchmark_params": [ + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [ + 10, + 40, + 100 + ], + "max_concurrency": [ + 1, + 4, + 10 + ], + "random_input_len": 100, + "random_output_len": 100, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [1000, 3000, 5000], + "mean_audio_ttfp_ms": [8000, 10000, 13000], + "mean_audio_rtf": [0.2, 0.25, 0.45] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [ + 10, + 40, + 100 + ], + "request_rate": [ + 0.1, + 0.3, + 0.5 + ], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "random_mm_base_items_per_request": 3, + "random_mm_num_mm_items_range_ratio": 0, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1, + "audio": 1 + }, + "random_mm_bucket_config": { + "(32, 32, 1)": 0.5, + "(0, 1, 1)": 0.1, + "(32, 32, 2)": 0.4 + }, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [2000, 4000, 6000], + "mean_audio_ttfp_ms": [10000, 13000, 15000], + "mean_audio_rtf": [0.25, 0.35, 0.45] + } + }, + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [ + 
4, + 16 + ], + "max_concurrency": [ + 1, + 4 + ], + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [1000, 3000], + "mean_audio_ttfp_ms": [30000, 60000], + "mean_audio_rtf": [0.35, 0.45] + } + } + ] + }, + { + "test_name": "test_qwen3_omni_chunk", + "server_params": { + "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", + "stage_config_name": "qwen3_omni.yaml", + "update": { + "async_chunk": true, + "stage_args": { + "0": { + "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" + }, + "1": { + "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" + } + } + }, + "delete": { + "stage_args": { + "2": [ + "custom_process_input_func" + ] + } + } + }, + "benchmark_params": [ + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [ + 10, + 40, + 100 + ], + "max_concurrency": [ + 1, + 4, + 10 + ], + "random_input_len": 100, + "random_output_len": 100, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [1000, 3000, 5000], + "mean_audio_ttfp_ms": [1000, 3000, 5000], + "mean_audio_rtf": [0.2, 0.35, 0.6] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [ + 10, + 40, + 100 + ], + "request_rate": [ + 0.1, + 0.3, + 0.5 + ], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "random_mm_base_items_per_request": 3, + "random_mm_num_mm_items_range_ratio": 0, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1, + "audio": 1 + }, + "random_mm_bucket_config": { + "(32, 32, 1)": 0.5, 
+ "(0, 1, 1)": 0.1, + "(32, 32, 2)": 0.4 + }, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [2000, 4000, 6000], + "mean_audio_ttfp_ms": [2000, 4000, 6000], + "mean_audio_rtf": [0.25, 0.4, 0.7] + } + }, + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [ + 4, + 16 + ], + "max_concurrency": [ + 1, + 4 + ], + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [1000, 3000], + "mean_audio_ttfp_ms": [1000, 3000], + "mean_audio_rtf": [0.35, 0.45] + } + } + ] + }, + { + "test_name": "test_qwen3_tts", + "server_params": { + "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" + }, + "benchmark_params": [ + { + "dataset_name": "random", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [ + 10, + 40 + ], + "max_concurrency": [ + 1, + 4 + ], + "random_input_len": 100, + "random_output_len": 100, + "extra_body": { + "voice": "Vivian", + "language": "English" + }, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_audio_ttfp_ms": [6000, 6000], + "mean_audio_rtf": [0.3, 0.3] + } + } + ] + } +] diff --git a/tests/dfx/perf/tests/test_ltx2_vllm_omni.json b/tests/dfx/perf/tests/test_ltx2_vllm_omni.json deleted file mode 100644 index 4a6f9e3501f..00000000000 --- a/tests/dfx/perf/tests/test_ltx2_vllm_omni.json +++ /dev/null @@ -1,217 +0,0 @@ -[ - { - "test_name": "test_ltx2_baseline_eager", - "description": "Single-device baseline with enforce-eager (no torch.compile)", - "server_type": "vllm-omni", - "server_params": { - "model": "Lightricks/LTX-2", - "serve_args": { - "enforce-eager": true, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "256x256_145f_steps6", - "dataset": "random", - 
"task": "t2v", - "backend": "v1/videos", - "width": 256, - "height": 256, - "num-frames": 145, - "fps": 24, - "num-inference-steps": 6, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - }, - { - "name": "480x768_41f_steps20", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 768, - "height": 480, - "num-frames": 41, - "fps": 24, - "num-inference-steps": 20, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - } - ] - }, - - { - "test_name": "test_ltx2_torch_compile", - "description": "Single-device with torch.compile (default, no enforce-eager)", - "server_type": "vllm-omni", - "server_params": { - "model": "Lightricks/LTX-2", - "serve_args": { - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "256x256_145f_steps6", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 256, - "height": 256, - "num-frames": 145, - "fps": 24, - "num-inference-steps": 6, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - }, - { - "name": "480x768_41f_steps20", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 768, - "height": 480, - "num-frames": 41, - "fps": 24, - "num-inference-steps": 20, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - } - ] - }, - - { - "test_name": "test_ltx2_cfg2_eager", - "description": "CFG-parallel=2 with enforce-eager", - "server_type": "vllm-omni", - "server_params": { - "model": "Lightricks/LTX-2", - "serve_args": { - "cfg-parallel-size": 2, - "enforce-eager": true, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "256x256_145f_steps6", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 256, - "height": 256, - "num-frames": 145, - "fps": 24, - "num-inference-steps": 6, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - }, - { - 
"name": "480x768_41f_steps20", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 768, - "height": 480, - "num-frames": 41, - "fps": 24, - "num-inference-steps": 20, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - } - ] - }, - - { - "test_name": "test_ltx2_cfg2_compile", - "description": "CFG-parallel=2 with torch.compile", - "server_type": "vllm-omni", - "server_params": { - "model": "Lightricks/LTX-2", - "serve_args": { - "cfg-parallel-size": 2, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "256x256_145f_steps6", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 256, - "height": 256, - "num-frames": 145, - "fps": 24, - "num-inference-steps": 6, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - }, - { - "name": "480x768_41f_steps20", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 768, - "height": 480, - "num-frames": 41, - "fps": 24, - "num-inference-steps": 20, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - } - ] - }, - - { - "test_name": "test_ltx2_cache_dit_eager", - "description": "CacheDiT with enforce-eager", - "server_type": "vllm-omni", - "server_params": { - "model": "Lightricks/LTX-2", - "serve_args": { - "cache-backend": "cache_dit", - "enforce-eager": true, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "256x256_145f_steps6", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 256, - "height": 256, - "num-frames": 145, - "fps": 24, - "num-inference-steps": 6, - "num-prompts": 3, - "max-concurrency": 1, - "enable-negative-prompt": true - }, - { - "name": "480x768_41f_steps20", - "dataset": "random", - "task": "t2v", - "backend": "v1/videos", - "width": 768, - "height": 480, - "num-frames": 41, - "fps": 24, - "num-inference-steps": 20, - "num-prompts": 3, - 
"max-concurrency": 1, - "enable-negative-prompt": true - } - ] - } -] diff --git a/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json deleted file mode 100644 index 7d1fbbfa704..00000000000 --- a/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json +++ /dev/null @@ -1,167 +0,0 @@ -[ - { - "test_name": "test_qwen_image_edit_2509_single_device", - "description": "Single-device baseline (two input images)", - "server_type": "vllm-omni", - "server_params": { - "model": "Qwen/Qwen-Image-Edit-2509", - "serve_args": { - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20_i2i_2img", - "dataset": "random", - "task": "i2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 2, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.05, - "latency_mean": 18, - "peak_memory_mb_max": 78500, - "peak_memory_mb_mean": 78500 - } - }, - { - "name": "1536x1536_steps35_i2i_2img", - "dataset": "random", - "task": "i2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 2, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.01, - "latency_mean": 70, - "peak_memory_mb_max": 81000, - "peak_memory_mb_mean": 81000 - } - } - ] - }, - { - "test_name": "test_qwen_image_edit_2509_ulysses2_cfg2_vae_patch4", - "description": "Ulysses SP=2 + CFG=2 + VAE patch parallel=4", - "server_type": "vllm-omni", - "server_params": { - "model": "Qwen/Qwen-Image-Edit-2509", - "serve_args": { - "ulysses-degree": 2, - "cfg-parallel-size": 2, - "vae-patch-parallel-size": 4, - "vae-use-tiling": true, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20_i2i_2img", - "dataset": "random", - "task": "i2i", - "width": 512, - "height": 
512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 2, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.1, - "latency_mean": 12, - "peak_memory_mb_max": 69000, - "peak_memory_mb_mean": 69000 - } - }, - { - "name": "1536x1536_steps35_i2i_2img", - "dataset": "random", - "task": "i2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 2, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.03, - "latency_mean": 28, - "peak_memory_mb_max": 69000, - "peak_memory_mb_mean": 69000 - } - } - ] - }, - { - "test_name": "test_qwen_image_edit_2509_ulysses2_cfg2_cache_dit", - "description": "Ulysses SP=2 + CFG=2 + CacheDiT", - "server_type": "vllm-omni", - "server_params": { - "model": "Qwen/Qwen-Image-Edit-2509", - "serve_args": { - "ulysses-degree": 2, - "cfg-parallel-size": 2, - "cache-backend": "cache_dit", - "cache-config": { - "Fn_compute_blocks": 1, - "Bn_compute_blocks": 0, - "max_warmup_steps": 4, - "residual_diff_threshold": 0.24, - "max_continuous_cached_steps": 3, - "enable_taylorseer": false, - "taylorseer_order": 1, - "scm_steps_mask_policy": null, - "scm_steps_policy": "dynamic" - }, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20_i2i_2img", - "dataset": "random", - "task": "i2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 2, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.10, - "latency_mean": 12, - "peak_memory_mb_max": 73000, - "peak_memory_mb_mean": 73000 - } - }, - { - "name": "1536x1536_steps35_i2i_2img", - "dataset": "random", - "task": "i2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 2, - "enable-negative-prompt": true, - 
"baseline": { - "throughput_qps": 0.05, - "latency_mean": 20, - "peak_memory_mb_max": 81000, - "peak_memory_mb_mean": 81000 - } - } - ] - } -] diff --git a/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json deleted file mode 100644 index f68201db5f5..00000000000 --- a/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json +++ /dev/null @@ -1,161 +0,0 @@ -[ - { - "test_name": "test_qwen_image_edit_single_device", - "description": "Single-device baseline", - "server_type": "vllm-omni", - "server_params": { - "model": "Qwen/Qwen-Image-Edit", - "serve_args": { - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20_i2i", - "dataset": "random", - "task": "i2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.05, - "latency_mean": 15.0, - "peak_memory_mb_max": 72500, - "peak_memory_mb_mean": 72500 - } - }, - { - "name": "1536x1536_steps35_i2i", - "dataset": "random", - "task": "i2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.01, - "latency_mean": 65.6, - "peak_memory_mb_max": 80777, - "peak_memory_mb_mean": 80777 - } - } - ] - }, - { - "test_name": "test_qwen_image_edit_ulysses2_cfg2_vae_patch4", - "description": "Ulysses SP=2 + CFG=2 + VAE patch parallel=4", - "server_type": "vllm-omni", - "server_params": { - "model": "Qwen/Qwen-Image-Edit", - "serve_args": { - "ulysses-degree": 2, - "cfg-parallel-size": 2, - "vae-patch-parallel-size": 4, - "vae-use-tiling": true, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20_i2i", - "dataset": "random", - "task": "i2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - 
"num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.10, - "latency_mean": 7.2, - "peak_memory_mb_max": 68100, - "peak_memory_mb_mean": 68100 - } - }, - { - "name": "1536x1536_steps35_i2i", - "dataset": "random", - "task": "i2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.03, - "latency_mean": 24.0, - "peak_memory_mb_max": 68100, - "peak_memory_mb_mean": 68100 - } - } - ] - }, - { - "test_name": "test_qwen_image_edit_ulysses2_cfg2_cache_dit", - "description": "Ulysses SP=2 + CFG=2 + CacheDiT", - "server_type": "vllm-omni", - "server_params": { - "model": "Qwen/Qwen-Image-Edit", - "serve_args": { - "ulysses-degree": 2, - "cfg-parallel-size": 2, - "cache-backend": "cache_dit", - "cache-config": { - "Fn_compute_blocks": 1, - "Bn_compute_blocks": 0, - "max_warmup_steps": 4, - "residual_diff_threshold": 0.24, - "max_continuous_cached_steps": 3, - "enable_taylorseer": false, - "taylorseer_order": 1, - "scm_steps_mask_policy": null, - "scm_steps_policy": "dynamic" - }, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "512x512_steps20_i2i", - "dataset": "random", - "task": "i2i", - "width": 512, - "height": 512, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.1, - "latency_mean": 6.5, - "peak_memory_mb_max": 72600, - "peak_memory_mb_mean": 72600 - } - }, - { - "name": "1536x1536_steps35_i2i", - "dataset": "random", - "task": "i2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.05, - "latency_mean": 16.0, - "peak_memory_mb_max": 81000, - "peak_memory_mb_mean": 81000 - } - } - ] - } -] diff --git 
a/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json deleted file mode 100644 index 3cf13509c8d..00000000000 --- a/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json +++ /dev/null @@ -1,49 +0,0 @@ -[ - { - "test_name": "test_qwen_image_layered_single_device", - "description": "Single-device baseline", - "server_type": "vllm-omni", - "server_params": { - "model": "Qwen/Qwen-Image-Layered", - "serve_args": { - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "640x640_steps20_i2i", - "dataset": "random", - "task": "i2i", - "width": 640, - "height": 640, - "num-inference-steps": 20, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.02, - "latency_mean": 40.0, - "peak_memory_mb_max": 70000, - "peak_memory_mb_mean": 70000 - } - }, - { - "name": "1024x1024_steps35_i2i", - "dataset": "random", - "task": "i2i", - "width": 1024, - "height": 1024, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.005, - "latency_mean": 80.0, - "peak_memory_mb_max": 70000, - "peak_memory_mb_mean": 70000 - } - } - ] - } -] diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index cdd0cac2c03..387e874ad5f 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -44,15 +44,19 @@ } ] }, + { - "test_name": "test_qwen_image_single_device_step_execution", - "description": "Single-device baseline (no parallelism) with step execution", + "test_name": "test_qwen_image_ulysses2_cfg2_vae_patch4", + "description": "Ulysses SP=2 + CFG-parallel=2 + VAE Patch Parallel=4", "server_type": "vllm-omni", "server_params": { "model": "Qwen/Qwen-Image", "serve_args": { - 
"enable-diffusion-pipeline-profiler": true, - "step-execution": true + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "vae-patch-parallel-size": 4, + "vae-use-tiling": true, + "enable-diffusion-pipeline-profiler": true } }, "benchmark_params": [ @@ -67,44 +71,11 @@ "max-concurrency": 1, "enable-negative-prompt": true, "baseline": { - "throughput_qps": 0.30, - "latency_mean": 3.50, - "peak_memory_mb_mean": 67000 + "throughput_qps": 0.1, + "latency_mean": 2.34, + "peak_memory_mb_mean": 61000 } }, - { - "name": "1536x1536_steps35", - "dataset": "random", - "task": "t2i", - "width": 1536, - "height": 1536, - "num-inference-steps": 35, - "num-prompts": 10, - "max-concurrency": 1, - "enable-negative-prompt": true, - "baseline": { - "throughput_qps": 0.037, - "latency_mean": 27.0, - "peak_memory_mb_mean": 74000 - } - } - ] - }, - { - "test_name": "test_qwen_image_ulysses2_cfg2_vae_patch4", - "description": "Ulysses SP=2 + CFG-parallel=2 + VAE Patch Parallel=4", - "server_type": "vllm-omni", - "server_params": { - "model": "Qwen/Qwen-Image", - "serve_args": { - "ulysses-degree": 2, - "cfg-parallel-size": 2, - "vae-patch-parallel-size": 4, - "vae-use-tiling": true, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ { "name": "1536x1536_steps35", "dataset": "random", @@ -123,6 +94,7 @@ } ] }, + { "test_name": "test_qwen_image_ulysses2_cfg2_cache_dit", "description": "Ulysses SP=2 + CFG-parallel=2 + CacheDiT acceleration", diff --git a/tests/dfx/perf/tests/test_qwen_omni.json b/tests/dfx/perf/tests/test_qwen_omni.json deleted file mode 100644 index eda9720c417..00000000000 --- a/tests/dfx/perf/tests/test_qwen_omni.json +++ /dev/null @@ -1,315 +0,0 @@ -[ - { - "test_name": "test_qwen3_omni", - "server_params": { - "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "extra_cli_args": ["--no-async-chunk"] - }, - "benchmark_params": [ - { - "dataset_name": "random", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [4, 
16, 32, 64, 128], - "max_concurrency": [1, 4, 8, 16, 32], - "random_input_len": 2500, - "random_output_len": 900, - "ignore_eos": true, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": [1000, 3000, 5000, 7000, 9000], - "mean_audio_ttfp_ms": [30000, 60000, 90000, 120000, 150000], - "mean_audio_rtf": [0.35, 0.45, 0.55, 0.65, 0.75] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [10], - "request_rate": [0.1], - "random_input_len": 100, - "random_output_len": 100, - "random_range_ratio": 0.0, - "ignore_eos": true, - "random_mm_base_items_per_request": 1, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "audio": 1 - }, - "random_mm_bucket_config": { - "(0, 60, 3)": 1.0 - }, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": [2000], - "mean_audio_ttfp_ms": [10000], - "mean_audio_rtf": [0.25] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [40], - "request_rate": [0.5], - "random_input_len": 100, - "random_output_len": 100, - "random_range_ratio": 0.0, - "ignore_eos": true, - "random_mm_base_items_per_request": 2, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "image": 1, - "video": 1 - }, - "random_mm_bucket_config": { - "(256, 256, 1)": 0.5, - "(720, 1280, 2)": 0.5 - }, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": [6000], - "mean_audio_ttfp_ms": [15000], - "mean_audio_rtf": [0.45] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [100], - "request_rate": [1.0], - "random_input_len": 100, - "random_output_len": 100, - "random_range_ratio": 0.0, - 
"ignore_eos": true, - "random_mm_base_items_per_request": 3, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "image": 1, - "video": 1, - "audio": 1 - }, - "random_mm_bucket_config": { - "(256, 256, 1)": 0.34, - "(720, 1280, 2)": 0.33, - "(0, 60, 3)": 0.33 - }, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": [12000], - "mean_audio_ttfp_ms": [18000], - "mean_audio_rtf": [0.9] - } - } - ] - }, - { - "test_name": "test_qwen3_omni_chunk", - "server_params": { - "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "extra_cli_args": ["--async-chunk"] - }, - "benchmark_params": [ - { - "dataset_name": "random", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [4, 16, 32, 64], - "max_concurrency": [1, 4, 8, 16], - "random_input_len": 2500, - "random_output_len": 900, - "ignore_eos": true, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": [1000, 3000, 5000, 7000], - "mean_audio_ttfp_ms": [1000, 3000, 5000, 7000], - "mean_audio_rtf": [0.2, 0.35, 0.6, 0.85] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [10], - "request_rate": [0.1], - "random_input_len": 100, - "random_output_len": 100, - "random_range_ratio": 0.0, - "ignore_eos": true, - "random_mm_base_items_per_request": 1, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "audio": 1 - }, - "random_mm_bucket_config": { - "(0, 60, 3)": 1.0 - }, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": [2000], - "mean_audio_ttfp_ms": [2000], - "mean_audio_rtf": [0.25] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [40], - "request_rate": [0.5], - "random_input_len": 
100, - "random_output_len": 100, - "random_range_ratio": 0.0, - "ignore_eos": true, - "random_mm_base_items_per_request": 2, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "image": 1, - "video": 1 - }, - "random_mm_bucket_config": { - "(256, 256, 1)": 0.5, - "(720, 1280, 2)": 0.5 - }, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": [6000], - "mean_audio_ttfp_ms": [6000], - "mean_audio_rtf": [0.7] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [100], - "request_rate": [1.0], - "random_input_len": 100, - "random_output_len": 100, - "random_range_ratio": 0.0, - "ignore_eos": true, - "random_mm_base_items_per_request": 3, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "image": 1, - "video": 1, - "audio": 1 - }, - "random_mm_bucket_config": { - "(256, 256, 1)": 0.34, - "(720, 1280, 2)": 0.33, - "(0, 60, 3)": 0.33 - }, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": [12000], - "mean_audio_ttfp_ms": [12000], - "mean_audio_rtf": [1.0] - } - }, - { - "dataset_name": "random", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [4, 16, 32, 64, 128], - "max_concurrency": [1, 4, 8, 16, 32], - "random_input_len": 2500, - "random_output_len": 900, - "ignore_eos": true, - "extra_body": { - "modalities": ["text"] - }, - "percentile-metrics": "ttft,tpot,itl,e2el", - "baseline": { - "mean_ttft_ms": [1000, 3000, 5000, 7000, 9000] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [10], - "request_rate": [0.1], - "random_input_len": 100, - "random_output_len": 100, - "random_range_ratio": 0.0, - "ignore_eos": true, - "extra_body": { - "modalities": ["text"] - }, - 
"random_mm_base_items_per_request": 1, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "audio": 1 - }, - "random_mm_bucket_config": { - "(0, 60, 3)": 1.0 - }, - "percentile-metrics": "ttft,tpot,itl,e2el", - "baseline": { - "mean_ttft_ms": [2000] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [40], - "request_rate": [0.5], - "random_input_len": 100, - "random_output_len": 100, - "random_range_ratio": 0.0, - "ignore_eos": true, - "extra_body": { - "modalities": ["text"] - }, - "random_mm_base_items_per_request": 2, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "image": 1, - "video": 1 - }, - "random_mm_bucket_config": { - "(256, 256, 1)": 0.5, - "(720, 1280, 2)": 0.5 - }, - "percentile-metrics": "ttft,tpot,itl,e2el", - "baseline": { - "mean_ttft_ms": [6000] - } - }, - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "num_prompts": [100], - "request_rate": [1.0], - "random_input_len": 100, - "random_output_len": 100, - "random_range_ratio": 0.0, - "ignore_eos": true, - "extra_body": { - "modalities": ["text"] - }, - "random_mm_base_items_per_request": 3, - "random_mm_num_mm_items_range_ratio": 0.5, - "random_mm_limit_mm_per_prompt": { - "image": 1, - "video": 1, - "audio": 1 - }, - "random_mm_bucket_config": { - "(256, 256, 1)": 0.34, - "(720, 1280, 2)": 0.33, - "(0, 60, 3)": 0.33 - }, - "percentile-metrics": "ttft,tpot,itl,e2el", - "baseline": { - "mean_ttft_ms": [6000] - } - } - ] - } -] diff --git a/tests/dfx/perf/tests/test_runner_metadata.py b/tests/dfx/perf/tests/test_runner_metadata.py deleted file mode 100644 index 1276a847069..00000000000 --- a/tests/dfx/perf/tests/test_runner_metadata.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Tests for DFX runner metadata field exclusion.""" - -import json - -import pytest - -pytestmark = [pytest.mark.core_model, 
pytest.mark.cpu] - - -def test_task_excluded_from_cli_args(): - """'task' field must not become --task CLI arg.""" - params = { - "task": "voice_clone", - "dataset_name": "seed-tts", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "percentile-metrics": "audio_rtf,audio_ttfp", - "baseline": {"mean_audio_rtf": [0.5]}, - } - exclude_keys = {"request_rate", "baseline", "num_prompts", "max_concurrency", "task", "enabled", "eval_phase"} - args = [] - for key, value in params.items(): - if key in exclude_keys or value is None: - continue - arg_name = f"--{key.replace('_', '-')}" - if isinstance(value, bool) and value: - args.append(arg_name) - elif isinstance(value, dict): - args.extend([arg_name, json.dumps(value)]) - elif not isinstance(value, bool): - args.extend([arg_name, str(value)]) - assert "--task" not in args - assert "--enabled" not in args - assert "--dataset-name" in args - - -def test_enabled_false_entry_is_skipped(): - """benchmark_params entry with enabled=false should be skipped.""" - import sys - from pathlib import Path - - sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) - from tests.dfx.conftest import create_test_parameter_mapping - - configs = [ - { - "test_name": "test_model", - "server_params": {"model": "some/model"}, - "benchmark_params": [ - { - "task": "voice_clone", - "enabled": True, - "dataset_name": "seed-tts", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "num_prompts": [10], - "max_concurrency": [1], - "percentile-metrics": "audio_rtf", - "baseline": {}, - }, - { - "task": "voice_design", - "enabled": False, - "dataset_name": "seed-tts-design", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "num_prompts": [5], - "max_concurrency": [1], - "percentile-metrics": "audio_rtf", - "baseline": {}, - }, - ], - } - ] - mapping = create_test_parameter_mapping(configs) - params = mapping["test_model"]["benchmark_params"] - # Only the enabled=True entry 
should appear - assert len(params) == 1 - assert params[0].get("task") == "voice_clone" diff --git a/tests/dfx/perf/tests/test_tts.json b/tests/dfx/perf/tests/test_tts.json deleted file mode 100644 index 06c9c4d2384..00000000000 --- a/tests/dfx/perf/tests/test_tts.json +++ /dev/null @@ -1,155 +0,0 @@ -[ - { - "test_name": "test_qwen3_tts_base", - "server_params": { - "model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base" - }, - "benchmark_params": [ - { - "task": "voice_clone", - "eval_phase": "latency", - "enabled": false, - "dataset_name": "seed-tts", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "num_prompts": [20], - "max_concurrency": [1], - "seed_tts_locale": "en", - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "median_audio_ttfp_ms": [350], - "median_audio_rtf": [0.25] - } - }, - { - "task": "voice_clone", - "eval_phase": "throughput", - "enabled": false, - "dataset_name": "seed-tts", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "num_prompts": [80], - "max_concurrency": [8], - "seed_tts_locale": "en", - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "median_audio_ttfp_ms": [3500], - "median_audio_rtf": [0.75], - "audio_throughput": [10.0] - } - }, - { - "task": "voice_clone", - "eval_phase": "quality", - "enabled": false, - "dataset_name": "seed-tts", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "num_prompts": [200], - "max_concurrency": [4], - "seed_tts_locale": "en", - "seed_tts_wer_eval": true, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_audio_rtf": [0.45] - } - } - ] - }, - { - "test_name": "test_qwen3_tts_customvoice", - "server_params": { - "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" - }, - "benchmark_params": [ - { - "task": "default_voice", - "eval_phase": "latency", - "dataset_name": "seed-tts-text", - "backend": "openai-audio-speech", - 
"endpoint": "/v1/audio/speech", - "dataset_path": "benchmarks/build_dataset/seed_tts_smoke", - "num_prompts": [20], - "max_concurrency": [1], - "seed_tts_locale": "en", - "extra_body": {"voice": "Vivian", "language": "English", "task_type": "CustomVoice"}, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "median_audio_ttfp_ms": [150], - "median_audio_rtf": [0.15] - } - }, - { - "task": "default_voice", - "eval_phase": "throughput", - "dataset_name": "seed-tts-text", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "dataset_path": "benchmarks/build_dataset/seed_tts_smoke", - "num_prompts": [80], - "max_concurrency": [8], - "seed_tts_locale": "en", - "extra_body": {"voice": "Vivian", "language": "English", "task_type": "CustomVoice"}, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "median_audio_ttfp_ms": [1500], - "median_audio_rtf": [0.30], - "audio_throughput": [30.0] - } - }, - { - "task": "default_voice", - "eval_phase": "quality", - "enabled": false, - "dataset_name": "seed-tts-text", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "dataset_path": "benchmarks/build_dataset/seed_tts_smoke", - "num_prompts": [200], - "max_concurrency": [4], - "seed_tts_locale": "en", - "extra_body": {"voice": "Vivian", "language": "English", "task_type": "CustomVoice"}, - "seed_tts_wer_eval": true, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_audio_rtf": [0.35] - } - }, - { - "task": "voice_design", - "eval_phase": "latency", - "dataset_name": "seed-tts-design", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "dataset_path": "benchmarks/build_dataset/seed_tts_design", - "num_prompts": [20], - "max_concurrency": [1], - "seed_tts_locale": "en", - "extra_body": {"task_type": "VoiceDesign", "language": "English"}, - "percentile-metrics": 
"ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "median_audio_ttfp_ms": [150], - "median_audio_rtf": [0.15] - } - }, - { - "task": "voice_design", - "eval_phase": "throughput", - "dataset_name": "seed-tts-design", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "dataset_path": "benchmarks/build_dataset/seed_tts_design", - "num_prompts": [80], - "max_concurrency": [8], - "seed_tts_locale": "en", - "extra_body": {"task_type": "VoiceDesign", "language": "English"}, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "median_audio_ttfp_ms": [1500], - "median_audio_rtf": [0.35], - "audio_throughput": [25.0] - } - } - ] - } -] diff --git a/tests/dfx/perf/tests/test_wan22_i2v_vllm_omni.json b/tests/dfx/perf/tests/test_wan22_i2v_vllm_omni.json deleted file mode 100644 index 58a17c980bd..00000000000 --- a/tests/dfx/perf/tests/test_wan22_i2v_vllm_omni.json +++ /dev/null @@ -1,107 +0,0 @@ -[ - { - "test_name": "test_wan22_i2v_single_device", - "description": "Single-device baseline", - "server_type": "vllm-omni", - "server_params": { - "model": "Wan-AI/Wan2.2-I2V-A14B-Diffusers", - "serve_args": { - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "832x480_frames81_steps4", - "dataset": "random", - "task": "i2v", - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 1, - "seed": 42, - "enable-negative-prompt": true, - "random-request-config": [ - { - "width": 832, - "height": 480, - "num_inference_steps": 4, - "num_frames": 81, - "fps": 16, - "weight": 1 - } - ], - "baseline": { - "throughput_qps": 0.034, - "latency_mean": 26.0, - "peak_memory_mb_mean": 80000 - } - } - ] - }, - { - "test_name": "test_wan22_i2v_usp2_vae_patch2_hsdp_slicing", - "description": "USP=2 + VAE patch parallel=2 + HSDP + VAE slicing", - "server_type": "vllm-omni", - "server_params": { - "model": "Wan-AI/Wan2.2-I2V-A14B-Diffusers", - "serve_args": { - "usp": 2, - 
"vae-patch-parallel-size": 2, - "use-hsdp": true, - "vae-use-slicing": true, - "enable-diffusion-pipeline-profiler": true - } - }, - "benchmark_params": [ - { - "name": "832x480_frames81_steps4", - "dataset": "random", - "task": "i2v", - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 1, - "seed": 42, - "enable-negative-prompt": true, - "random-request-config": [ - { - "width": 832, - "height": 480, - "num_inference_steps": 4, - "num_frames": 81, - "fps": 16, - "weight": 1 - } - ], - "baseline": { - "throughput_qps": 0.042, - "latency_mean": 21.6, - "peak_memory_mb_mean": 55300 - } - }, - { - "name": "1280x720_frames121_steps4", - "dataset": "random", - "task": "i2v", - "num-prompts": 10, - "max-concurrency": 1, - "num-input-images": 1, - "seed": 42, - "enable-negative-prompt": true, - "random-request-config": [ - { - "width": 1280, - "height": 720, - "num_inference_steps": 4, - "num_frames": 121, - "fps": 16, - "weight": 1 - } - ], - "baseline": { - "throughput_qps": 0.0085, - "latency_mean": 101.6, - "peak_memory_mb_mean": 65200 - } - } - ] - } -] diff --git a/tests/dfx/stability/conftest.py b/tests/dfx/stability/conftest.py index 30718d4bf5a..3a0aee7608f 100644 --- a/tests/dfx/stability/conftest.py +++ b/tests/dfx/stability/conftest.py @@ -3,79 +3,123 @@ resource monitoring is started before each test and finalized after each test, so each stability test case gets its own HTML report (one report per case). No need to wrap pytest with `bash resource_monitor.sh run -- pytest ...`. - -Duration-based benchmark helper functions are hosted in ``helpers.py``, -while this file focuses on pytest fixtures and setup/teardown. 
""" -from __future__ import annotations - +import os import subprocess import sys import threading +import time +from pathlib import Path import pytest -from tests.dfx.conftest import get_benchmark_params_for_server -from tests.dfx.stability.helpers import ( - finalize_resource_monitor, - report_latest_gpu_samples, - start_resource_monitor, - wait_for_run_dir, -) -from tests.helpers.runtime import OmniServer - -DEFAULT_STABILITY_SERVER_TIMEOUT_ARGS = ["--stage-init-timeout", "600", "--init-timeout", "900"] - -_omni_server_lock = threading.Lock() - - -@pytest.fixture(scope="module") -def omni_server(request: pytest.FixtureRequest): - """Start OmniServer for stability tests, with per-module timeout override.""" - timeout_args = getattr(request.module, "STABILITY_SERVER_TIMEOUT_ARGS", DEFAULT_STABILITY_SERVER_TIMEOUT_ARGS) - with _omni_server_lock: - # Same 5-tuple and CLI composition as ``tests/dfx/perf/scripts/run_benchmark.py`` on main; - # ``serve_args`` from JSON are folded into ``extra_cli_args`` inside - # ``create_unique_server_params``. 
- test_name, model, deploy_path, stage_overrides, extra_cli_args = request.param - - print(f"Starting OmniServer with test: {test_name}, model: {model}") - server_args = list(timeout_args) - if deploy_path: - server_args = ["--deploy-config", deploy_path] + server_args - if stage_overrides: - server_args = ["--stage-overrides", stage_overrides] + server_args - if extra_cli_args: - server_args = list(extra_cli_args) + server_args - with OmniServer(model, server_args) as server: - server.test_name = test_name - print("OmniServer started successfully") - yield server - print("OmniServer stopping...") - print("OmniServer stopped") - - -@pytest.fixture -def stability_benchmark_params(request: pytest.FixtureRequest, omni_server): - test_name, param_index = request.param - if test_name != omni_server.test_name: - pytest.skip(f"Skipping parameter for {test_name} - current server is {omni_server.test_name}") - - server_to_benchmark_mapping = getattr(request.module, "server_to_benchmark_mapping", None) - if server_to_benchmark_mapping is None: - raise ValueError("server_to_benchmark_mapping must be defined in the test module") - - all_params = get_benchmark_params_for_server(test_name, server_to_benchmark_mapping) - if not all_params: - raise ValueError(f"No benchmark parameters found for test: {test_name}") - if param_index >= len(all_params): - raise ValueError(f"No benchmark parameters found for index {param_index} in test: {test_name}") - - current = param_index + 1 - total = len(all_params) - print(f"\n Running benchmark {current}/{total} for {test_name}") - return {"test_name": test_name, "params": all_params[param_index]} +STABILITY_DIR = Path(__file__).resolve().parent +RESOURCE_MONITOR_SCRIPT = STABILITY_DIR / "scripts" / "resource_monitor.sh" +REPO_ROOT = STABILITY_DIR.parent.parent.parent + + +def _start_resource_monitor(): + """Start `resource_monitor.sh start` in the background and return `Popen` or `None`.""" + if not RESOURCE_MONITOR_SCRIPT.is_file(): + return 
None + try: + proc = subprocess.Popen( + ["bash", str(RESOURCE_MONITOR_SCRIPT), "start", "--backend", "gpu"], + cwd=str(REPO_ROOT), + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + start_new_session=True, + ) + try: + proc.wait(timeout=2) + if proc.returncode != 0: + stderr = proc.stderr.read().decode("utf-8", errors="ignore") if proc.stderr else "" + if stderr.strip(): + sys.stderr.write(f"[Stability] Resource monitor failed to start: {stderr.strip()}\n") + return None + except subprocess.TimeoutExpired: + pass + return proc + except (FileNotFoundError, OSError): + return None + + +def _get_monitor_data_root() -> Path: + data_root = os.environ.get("RESOURCE_MONITOR_DATA_ROOT") or os.environ.get("GPU_MONITOR_DATA_ROOT") + if data_root: + return Path(data_root) + return STABILITY_DIR / "gpu_monitor_data" + + +def _wait_for_run_dir(timeout_sec: int = 10) -> Path | None: + data_root = _get_monitor_data_root() + run_id_file = data_root / "current_run_id" + deadline = time.time() + timeout_sec + while time.time() < deadline: + if run_id_file.is_file(): + run_id = run_id_file.read_text(encoding="utf-8").strip() + if run_id: + run_dir = data_root / run_id + if run_dir.is_dir(): + return run_dir + time.sleep(0.5) + return None + + +def _report_latest_gpu_samples(stop_event: threading.Event) -> None: + """Periodically print the latest sampled GPU line.""" + log_interval = int( + os.environ.get("RESOURCE_MONITOR_LOG_INTERVAL") or os.environ.get("GPU_MONITOR_LOG_INTERVAL") or "15" + ) + log_interval = max(log_interval, 1) + last_line = "" + + time.sleep(min(log_interval, 5)) + while not stop_event.wait(log_interval): + run_dir = _wait_for_run_dir(timeout_sec=1) + if run_dir is None: + continue + csv_file = run_dir / "gpu_metrics.csv" + if not csv_file.is_file(): + continue + try: + lines = csv_file.read_text(encoding="utf-8").splitlines() + except OSError: + continue + if len(lines) <= 1: + continue + latest = lines[-1].strip() + if latest and latest != last_line: + 
last_line = latest + sys.stderr.write(f"[GPU] {latest}\n") + + +def _finalize_resource_monitor() -> str | None: + """ + Run `resource_monitor.sh finalize` for the current run and generate the report. + Returns the bundle dir path (for this test case's report) if successful, else None. + """ + if not RESOURCE_MONITOR_SCRIPT.is_file(): + return None + try: + result = subprocess.run( + ["bash", str(RESOURCE_MONITOR_SCRIPT), "finalize", "--backend", "gpu"], + cwd=str(REPO_ROOT), + capture_output=True, + text=True, + timeout=60, + check=False, + ) + if result.returncode != 0: + return None + for line in (result.stdout or "").splitlines(): + if line.startswith("GPU_MONITOR_BUNDLE_DIR=") or line.startswith("RESOURCE_MONITOR_BUNDLE_DIR="): + _, _, value = line.partition("=") + return value.strip() if value else None + return None + except (FileNotFoundError, OSError, subprocess.TimeoutExpired): + return None @pytest.fixture(autouse=True) @@ -84,19 +128,19 @@ def stability_resource_monitor_per_test(request: pytest.FixtureRequest): For each test under this directory: start GPU monitor before the test, then finalize after the test so this case gets its own report.html. 
""" - proc = start_resource_monitor() + proc = _start_resource_monitor() stop_event = threading.Event() reporter: threading.Thread | None = None if proc is not None: reporter = threading.Thread( - target=report_latest_gpu_samples, + target=_report_latest_gpu_samples, args=(stop_event,), name="stability-resource-monitor-reporter", daemon=True, ) reporter.start() - run_dir = wait_for_run_dir(timeout_sec=5) + run_dir = _wait_for_run_dir(timeout_sec=5) node_name = request.node.name if run_dir is not None: sys.stderr.write(f"[Stability] Resource monitor started for test: {node_name} | run dir: {run_dir}\n") @@ -117,7 +161,7 @@ def stability_resource_monitor_per_test(request: pytest.FixtureRequest): except subprocess.TimeoutExpired: proc.kill() proc.wait() - bundle_dir = finalize_resource_monitor() + bundle_dir = _finalize_resource_monitor() node_name = request.node.name if bundle_dir: sys.stderr.write(f"[Stability] Report for test «{node_name}»: {bundle_dir}/report.html\n") diff --git a/tests/dfx/stability/helpers.py b/tests/dfx/stability/helpers.py deleted file mode 100644 index 3a873f69ca4..00000000000 --- a/tests/dfx/stability/helpers.py +++ /dev/null @@ -1,504 +0,0 @@ -"""Stability helpers for resource monitoring and benchmark execution.""" - -from __future__ import annotations - -import json -import os -import random -import re -import shlex -import subprocess -import sys -import tempfile -import threading -import time -from collections.abc import Callable -from pathlib import Path -from typing import Any - -from tests.dfx.conftest import run_benchmark - -STABILITY_DIR = Path(__file__).resolve().parent -RESOURCE_MONITOR_SCRIPT = STABILITY_DIR / "scripts" / "resource_monitor.sh" -REPO_ROOT = STABILITY_DIR.parent.parent.parent -_BUCKET_KEY_PATTERN = re.compile(r"^\(\s*([^,]+)\s*,\s*([^,]+)\s*,\s*([^,]+)\s*\)$") - -RunOneBatchFn = Callable[ - [str, int, str, dict[str, Any], int, float | None, int | None, str, int], - dict[str, Any], -] - - -def 
start_resource_monitor(): - """Start `resource_monitor.sh start` in the background and return `Popen` or `None`.""" - if not RESOURCE_MONITOR_SCRIPT.is_file(): - return None - try: - proc = subprocess.Popen( - ["bash", str(RESOURCE_MONITOR_SCRIPT), "start", "--backend", "gpu"], - cwd=str(REPO_ROOT), - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - start_new_session=True, - ) - try: - proc.wait(timeout=2) - if proc.returncode != 0: - stderr = proc.stderr.read().decode("utf-8", errors="ignore") if proc.stderr else "" - if stderr.strip(): - sys.stderr.write(f"[Stability] Resource monitor failed to start: {stderr.strip()}\n") - return None - except subprocess.TimeoutExpired: - pass - return proc - except (FileNotFoundError, OSError): - return None - - -def get_monitor_data_root() -> Path: - data_root = os.environ.get("RESOURCE_MONITOR_DATA_ROOT") or os.environ.get("GPU_MONITOR_DATA_ROOT") - if data_root: - return Path(data_root) - return STABILITY_DIR / "gpu_monitor_data" - - -def wait_for_run_dir(timeout_sec: int = 10) -> Path | None: - data_root = get_monitor_data_root() - run_id_file = data_root / "current_run_id" - deadline = time.time() + timeout_sec - while time.time() < deadline: - if run_id_file.is_file(): - run_id = run_id_file.read_text(encoding="utf-8").strip() - if run_id: - run_dir = data_root / run_id - if run_dir.is_dir(): - return run_dir - time.sleep(0.5) - return None - - -def report_latest_gpu_samples(stop_event: threading.Event) -> None: - """Periodically print the latest sampled GPU line.""" - log_interval = int( - os.environ.get("RESOURCE_MONITOR_LOG_INTERVAL") or os.environ.get("GPU_MONITOR_LOG_INTERVAL") or "15" - ) - log_interval = max(log_interval, 1) - last_line = "" - - time.sleep(min(log_interval, 5)) - while not stop_event.wait(log_interval): - run_dir = wait_for_run_dir(timeout_sec=1) - if run_dir is None: - continue - csv_file = run_dir / "gpu_metrics.csv" - if not csv_file.is_file(): - continue - try: - lines = 
csv_file.read_text(encoding="utf-8").splitlines() - except OSError: - continue - if len(lines) <= 1: - continue - latest = lines[-1].strip() - if latest and latest != last_line: - last_line = latest - sys.stderr.write(f"[GPU] {latest}\n") - - -def finalize_resource_monitor() -> str | None: - """ - Run `resource_monitor.sh finalize` for the current run and generate the report. - Returns the bundle dir path (for this test case's report) if successful, else None. - """ - if not RESOURCE_MONITOR_SCRIPT.is_file(): - return None - try: - result = subprocess.run( - ["bash", str(RESOURCE_MONITOR_SCRIPT), "finalize", "--backend", "gpu"], - cwd=str(REPO_ROOT), - capture_output=True, - text=True, - timeout=60, - check=False, - ) - if result.returncode != 0: - return None - for line in (result.stdout or "").splitlines(): - if line.startswith("GPU_MONITOR_BUNDLE_DIR=") or line.startswith("RESOURCE_MONITOR_BUNDLE_DIR="): - _, _, value = line.partition("=") - return value.strip() if value else None - return None - except (FileNotFoundError, OSError, subprocess.TimeoutExpired): - return None - - -def _normalize_bench_metrics(raw: dict[str, Any]) -> dict[str, Any]: - completed = int(raw.get("completed", raw.get("completed_requests", 0) or 0)) - failed = int(raw.get("failed", raw.get("failed_requests", 0) or 0)) - duration = float(raw.get("duration", 0.0) or 0.0) - errors = list(raw.get("errors") or []) - if failed and not errors: - errors = [f"{failed} benchmark request(s) failed"] - return {"completed": completed, "failed": failed, "duration": duration, "errors": errors} - - -def _build_base_args(params: dict[str, Any], host: str, port: int) -> list[str]: - exclude = { - "request_rate", - "max_concurrency", - "num_prompts", - "baseline", - "duration_sec", - "num_prompts_per_batch", - } - args = ["--host", host, "--port", str(port)] - for key, value in params.items(): - if key in exclude or value is None: - continue - arg_name = f"--{key.replace('_', '-')}" - if isinstance(value, 
bool) and value: - args.append(arg_name) - elif isinstance(value, dict): - args.extend([arg_name, json.dumps(value, ensure_ascii=False, separators=(",", ":"))]) - elif not isinstance(value, bool): - args.extend([arg_name, str(value)]) - return args - - -def _build_diffusion_cmd( - host: str, - port: int, - model: str, - params: dict[str, Any], - num_prompts: int, - request_rate: float | None, - max_concurrency: int | None, - output_path: Path, - diffusion_benchmark_script: Path, -) -> list[str]: - skip_keys = { - "request_rate", - "max_concurrency", - "num_prompts", - "baseline", - "duration_sec", - "num_prompts_per_batch", - } - cmd: list[str] = [ - sys.executable, - "-u", - str(diffusion_benchmark_script), - "--host", - host, - "--port", - str(port), - "--model", - model, - "--output-file", - str(output_path), - ] - for key, value in params.items(): - if key in skip_keys or value is None: - continue - flag = f"--{str(key).replace('_', '-')}" - if isinstance(value, bool) and value: - cmd.append(flag) - elif isinstance(value, bool): - continue - elif isinstance(value, (dict, list)): - cmd.extend([flag, json.dumps(value, ensure_ascii=False, separators=(",", ":"))]) - else: - cmd.extend([flag, str(value)]) - - cmd.extend(["--num-prompts", str(num_prompts)]) - if request_rate is not None: - cmd.extend(["--request-rate", str(request_rate)]) - else: - cmd.extend(["--max-concurrency", str(max_concurrency), "--request-rate", "inf"]) - return cmd - - -def _sample_int_from_range_spec(value: Any, rng: random.Random) -> Any: - """Resolve one value that may be scalar or range spec into an int.""" - if isinstance(value, int): - return value - - if isinstance(value, (list, tuple)) and len(value) == 2 and all(isinstance(v, int) for v in value): - low, high = int(value[0]), int(value[1]) - if low > high: - low, high = high, low - return rng.randint(low, high) - - if isinstance(value, dict) and {"min", "max"} <= set(value): - low, high = int(value["min"]), int(value["max"]) - if 
low > high: - low, high = high, low - return rng.randint(low, high) - - if isinstance(value, str): - raw = value.strip() - if raw.isdigit(): - return int(raw) - if "-" in raw: - parts = [p.strip() for p in raw.split("-", 1)] - if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit(): - low, high = int(parts[0]), int(parts[1]) - if low > high: - low, high = high, low - return rng.randint(low, high) - - return value - - -def _sample_bucket_key(raw_key: str, rng: random.Random) -> str: - """Sample bucket tuple keys that use range syntax, e.g. ``(128-512, 128-512, 1)``.""" - match = _BUCKET_KEY_PATTERN.match(raw_key.strip()) - if not match: - return raw_key - - sampled_parts: list[int] = [] - for token in match.groups(): - sampled = _sample_int_from_range_spec(token.strip(), rng) - if not isinstance(sampled, int): - return raw_key - sampled_parts.append(sampled) - - # For video buckets (height>0 and num_frames>1), enforce even H/W to avoid - # ffmpeg yuv420p encoding/decoding failures ("Could not open video stream"). 
- if sampled_parts[0] > 0 and sampled_parts[2] > 1: - sampled_parts[0] = max(2, sampled_parts[0] - (sampled_parts[0] % 2)) - sampled_parts[1] = max(2, sampled_parts[1] - (sampled_parts[1] % 2)) - - return f"({sampled_parts[0]}, {sampled_parts[1]}, {sampled_parts[2]})" - - -def _sample_stability_batch_params(params: dict[str, Any], batch_index: int) -> dict[str, Any]: - """Materialize per-batch random values for configured range fields.""" - sampled = dict(params) - rng = random.Random(time.time_ns() + batch_index) - - for field_name in ( - "random_input_len", - "random_output_len", - "random_mm_base_items_per_request", - "width", - "height", - ): - if field_name in sampled: - sampled[field_name] = _sample_int_from_range_spec(sampled[field_name], rng) - - bucket_config = sampled.get("random_mm_bucket_config") - if isinstance(bucket_config, dict): - sampled_bucket_config: dict[str, float] = {} - for raw_key, probability in bucket_config.items(): - sampled_key = _sample_bucket_key(str(raw_key), rng) - sampled_bucket_config[sampled_key] = sampled_bucket_config.get(sampled_key, 0.0) + float(probability) - sampled["random_mm_bucket_config"] = sampled_bucket_config - - return sampled - - -def _run_one_vllm_bench_batch( - host: str, - port: int, - _model: str, - params: dict[str, Any], - num_prompts: int, - request_rate: float | None, - max_concurrency: int | None, - result_dir: str, - batch_index: int, -) -> dict[str, Any]: - base = _build_base_args(params, host, port) - if request_rate is not None: - args = base + ["--request-rate", str(request_rate), "--num-prompts", str(num_prompts)] - flow = request_rate - else: - args = base + [ - "--max-concurrency", - str(max_concurrency), - "--num-prompts", - str(num_prompts), - "--request-rate", - "inf", - ] - flow = max_concurrency - - # Print the exact per-batch benchmark CLI (randomized params are already materialized). 
- preview_cmd = ["vllm", "bench", "serve", "--omni", *args] - print(f"\n[Stability][Batch {batch_index}] Benchmark command:") - print(shlex.join(preview_cmd)) - - dataset_name = params.get("dataset_name", "random") - old_benchmark_dir = os.environ.get("BENCHMARK_DIR") - try: - os.environ["BENCHMARK_DIR"] = result_dir - result = run_benchmark( - args=args, - test_name="stability", - flow=flow, - dataset_name=dataset_name, - num_prompt=num_prompts, - random_input_len=params.get("random_input_len"), - random_output_len=params.get("random_output_len"), - ) - return _normalize_bench_metrics(result) - except (FileNotFoundError, OSError) as exc: - return { - "completed": 0, - "failed": 1, - "duration": 0.0, - "errors": [f"Benchmark batch failed: {type(exc).__name__}: {exc}"], - } - finally: - if old_benchmark_dir is not None: - os.environ["BENCHMARK_DIR"] = old_benchmark_dir - elif "BENCHMARK_DIR" in os.environ: - os.environ.pop("BENCHMARK_DIR") - - -def _run_one_diffusion_batch( - host: str, - port: int, - model: str, - params: dict[str, Any], - num_prompts: int, - request_rate: float | None, - max_concurrency: int | None, - _result_dir: str, - _batch_index: int, -) -> dict[str, Any]: - diffusion_benchmark_script = Path(REPO_ROOT / "benchmarks" / "diffusion" / "diffusion_benchmark_serving.py") - with tempfile.NamedTemporaryFile(mode="w", suffix=".json", prefix="stability_diffusion_", delete=False) as tmp: - out_path = Path(tmp.name) - try: - cmd = _build_diffusion_cmd( - host, - port, - model, - params, - num_prompts, - request_rate, - max_concurrency, - out_path, - diffusion_benchmark_script, - ) - proc = subprocess.run( - cmd, - cwd=str(REPO_ROOT), - capture_output=True, - text=True, - ) - if proc.stdout: - print(proc.stdout, end="" if proc.stdout.endswith("\n") else "\n") - if proc.stderr: - print(proc.stderr, end="" if proc.stderr.endswith("\n") else "\n") - if proc.returncode != 0: - return { - "completed": 0, - "failed": 1, - "duration": 0.0, - "errors": 
[f"diffusion_benchmark_serving.py exited {proc.returncode}"], - } - if not out_path.is_file(): - return { - "completed": 0, - "failed": 1, - "duration": 0.0, - "errors": [f"Missing benchmark output: {out_path}"], - } - with open(out_path, encoding="utf-8") as file: - metrics = json.load(file) - return _normalize_bench_metrics(metrics) - except (FileNotFoundError, OSError, json.JSONDecodeError) as exc: - return { - "completed": 0, - "failed": 1, - "duration": 0.0, - "errors": [f"Diffusion batch failed: {type(exc).__name__}: {exc}"], - } - finally: - out_path.unlink(missing_ok=True) - - -def merge_batch_results(batch_results: list[dict[str, Any]], total_duration_sec: float) -> dict[str, Any]: - if not batch_results: - return {"completed": 0, "failed": 0, "duration": total_duration_sec, "errors": []} - - completed = sum(result.get("completed", 0) for result in batch_results) - failed = sum(result.get("failed", 0) for result in batch_results) - merged: dict[str, Any] = { - "completed": completed, - "failed": failed, - "duration": total_duration_sec, - "errors": [], - } - for result in batch_results: - merged["errors"].extend(result.get("errors") or []) - return merged - - -def print_merged_report(result: dict[str, Any]) -> None: - fmt = "{:<40} {:<10}" - fmt_float = "{:<40} {:<10.2f}" - completed = result.get("completed", 0) - failed = result.get("failed", 0) - duration = float(result.get("duration", 0.0) or 0.0) - print("\n============ Stability Benchmark Summary ============") - print(fmt.format("Successful requests:", completed)) - print(fmt.format("Failed requests:", failed)) - print(fmt_float.format("Total duration (s):", duration)) - print("==================================================\n") - - -def run_stability_benchmark_loop( - host: str, - port: int, - model: str, - duration_sec: int | float, - params: dict[str, Any], - *, - request_rate: float | None, - max_concurrency: int | None, - result_dir: str, - num_prompts_per_batch: int, - run_one_batch: 
RunOneBatchFn, - result_filename: str | None = None, -) -> dict[str, Any]: - if (request_rate is None) == (max_concurrency is None): - raise ValueError("Exactly one of request_rate or max_concurrency must be specified") - - start_time = time.perf_counter() - batch_results: list[dict[str, Any]] = [] - batch_index = 0 - - while True: - if (time.perf_counter() - start_time) >= duration_sec: - break - sampled_params = _sample_stability_batch_params(params, batch_index) - result = run_one_batch( - host, - port, - model, - sampled_params, - num_prompts_per_batch, - request_rate, - max_concurrency, - result_dir, - batch_index, - ) - batch_results.append(result) - batch_index += 1 - if (time.perf_counter() - start_time) >= duration_sec: - break - - total_duration = time.perf_counter() - start_time - merged = merge_batch_results(batch_results, total_duration) - print_merged_report(merged) - - if result_filename and result_dir: - result_path = Path(result_dir) / result_filename - with open(result_path, "w", encoding="utf-8") as file: - json.dump(merged, file, indent=2, ensure_ascii=False) - - return merged diff --git a/tests/dfx/stability/scripts/test_benchmark_stability.py b/tests/dfx/stability/scripts/test_benchmark_stability.py new file mode 100644 index 00000000000..e8568652d18 --- /dev/null +++ b/tests/dfx/stability/scripts/test_benchmark_stability.py @@ -0,0 +1,286 @@ +""" +Stability test cases: start OmniServer first, then run benchmark traffic with either +`request-rate` or `max-concurrency` for a fixed duration. No new requests are sent +after the duration is reached, and the test asserts that there are no failed requests. + +The overall flow matches the perf logic: `load_configs`, `modify_stage`, +`create_unique_server_params`, `create_test_parameter_mapping`, +`get_benchmark_params_for_server`, `create_benchmark_indices`, and the +`omni_server` fixture are aligned with perf. 
Only the benchmark execution +(`run_stability_benchmark`, which is duration-based here) and the test cases differ. + +All test-specific parameters, such as `duration_sec`, `request_rate` / +`max_concurrency`, and `num_prompts_per_batch`, are configured in +`tests/dfx/stability/tests/test.json` and are no longer overridden +through environment variables. +""" + +import json +import os +import threading +import time +from pathlib import Path +from typing import Any + +import pytest + +from tests.conftest import OmniServer +from tests.dfx.conftest import ( + create_benchmark_indices, + create_test_parameter_mapping, + create_unique_server_params, + get_benchmark_params_for_server, + load_configs, +) +from tests.dfx.perf.scripts.run_benchmark import run_benchmark + +STABILITY_DIR = Path(__file__).resolve().parent.parent +STAGE_CONFIGS_DIR = STABILITY_DIR / "stage_configs" +CONFIG_FILE_PATH = str(STABILITY_DIR / "tests" / "test.json") +DEFAULT_NUM_PROMPTS_PER_BATCH = 20 + + +try: + BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) +except FileNotFoundError: + BENCHMARK_CONFIGS = [] + +test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] +server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) if BENCHMARK_CONFIGS else {} + +_omni_server_lock = threading.Lock() + + +benchmark_indices = create_benchmark_indices(BENCHMARK_CONFIGS, server_to_benchmark_mapping) + + +def _build_base_args(params: dict[str, Any], host: str, port: int) -> list[str]: + exclude = { + "request_rate", + "max_concurrency", + "num_prompts", + "baseline", + "duration_sec", + "num_prompts_per_batch", + } + args = ["--host", host, "--port", str(port)] + for key, value in params.items(): + if key in exclude or value is None: + continue + arg_name = f"--{key.replace('_', '-')}" + if isinstance(value, bool) and value: + args.append(arg_name) + elif isinstance(value, dict): + args.extend([arg_name, json.dumps(value, 
ensure_ascii=False, separators=(",", ":"))]) + elif not isinstance(value, bool): + args.extend([arg_name, str(value)]) + return args + + +def _run_one_benchmark_batch( + host: str, + port: int, + params: dict[str, Any], + num_prompts: int, + request_rate: float | None, + max_concurrency: int | None, + result_dir: str, + batch_index: int, +) -> dict[str, Any]: + base = _build_base_args(params, host, port) + if request_rate is not None: + args = base + ["--request-rate", str(request_rate), "--num-prompts", str(num_prompts)] + flow = request_rate + else: + args = base + [ + "--max-concurrency", + str(max_concurrency), + "--num-prompts", + str(num_prompts), + "--request-rate", + "inf", + ] + flow = max_concurrency + + dataset_name = params.get("dataset_name", "random") + old_benchmark_dir = os.environ.get("BENCHMARK_DIR") + try: + os.environ["BENCHMARK_DIR"] = result_dir + result = run_benchmark( + args=args, + test_name="stability", + flow=flow, + dataset_name=dataset_name, + num_prompt=num_prompts, + ) + return result + except (FileNotFoundError, OSError) as e: + # Surface batch failure so the stability test does not false-pass when + # run_benchmark fails before writing JSON (e.g. command not found). 
+ return { + "completed": 0, + "failed": 1, + "duration": 0.0, + "errors": [f"Benchmark batch failed: {type(e).__name__}: {e}"], + } + finally: + if old_benchmark_dir is not None: + os.environ["BENCHMARK_DIR"] = old_benchmark_dir + elif "BENCHMARK_DIR" in os.environ: + os.environ.pop("BENCHMARK_DIR") + + +def _merge_batch_results(batch_results: list[dict[str, Any]], total_duration_sec: float) -> dict[str, Any]: + if not batch_results: + return {"completed": 0, "failed": 0, "duration": total_duration_sec, "errors": []} + + completed = sum(r.get("completed", 0) for r in batch_results) + failed = sum(r.get("failed", 0) for r in batch_results) + merged: dict[str, Any] = { + "completed": completed, + "failed": failed, + "duration": total_duration_sec, + "errors": [], + } + for r in batch_results: + merged["errors"].extend(r.get("errors") or []) + return merged + + +def _print_merged_report(result: dict[str, Any]) -> None: + """Print the final summary: successful requests, failed requests, and total duration only.""" + fmt = "{:<40} {:<10}" + fmt_float = "{:<40} {:<10.2f}" + completed = result.get("completed", 0) + failed = result.get("failed", 0) + duration = float(result.get("duration", 0.0) or 0.0) + print("\n============ Stability Benchmark Summary ============") + print(fmt.format("Successful requests:", completed)) + print(fmt.format("Failed requests:", failed)) + print(fmt_float.format("Total duration (s):", duration)) + print("==================================================\n") + + +def run_stability_benchmark( + host: str, + port: int, + duration_sec: int | float, + params: dict[str, Any], + *, + request_rate: float | None = None, + max_concurrency: int | None = None, + result_filename: str | None = None, + result_dir: str = "./", + num_prompts_per_batch: int = DEFAULT_NUM_PROMPTS_PER_BATCH, +) -> dict[str, Any]: + if (request_rate is None) == (max_concurrency is None): + raise ValueError("Exactly one of request_rate or max_concurrency must be specified") + + 
start_time = time.perf_counter() + batch_results: list[dict[str, Any]] = [] + batch_index = 0 + + while True: + if (time.perf_counter() - start_time) >= duration_sec: + break + result = _run_one_benchmark_batch( + host=host, + port=port, + params=params, + num_prompts=num_prompts_per_batch, + request_rate=request_rate, + max_concurrency=max_concurrency, + result_dir=result_dir, + batch_index=batch_index, + ) + batch_results.append(result) + batch_index += 1 + if (time.perf_counter() - start_time) >= duration_sec: + break + + total_duration = time.perf_counter() - start_time + merged = _merge_batch_results(batch_results, total_duration) + _print_merged_report(merged) + + if result_filename and result_dir: + result_path = Path(result_dir) / result_filename + with open(result_path, "w", encoding="utf-8") as f: + json.dump(merged, f, indent=2, ensure_ascii=False) + + return merged + + +@pytest.fixture(scope="module") +def omni_server(request): + """Start vLLM-Omni server as a subprocess with actual model weights. + Uses module scope so the server starts only once per test module and server parameter. + Multi-stage initialization can take 10-20+ minutes. 
+ """ + with _omni_server_lock: + test_name, model, stage_config_path = request.param + + print(f"Starting OmniServer with test: {test_name}, model: {model}") + + with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server: + server.test_name = test_name + print("OmniServer started successfully") + yield server + print("OmniServer stopping...") + + print("OmniServer stopped") + + +@pytest.fixture(params=benchmark_indices) +def stability_benchmark_params(request, omni_server): + """Benchmark parameters fixture with proper parametrization (same as perf).""" + test_name, param_index = request.param + + if test_name != omni_server.test_name: + pytest.skip(f"Skipping parameter for {test_name} - current server is {omni_server.test_name}") + + all_params = get_benchmark_params_for_server(test_name, server_to_benchmark_mapping) + + if not all_params: + raise ValueError(f"No benchmark parameters found for test: {test_name}") + + if param_index >= len(all_params): + raise ValueError(f"No benchmark parameters found for index {param_index} in test: {test_name}") + + current = param_index + 1 + total = len(all_params) + print(f"\n Running benchmark {current}/{total} for {test_name}") + + return {"test_name": test_name, "params": all_params[param_index]} + + +@pytest.mark.slow +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +@pytest.mark.parametrize("stability_benchmark_params", benchmark_indices, indirect=True) +def test_benchmark_stability(omni_server, stability_benchmark_params): + """Run the benchmark for a fixed duration using request-rate or max-concurrency and assert zero failed requests.""" + test_name = stability_benchmark_params["test_name"] + params = stability_benchmark_params["params"] + duration_sec = params.get("duration_sec", 300) + num_prompts_per_batch = params.get("num_prompts_per_batch", DEFAULT_NUM_PROMPTS_PER_BATCH) + request_rate = params.get("request_rate") + max_concurrency = 
params.get("max_concurrency") + + bench_params = { + k: v + for k, v in params.items() + if k not in ("duration_sec", "request_rate", "max_concurrency", "num_prompts_per_batch") + } + + result = run_stability_benchmark( + host=omni_server.host, + port=omni_server.port, + duration_sec=duration_sec, + params=bench_params, + request_rate=request_rate, + max_concurrency=max_concurrency, + result_dir=str(STABILITY_DIR), + num_prompts_per_batch=num_prompts_per_batch, + ) + + assert result.get("failed", 0) == 0, f"[{test_name}] Failed requests detected: {result.get('errors', [])}" + assert result.get("completed", 0) > 0, f"[{test_name}] No requests completed" diff --git a/tests/dfx/stability/scripts/test_stability_qwen3_omni.py b/tests/dfx/stability/scripts/test_stability_qwen3_omni.py deleted file mode 100644 index d1c2af8cf08..00000000000 --- a/tests/dfx/stability/scripts/test_stability_qwen3_omni.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Qwen3-Omni stability: OmniServer + ``vllm bench serve --omni`` for a fixed duration. - -Configuration: ``tests/dfx/stability/tests/test_qwen3_omni.json``. 
-""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from tests.dfx.conftest import ( - create_benchmark_indices, - create_test_parameter_mapping, - create_unique_server_params, - load_configs, -) -from tests.dfx.stability.helpers import _run_one_vllm_bench_batch, run_stability_benchmark_loop - -STABILITY_DIR = Path(__file__).resolve().parent.parent -DEPLOY_CONFIGS_DIR = STABILITY_DIR / "deploy" -CONFIG_FILE_PATH = str(STABILITY_DIR / "tests" / "test_qwen3_omni.json") -DEFAULT_NUM_PROMPTS_PER_BATCH = 20 -STABILITY_SERVER_TIMEOUT_ARGS = ["--stage-init-timeout", "600"] - -try: - BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) -except FileNotFoundError: - BENCHMARK_CONFIGS = [] - -test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] -server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) if BENCHMARK_CONFIGS else {} -benchmark_indices = create_benchmark_indices(BENCHMARK_CONFIGS, server_to_benchmark_mapping) - - -@pytest.mark.slow -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -@pytest.mark.parametrize("stability_benchmark_params", benchmark_indices, indirect=True) -def test_stability_qwen3_omni(omni_server, stability_benchmark_params): - test_name = stability_benchmark_params["test_name"] - params = stability_benchmark_params["params"] - duration_sec = params.get("duration_sec", 300) - num_prompts_per_batch = params.get("num_prompts_per_batch", DEFAULT_NUM_PROMPTS_PER_BATCH) - request_rate = params.get("request_rate") - max_concurrency = params.get("max_concurrency") - - bench_params = { - k: v - for k, v in params.items() - if k not in ("duration_sec", "request_rate", "max_concurrency", "num_prompts_per_batch") - } - - result = run_stability_benchmark_loop( - host=omni_server.host, - port=omni_server.port, - model=omni_server.model, - duration_sec=duration_sec, - params=bench_params, - request_rate=request_rate, - 
max_concurrency=max_concurrency, - result_dir=str(STABILITY_DIR), - num_prompts_per_batch=num_prompts_per_batch, - run_one_batch=_run_one_vllm_bench_batch, - ) - - assert result.get("failed", 0) == 0, f"[{test_name}] Failed requests detected: {result.get('errors', [])}" - assert result.get("completed", 0) > 0, f"[{test_name}] No requests completed" diff --git a/tests/dfx/stability/scripts/test_stability_qwen3_tts.py b/tests/dfx/stability/scripts/test_stability_qwen3_tts.py deleted file mode 100644 index beccd67d964..00000000000 --- a/tests/dfx/stability/scripts/test_stability_qwen3_tts.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Qwen3-TTS stability: OmniServer + ``vllm bench serve --omni`` for a fixed duration. - -Configuration: ``tests/dfx/stability/tests/test_qwen3_tts.json``. -""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from tests.dfx.conftest import ( - create_benchmark_indices, - create_test_parameter_mapping, - create_unique_server_params, - load_configs, -) -from tests.dfx.stability.helpers import _run_one_vllm_bench_batch, run_stability_benchmark_loop - -STABILITY_DIR = Path(__file__).resolve().parent.parent -DEPLOY_CONFIGS_DIR = STABILITY_DIR / "deploy" -CONFIG_FILE_PATH = str(STABILITY_DIR / "tests" / "test_qwen3_tts.json") -DEFAULT_NUM_PROMPTS_PER_BATCH = 20 -STABILITY_SERVER_TIMEOUT_ARGS = ["--stage-init-timeout", "600"] - -try: - BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) -except FileNotFoundError: - BENCHMARK_CONFIGS = [] - -test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] -server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) if BENCHMARK_CONFIGS else {} -benchmark_indices = create_benchmark_indices(BENCHMARK_CONFIGS, server_to_benchmark_mapping) - - -@pytest.mark.slow -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -@pytest.mark.parametrize("stability_benchmark_params", benchmark_indices, 
indirect=True) -def test_stability_qwen3_tts(omni_server, stability_benchmark_params): - test_name = stability_benchmark_params["test_name"] - params = stability_benchmark_params["params"] - duration_sec = params.get("duration_sec", 300) - num_prompts_per_batch = params.get("num_prompts_per_batch", DEFAULT_NUM_PROMPTS_PER_BATCH) - request_rate = params.get("request_rate") - max_concurrency = params.get("max_concurrency") - - bench_params = { - k: v - for k, v in params.items() - if k not in ("duration_sec", "request_rate", "max_concurrency", "num_prompts_per_batch") - } - - result = run_stability_benchmark_loop( - host=omni_server.host, - port=omni_server.port, - model=omni_server.model, - duration_sec=duration_sec, - params=bench_params, - request_rate=request_rate, - max_concurrency=max_concurrency, - result_dir=str(STABILITY_DIR), - num_prompts_per_batch=num_prompts_per_batch, - run_one_batch=_run_one_vllm_bench_batch, - ) - - assert result.get("failed", 0) == 0, f"[{test_name}] Failed requests detected: {result.get('errors', [])}" - assert result.get("completed", 0) > 0, f"[{test_name}] No requests completed" diff --git a/tests/dfx/stability/scripts/test_stability_qwen_image.py b/tests/dfx/stability/scripts/test_stability_qwen_image.py deleted file mode 100644 index a90e2092f5e..00000000000 --- a/tests/dfx/stability/scripts/test_stability_qwen_image.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Qwen-Image stability: OmniServer (diffusion) + ``diffusion_benchmark_serving.py``. - -Configuration: ``tests/dfx/stability/tests/test_qwen_image.json``. 
-""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from tests.dfx.conftest import ( - create_benchmark_indices, - create_test_parameter_mapping, - create_unique_server_params, - load_configs, -) -from tests.dfx.stability.helpers import _run_one_diffusion_batch, run_stability_benchmark_loop - -STABILITY_DIR = Path(__file__).resolve().parent.parent -DEPLOY_CONFIGS_DIR = STABILITY_DIR / "deploy" -CONFIG_FILE_PATH = str(STABILITY_DIR / "tests" / "test_qwen_image.json") -DEFAULT_NUM_PROMPTS_PER_BATCH = 20 -STABILITY_SERVER_TIMEOUT_ARGS = ["--stage-init-timeout", "600", "--init-timeout", "900"] - -try: - BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) -except FileNotFoundError: - BENCHMARK_CONFIGS = [] - -test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] -server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) if BENCHMARK_CONFIGS else {} -benchmark_indices = create_benchmark_indices(BENCHMARK_CONFIGS, server_to_benchmark_mapping) - - -@pytest.mark.slow -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -@pytest.mark.parametrize("stability_benchmark_params", benchmark_indices, indirect=True) -def test_stability_qwen_image(omni_server, stability_benchmark_params): - test_name = stability_benchmark_params["test_name"] - params = stability_benchmark_params["params"] - duration_sec = params.get("duration_sec", 300) - num_prompts_per_batch = params.get("num_prompts_per_batch", DEFAULT_NUM_PROMPTS_PER_BATCH) - request_rate = params.get("request_rate") - max_concurrency = params.get("max_concurrency") - - bench_params = { - k: v - for k, v in params.items() - if k not in ("duration_sec", "request_rate", "max_concurrency", "num_prompts_per_batch") - } - - result = run_stability_benchmark_loop( - host=omni_server.host, - port=omni_server.port, - model=omni_server.model, - duration_sec=duration_sec, - params=bench_params, - 
request_rate=request_rate, - max_concurrency=max_concurrency, - result_dir=str(STABILITY_DIR), - num_prompts_per_batch=num_prompts_per_batch, - run_one_batch=_run_one_diffusion_batch, - ) - - assert result.get("failed", 0) == 0, f"[{test_name}] Failed requests detected: {result.get('errors', [])}" - assert result.get("completed", 0) > 0, f"[{test_name}] No requests completed" diff --git a/tests/dfx/stability/scripts/test_stability_wan22.py b/tests/dfx/stability/scripts/test_stability_wan22.py deleted file mode 100644 index afe9c4d0ca7..00000000000 --- a/tests/dfx/stability/scripts/test_stability_wan22.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Wan2.2 T2V stability: OmniServer (diffusion) + ``diffusion_benchmark_serving.py`` / ``v1/videos``. - -Configuration: ``tests/dfx/stability/tests/test_wan22.json``. -""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from tests.dfx.conftest import ( - create_benchmark_indices, - create_test_parameter_mapping, - create_unique_server_params, - load_configs, -) -from tests.dfx.stability.helpers import _run_one_diffusion_batch, run_stability_benchmark_loop - -STABILITY_DIR = Path(__file__).resolve().parent.parent -DEPLOY_CONFIGS_DIR = STABILITY_DIR / "deploy" -CONFIG_FILE_PATH = str(STABILITY_DIR / "tests" / "test_wan22.json") -DEFAULT_NUM_PROMPTS_PER_BATCH = 20 -STABILITY_SERVER_TIMEOUT_ARGS = ["--stage-init-timeout", "600", "--init-timeout", "900"] - -try: - BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) -except FileNotFoundError: - BENCHMARK_CONFIGS = [] - -test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] -server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) if BENCHMARK_CONFIGS else {} -benchmark_indices = create_benchmark_indices(BENCHMARK_CONFIGS, server_to_benchmark_mapping) - - -@pytest.mark.slow -@pytest.mark.parametrize("omni_server", test_params, indirect=True) 
-@pytest.mark.parametrize("stability_benchmark_params", benchmark_indices, indirect=True) -def test_stability_wan22(omni_server, stability_benchmark_params): - test_name = stability_benchmark_params["test_name"] - params = stability_benchmark_params["params"] - duration_sec = params.get("duration_sec", 300) - num_prompts_per_batch = params.get("num_prompts_per_batch", DEFAULT_NUM_PROMPTS_PER_BATCH) - request_rate = params.get("request_rate") - max_concurrency = params.get("max_concurrency") - - bench_params = { - k: v - for k, v in params.items() - if k not in ("duration_sec", "request_rate", "max_concurrency", "num_prompts_per_batch") - } - - result = run_stability_benchmark_loop( - host=omni_server.host, - port=omni_server.port, - model=omni_server.model, - duration_sec=duration_sec, - params=bench_params, - request_rate=request_rate, - max_concurrency=max_concurrency, - result_dir=str(STABILITY_DIR), - num_prompts_per_batch=num_prompts_per_batch, - run_one_batch=_run_one_diffusion_batch, - ) - - assert result.get("failed", 0) == 0, f"[{test_name}] Failed requests detected: {result.get('errors', [])}" - assert result.get("completed", 0) > 0, f"[{test_name}] No requests completed" diff --git a/tests/dfx/stability/stage_configs/qwen3_omni.yaml b/tests/dfx/stability/stage_configs/qwen3_omni.yaml new file mode 100644 index 00000000000..802f8dd2494 --- /dev/null +++ b/tests/dfx/stability/stage_configs/qwen3_omni.yaml @@ -0,0 +1,101 @@ +# Stage config for running Qwen3-Omni-MoE with 3-stage architecture +# Stage 0: Thinker (multimodal understanding + text generation) +# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) +# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) + +# The following config has been verified on 2x H100-80G GPUs. 
+async_chunk: false +stage_args: + - stage_id: 0 + stage_type: llm # Use llm stage type to launch OmniLLM + runtime: + devices: "0" + max_batch_size: 64 + engine_args: + model_stage: thinker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.9 + enforce_eager: false + trust_remote_code: true + engine_output_type: latent # Output hidden states for talker + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + hf_config_name: thinker_config + tensor_parallel_size: 1 + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + + - stage_id: 1 + stage_type: llm # Use llm stage type to launch OmniLLM + runtime: + devices: "1" + max_batch_size: 64 + engine_args: + model_stage: talker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.6 + enforce_eager: false + trust_remote_code: true + engine_output_type: latent # Output codec codes for code2wav + enable_prefix_caching: false + max_num_batched_tokens: 32768 + distributed_executor_backend: "mp" + hf_config_name: talker_config + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker + # final_output: true + # final_output_type: text + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: False + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 2 + stage_type: llm # Use llm stage type to launch OmniLLM + runtime: + devices: "1" + max_batch_size: 64 + engine_args: + model_stage: code2wav + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: 
generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio # Final output: audio waveform + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 1000000 + hf_config_name: thinker_config + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: True + repetition_penalty: 1.1 diff --git a/tests/dfx/stability/tests/test.json b/tests/dfx/stability/tests/test.json new file mode 100644 index 00000000000..95993c9c556 --- /dev/null +++ b/tests/dfx/stability/tests/test.json @@ -0,0 +1,86 @@ +[ + { + "test_name": "test_qwen3_omni_stability", + "server_params": { + "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", + "stage_config_name": "qwen3_omni.yaml" + }, + "benchmark_params": [ + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "duration_sec": 300, + "request_rate": 0.2, + "num_prompts_per_batch": 10, + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration" + }, + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "duration_sec": 300, + "max_concurrency": 2, + "num_prompts_per_batch": 10, + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration" + } + ] + }, + { + "test_name": "test_qwen3_omni_stability_async_chunk", + "server_params": { + "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", + "stage_config_name": "qwen3_omni.yaml", + 
"update": { + "async_chunk": true, + "stage_args": { + "0": { + "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" + }, + "1": { + "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" + } + } + }, + "delete": { + "stage_args": { + "2": [ + "custom_process_input_func" + ] + } + } + }, + "benchmark_params": [ + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "duration_sec": 300, + "request_rate": 0.2, + "num_prompts_per_batch": 10, + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration" + }, + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "duration_sec": 300, + "max_concurrency": 2, + "num_prompts_per_batch": 10, + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration" + } + ] + } +] diff --git a/tests/dfx/stability/tests/test_qwen3_omni.json b/tests/dfx/stability/tests/test_qwen3_omni.json deleted file mode 100644 index a16ab805cc6..00000000000 --- a/tests/dfx/stability/tests/test_qwen3_omni.json +++ /dev/null @@ -1,97 +0,0 @@ -[ - { - "test_name": "test_qwen3_omni_stability", - "server_params": { - "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_overrides": { - "2": { - "max_num_batched_tokens": 1000000 - } - } - }, - "benchmark_params": [ - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "duration_sec": 86400, - "request_rate": 0.3, - "num_prompts_per_batch": 10, - "random_input_len": { - "min": 0, - "max": 8000 - }, - "random_output_len": { - "min": 0, - "max": 1000 - }, - "random_range_ratio": 0.0, - 
"random_mm_base_items_per_request": { - "min": 0, - "max": 6 - }, - "random_mm_num_mm_items_range_ratio": 0.0, - "random_mm_limit_mm_per_prompt": { - "image": 2, - "video": 2, - "audio": 2 - }, - "random_mm_bucket_config": { - "(128-1024, 128-1024, 1)": 0.34, - "(256-1080, 256-1920, 2-16)": 0.33, - "(0, 1-60, 1-3)": 0.33 - }, - "ignore_eos": true, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration" - } - ] - }, - { - "test_name": "test_qwen3_omni_stability_async_chunk", - "server_params": { - "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_overrides": { - "2": { - "max_num_batched_tokens": 1000000 - } - }, - "extra_cli_args": ["--async-chunk"] - }, - "benchmark_params": [ - { - "dataset_name": "random-mm", - "backend": "openai-chat-omni", - "endpoint": "/v1/chat/completions", - "duration_sec": 86400, - "max_concurrency": 2, - "num_prompts_per_batch": 10, - "random_input_len": { - "min": 0, - "max": 8000 - }, - "random_output_len": { - "min": 0, - "max": 1000 - }, - "random_range_ratio": 0.0, - "random_mm_base_items_per_request": { - "min": 0, - "max": 6 - }, - "random_mm_num_mm_items_range_ratio": 0.0, - "random_mm_limit_mm_per_prompt": { - "image": 2, - "video": 2, - "audio": 2 - }, - "random_mm_bucket_config": { - "(128-1024, 128-1024, 1)": 0.34, - "(256-1080, 256-1920, 2-16)": 0.33, - "(0, 1-60, 1-3)": 0.33 - }, - "ignore_eos": true, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration" - } - ] - } -] diff --git a/tests/dfx/stability/tests/test_qwen3_tts.json b/tests/dfx/stability/tests/test_qwen3_tts.json deleted file mode 100644 index fbf30d88ab2..00000000000 --- a/tests/dfx/stability/tests/test_qwen3_tts.json +++ /dev/null @@ -1,56 +0,0 @@ -[ - { - "test_name": "test_qwen3_tts_stability", - "server_params": { - "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" - }, - "benchmark_params": [ - { - "dataset_name": "random", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - 
"duration_sec": 86400, - "request_rate": 0.3, - "num_prompts_per_batch": 10, - "random_input_len": { - "min": 0, - "max": 1000 - }, - "random_output_len": { - "min": 0, - "max": 1000 - }, - "random_range_ratio": 0.0, - "extra_body": { - "voice": "Vivian", - "language": "English" - }, - "ignore_eos": true, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration" - }, - { - "dataset_name": "random", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "duration_sec": 86400, - "max_concurrency": 2, - "num_prompts_per_batch": 10, - "random_input_len": { - "min": 0, - "max": 1000 - }, - "random_output_len": { - "min": 0, - "max": 1000 - }, - "random_range_ratio": 0.0, - "extra_body": { - "voice": "Vivian", - "language": "English" - }, - "ignore_eos": true, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration" - } - ] - } -] diff --git a/tests/dfx/stability/tests/test_qwen_image.json b/tests/dfx/stability/tests/test_qwen_image.json deleted file mode 100644 index f3dd93f6f25..00000000000 --- a/tests/dfx/stability/tests/test_qwen_image.json +++ /dev/null @@ -1,28 +0,0 @@ -[ - { - "test_name": "test_qwen_image_stability", - "server_params": { - "model": "Qwen/Qwen-Image" - }, - "benchmark_params": [ - { - "dataset": "random", - "task": "t2i", - "backend": "vllm-omni", - "duration_sec": 86400, - "max_concurrency": 1, - "num_prompts_per_batch": 10, - "width": { - "min": 512, - "max": 2048 - }, - "height": { - "min": 512, - "max": 2048 - }, - "num_inference_steps": 50, - "enable_negative_prompt": true - } - ] - } -] diff --git a/tests/dfx/stability/tests/test_wan22.json b/tests/dfx/stability/tests/test_wan22.json deleted file mode 100644 index c787ce96a07..00000000000 --- a/tests/dfx/stability/tests/test_wan22.json +++ /dev/null @@ -1,31 +0,0 @@ -[ - { - "test_name": "test_wan22_i2v_stability_v1_videos", - "server_params": { - "model": "Wan-AI/Wan2.2-I2V-A14B-Diffusers", - "serve_args": { - "ulysses-degree": 2, - 
"vae-patch-parallel-size": 2, - "tensor-parallel-size": 1, - "use-hsdp": true, - "vae-use-slicing": true, - "vae-use-tiling": true - } - }, - "benchmark_params": [ - { - "dataset": "random", - "task": "i2v", - "backend": "v1/videos", - "duration_sec": 86400, - "max_concurrency": 1, - "num_prompts_per_batch": 20, - "enable_negative_prompt": true, - "random_request_config": [ - {"width": 832, "height": 480, "num_inference_steps": 4, "num_frames": 81, "fps": 16, "weight": 0.5}, - {"width": 1280, "height": 720, "num_inference_steps": 4, "num_frames": 121, "fps": 16, "weight": 0.5} - ] - } - ] - } -] diff --git a/tests/diffusion/cache/test_cache_dit.py b/tests/diffusion/cache/test_cache_dit.py deleted file mode 100644 index 0b7ef723585..00000000000 --- a/tests/diffusion/cache/test_cache_dit.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -Model specific tests for CacheDiT enablement. -""" - -from unittest.mock import Mock, patch - -import pytest - -import vllm_omni.diffusion.cache.cache_dit_backend as cd_backend -from vllm_omni.diffusion.data import DiffusionCacheConfig - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - -SEPARATE_CFG_ENABLERS = [ - cd_backend.enable_cache_for_ltx2, - cd_backend.enable_cache_for_wan22, - cd_backend.enable_cache_for_longcat_image, -] - -SAMPLE_CACHE_CONFIG = DiffusionCacheConfig() - - -@pytest.mark.parametrize("enabler", SEPARATE_CFG_ENABLERS) -@patch("vllm_omni.diffusion.cache.cache_dit_backend.BlockAdapter") -@patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit") -def test_separate_cfg(mock_cache_dit, mock_block_adapter, enabler): - """Ensure that custom enablers for models with separate CFG pass - the param through to cache_dit correctly. 
- - Regression test for: https://github.com/vllm-project/vllm-omni/pull/2860 - """ - mock_pipeline = Mock() - enabler(mock_pipeline, SAMPLE_CACHE_CONFIG) - - mock_cache_dit.enable_cache.assert_called_once() - adapter_kwargs = mock_block_adapter.call_args.kwargs - assert adapter_kwargs["has_separate_cfg"] is True diff --git a/tests/diffusion/cache/test_teacache_extractors.py b/tests/diffusion/cache/test_teacache_extractors.py index 4bb958a36c1..5ba52ddfe2d 100644 --- a/tests/diffusion/cache/test_teacache_extractors.py +++ b/tests/diffusion/cache/test_teacache_extractors.py @@ -21,13 +21,12 @@ import pytest import torch -from tests.helpers.mark import hardware_test -from vllm_omni.diffusion.cache.teacache.extractors import extract_flux2_context, extract_flux2_klein_context +from vllm_omni.diffusion.cache.teacache.extractors import extract_flux2_klein_context from vllm_omni.diffusion.models.flux2_klein.flux2_klein_transformer import ( Flux2Transformer2DModel, ) -pytestmark = [pytest.mark.core_model] +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @pytest.fixture(scope="function", autouse=True) @@ -114,7 +113,6 @@ def sample_inputs(self): def get_sample_inputs(self, sample_inputs): return sample_inputs - @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_modulated_input_shape(self, flux2_klein_module, sample_inputs): """Test that modulated_input has correct shape matching the model's inner_dim. 
@@ -128,19 +126,16 @@ def test_modulated_input_shape(self, flux2_klein_module, sample_inputs): inner_dim = flux2_klein_module.inner_dim assert context.modulated_input.shape == (batch_size, img_seq_len, inner_dim) - @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_run_transformer_blocks_callable(self, flux2_klein_module, sample_inputs): """Test that run_transformer_blocks is callable.""" context = extract_flux2_klein_context(flux2_klein_module, **sample_inputs) assert callable(context.run_transformer_blocks) - @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_postprocess_callable(self, flux2_klein_module, sample_inputs): """Test that postprocess is callable.""" context = extract_flux2_klein_context(flux2_klein_module, **sample_inputs) assert callable(context.postprocess) - @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_extra_states_contains_full_transformer(self, flux2_klein_module, sample_inputs): """Test that extra_states contains run_flux2_full_transformer_with_single.""" context = extract_flux2_klein_context(flux2_klein_module, **sample_inputs) @@ -159,7 +154,6 @@ def test_without_guidance(self, flux2_klein_module, sample_inputs): assert context is not None assert context.temb is not None - @pytest.mark.cpu def test_invalid_module_raises_error(self): """Test that invalid module without transformer_blocks raises ValueError.""" invalid_module = Mock() @@ -174,106 +168,3 @@ def test_invalid_module_raises_error(self): img_ids=torch.randint(0, 64, (1, 1024, 4)), txt_ids=torch.randint(0, 64, (1, 512, 4)), ) - - -class TestFlux2Extractor(BaseExtractorTest): - """Test extract_flux2_context function.""" - - def get_extractor(self): - return extract_flux2_context - - @pytest.fixture - def flux2_module(self): - """Create a minimal Flux2Transformer2DModel for testing.""" - from vllm_omni.diffusion.models.flux2.flux2_transformer import Flux2Transformer2DModel - - model = Flux2Transformer2DModel( - num_layers=2, - num_single_layers=2, - 
num_attention_heads=48, - attention_head_dim=128, - joint_attention_dim=15360, - ) - return model - - def get_module(self, flux2_module): - return flux2_module - - @pytest.fixture - def sample_inputs(self): - """Create sample input tensors for Flux2. - - Note: hidden_states uses in_channels=128 (default for Flux2), - not inner_dim=6144. The x_embedder projects from 128 -> 6144. - encoder_hidden_states uses joint_attention_dim=15360 (model default), - which then gets projected to inner_dim=6144 by context_embedder. - """ - batch_size = 1 - img_seq_len = 1024 - txt_seq_len = 512 - in_channels = 128 # Model default in_channels - txt_dim = 15360 # Model default joint_attention_dim - - return { - "hidden_states": torch.randn(batch_size, img_seq_len, in_channels), - "encoder_hidden_states": torch.randn(batch_size, txt_seq_len, txt_dim), - "timestep": torch.tensor([500]), - "img_ids": torch.randint(0, 64, (batch_size, img_seq_len, 4)), - "txt_ids": torch.randint(0, 64, (batch_size, txt_seq_len, 4)), - "guidance": torch.tensor([3.5]), - } - - def get_sample_inputs(self, sample_inputs): - return sample_inputs - - @hardware_test(res={"cuda": "L4"}, num_cards=1) - def test_modulated_input_shape(self, flux2_module, sample_inputs): - """Test that modulated_input has correct shape matching the model's inner_dim. - - Note: After x_embedder projection, hidden_states are projected from - in_channels (128) to inner_dim (6144), so modulated_input should match - the projected shape, not the input shape. 
- """ - context = extract_flux2_klein_context(flux2_module, **sample_inputs) - - batch_size, img_seq_len, _ = sample_inputs["hidden_states"].shape - inner_dim = flux2_module.inner_dim - assert context.modulated_input.shape == (batch_size, img_seq_len, inner_dim) - - @hardware_test(res={"cuda": "L4"}, num_cards=1) - def test_run_transformer_blocks_callable(self, flux2_module, sample_inputs): - """Test that run_transformer_blocks is callable.""" - context = extract_flux2_context(flux2_module, **sample_inputs) - assert callable(context.run_transformer_blocks) - - @hardware_test(res={"cuda": "L4"}, num_cards=1) - def test_postprocess_callable(self, flux2_module, sample_inputs): - """Test that postprocess is callable.""" - context = extract_flux2_context(flux2_module, **sample_inputs) - assert callable(context.postprocess) - - def test_without_guidance(self, flux2_module, sample_inputs): - """Test context extraction works without guidance (no CFG).""" - inputs = sample_inputs.copy() - inputs["guidance"] = None - - context = extract_flux2_context(flux2_module, **inputs) - - assert context is not None - assert context.temb is not None - - @pytest.mark.cpu - def test_invalid_module_raises_error(self): - """Test that invalid module without transformer_blocks raises ValueError.""" - invalid_module = Mock() - invalid_module.transformer_blocks = [] - - with pytest.raises(ValueError, match="Module must have transformer_blocks"): - extract_flux2_context( - invalid_module, - hidden_states=torch.randn(1, 1024, 6144), - encoder_hidden_states=torch.randn(1, 512, 15360), - timestep=torch.tensor([500]), - img_ids=torch.randint(0, 64, (1, 1024, 4)), - txt_ids=torch.randint(0, 64, (1, 512, 4)), - ) diff --git a/tests/diffusion/distributed/test_autoencoder_kl_wan.py b/tests/diffusion/distributed/test_autoencoder_kl_wan.py deleted file mode 100644 index 2ea1c1214b8..00000000000 --- a/tests/diffusion/distributed/test_autoencoder_kl_wan.py +++ /dev/null @@ -1,43 +0,0 @@ -import pytest 
-import torch - -from vllm_omni.diffusion.distributed.autoencoders import autoencoder_kl_wan as wan_vae_module -from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import OmniAutoencoderKLWan - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -class _DummyOmniAutoencoderKLWan(OmniAutoencoderKLWan): - def __init__(self, *, dtype: torch.dtype): - torch.nn.Module.__init__(self) - self.register_parameter("dummy_weight", torch.nn.Parameter(torch.ones(1, dtype=dtype))) - - -def test_wan_vae_execution_context_handles_fp32(): - model = _DummyOmniAutoencoderKLWan(dtype=torch.float32) - with model._execution_context(): - output = model.dummy_weight + 1 - assert output.dtype == torch.float32 - - -def test_wan_vae_execution_context_handles_bf16(): - model = _DummyOmniAutoencoderKLWan(dtype=torch.bfloat16) - with model._execution_context(): - output = model.dummy_weight + 1 - assert output.dtype == torch.bfloat16 - - -def test_wan_vae_execution_context_uses_platform_autocast(mocker): - sentinel = object() - platform = mocker.Mock() - platform.create_autocast_context.return_value = sentinel - mocker.patch.object(wan_vae_module, "current_omni_platform", platform) - - model = _DummyOmniAutoencoderKLWan(dtype=torch.bfloat16) - - assert model._execution_context() is sentinel - platform.create_autocast_context.assert_called_once_with( - device_type=model.dummy_weight.device.type, - dtype=torch.bfloat16, - enabled=True, - ) diff --git a/tests/diffusion/distributed/test_autoencoder_kl_wan_encode.py b/tests/diffusion/distributed/test_autoencoder_kl_wan_encode.py deleted file mode 100644 index 7a18fa66da3..00000000000 --- a/tests/diffusion/distributed/test_autoencoder_kl_wan_encode.py +++ /dev/null @@ -1,273 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Unit tests for DistributedAutoencoderKLWan encode parallel (CPU-only).""" - -import pytest -import torch - -pytestmark = 
[pytest.mark.cpu, pytest.mark.core_model] - - -class _DummyConfig: - def __init__(self, patch_size=None, scale_factor_temporal=4): - self.patch_size = patch_size - self.scale_factor_temporal = scale_factor_temporal - - -class _DummyWanVae: - """Minimal mock of DistributedAutoencoderKLWan for testing encode_tile_split.""" - - def __init__( - self, - config=None, - spatial_compression_ratio=8, - tile_sample_min_height=256, - tile_sample_min_width=256, - tile_sample_stride_height=192, - tile_sample_stride_width=192, - ): - self.config = config or _DummyConfig() - self.spatial_compression_ratio = spatial_compression_ratio - self.tile_sample_min_height = tile_sample_min_height - self.tile_sample_min_width = tile_sample_min_width - self.tile_sample_stride_height = tile_sample_stride_height - self.tile_sample_stride_width = tile_sample_stride_width - self.dtype = torch.float32 - - # Mock caches - self._enc_feat_map = None - self._enc_conv_idx = [0] - - def clear_cache(self): - self._enc_feat_map = None - self._enc_conv_idx = [0] - - def encoder(self, x, feat_cache=None, feat_idx=None): # noqa: ARG002 - # Simple mock: just return the input - return x - - def quant_conv(self, x): - return x - - def blend_v(self, _a, b, _blend_extent): - return b - - def blend_h(self, _a, b, _blend_extent): - return b - - -def _import_encode_tile_split(): - """Import the encode_tile_split method from the module.""" - from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import ( - DistributedAutoencoderKLWan, - ) - - return DistributedAutoencoderKLWan.encode_tile_split - - -def _import_encode_tile_exec(): - from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import ( - DistributedAutoencoderKLWan, - ) - - return DistributedAutoencoderKLWan.encode_tile_exec - - -def _import_encode_tile_merge(): - from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import ( - DistributedAutoencoderKLWan, - ) - - return 
DistributedAutoencoderKLWan.encode_tile_merge - - -class TestEncodeTileSplit: - """Tests for encode_tile_split method.""" - - def test_basic_split_without_patch_size(self): - """Test basic tile splitting without patch_size.""" - encode_tile_split = _import_encode_tile_split() - - vae = _DummyWanVae( - config=_DummyConfig(patch_size=None, scale_factor_temporal=4), - spatial_compression_ratio=8, - tile_sample_min_height=256, - tile_sample_min_width=256, - tile_sample_stride_height=192, - tile_sample_stride_width=192, - ) - - # Input: (B, C, T, H, W) = (1, 3, 5, 256, 256) - x = torch.randn(1, 3, 5, 256, 256) - - tiletask_list, grid_spec = encode_tile_split(vae, x) - - # With stride 192 and input size 256, we should get: - # Height: ceil(256/192) = 2 positions (0, 192) but 192+256 > 256, so only 1 - # Actually for i in range(0, 256, 192): i = 0, 192 but 192 is out of bounds - # So we get 1x1 grid - assert len(tiletask_list) >= 1 - assert grid_spec.grid_shape[0] >= 1 - assert grid_spec.grid_shape[1] >= 1 - - # Check temporal chunking: 5 frames -> 1 + (5-1)//4 = 2 chunks - first_task = tiletask_list[0] - assert len(first_task.tensor) == 2 # 2 temporal chunks - - def test_split_with_patch_size_scales_coordinates(self): - """Test that patch_size properly scales tile coordinates.""" - encode_tile_split = _import_encode_tile_split() - - # Without patch_size - vae_no_patch = _DummyWanVae( - config=_DummyConfig(patch_size=None, scale_factor_temporal=4), - spatial_compression_ratio=8, - tile_sample_min_height=256, - tile_sample_min_width=256, - tile_sample_stride_height=128, - tile_sample_stride_width=128, - ) - - # With patch_size=2 (simulating patchified input) - vae_with_patch = _DummyWanVae( - config=_DummyConfig(patch_size=2, scale_factor_temporal=4), - spatial_compression_ratio=8, - tile_sample_min_height=256, - tile_sample_min_width=256, - tile_sample_stride_height=128, - tile_sample_stride_width=128, - ) - - # Same patchified input size - x = torch.randn(1, 3, 5, 256, 
256) - - tasks_no_patch, _ = encode_tile_split(vae_no_patch, x) - tasks_with_patch, _ = encode_tile_split(vae_with_patch, x) - - # With patch_size=2, stride becomes 128//2=64, so more tiles - assert len(tasks_with_patch) >= len(tasks_no_patch) - - def test_temporal_compression_from_config(self): - """Test that temporal compression ratio is read from config.""" - encode_tile_split = _import_encode_tile_split() - - # temporal_compression=4 (default) - vae_4x = _DummyWanVae( - config=_DummyConfig(scale_factor_temporal=4), - tile_sample_min_height=512, - tile_sample_min_width=512, - tile_sample_stride_height=512, - tile_sample_stride_width=512, - ) - - # temporal_compression=2 - vae_2x = _DummyWanVae( - config=_DummyConfig(scale_factor_temporal=2), - tile_sample_min_height=512, - tile_sample_min_width=512, - tile_sample_stride_height=512, - tile_sample_stride_width=512, - ) - - # 9 frames input - x = torch.randn(1, 3, 9, 512, 512) - - tasks_4x, _ = encode_tile_split(vae_4x, x) - tasks_2x, _ = encode_tile_split(vae_2x, x) - - # With 4x compression: 1 + (9-1)//4 = 3 chunks - assert len(tasks_4x[0].tensor) == 3 - - # With 2x compression: 1 + (9-1)//2 = 5 chunks - assert len(tasks_2x[0].tensor) == 5 - - def test_grid_spec_latent_dimensions(self): - """Test that grid_spec contains correct latent dimensions.""" - encode_tile_split = _import_encode_tile_split() - - vae = _DummyWanVae( - config=_DummyConfig(patch_size=None), - spatial_compression_ratio=8, - tile_sample_min_height=512, - tile_sample_min_width=512, - tile_sample_stride_height=512, - tile_sample_stride_width=512, - ) - - # Input: 512x512 with compression 8 -> 64x64 latent - x = torch.randn(1, 3, 5, 512, 512) - - _, grid_spec = encode_tile_split(vae, x) - - assert grid_spec.tile_spec["latent_height"] == 64 - assert grid_spec.tile_spec["latent_width"] == 64 - - -class TestEncodeTileExec: - """Tests for encode_tile_exec method.""" - - def test_basic_exec(self): - """Test basic tile execution.""" - encode_tile_exec = 
_import_encode_tile_exec() - - vae = _DummyWanVae() - - from vllm_omni.diffusion.distributed.autoencoders.distributed_vae_executor import ( - TileTask, - ) - - # Create a simple task with 2 temporal chunks - tile1 = torch.randn(1, 3, 1, 32, 32) - tile2 = torch.randn(1, 3, 4, 32, 32) - task = TileTask(tile_id=0, grid_coord=(0, 0), tensor=[tile1, tile2]) - - result = encode_tile_exec(vae, task) - - # Result should concatenate temporal dimension - assert result.shape[2] == 5 # 1 + 4 frames - - -class TestEncodeTileMerge: - """Tests for encode_tile_merge method.""" - - def test_basic_merge(self): - """Test basic tile merging.""" - encode_tile_merge = _import_encode_tile_merge() - - vae = _DummyWanVae() - - from vllm_omni.diffusion.distributed.autoencoders.distributed_vae_executor import ( - GridSpec, - ) - - # Create 2x2 grid of tiles - tile_00 = torch.ones(1, 16, 2, 32, 32) * 0 - tile_01 = torch.ones(1, 16, 2, 32, 32) * 1 - tile_10 = torch.ones(1, 16, 2, 32, 32) * 2 - tile_11 = torch.ones(1, 16, 2, 32, 32) * 3 - - coord_tensor_map = { - (0, 0): tile_00, - (0, 1): tile_01, - (1, 0): tile_10, - (1, 1): tile_11, - } - - grid_spec = GridSpec( - split_dims=(3, 4), - grid_shape=(2, 2), - tile_spec={ - "latent_height": 48, - "latent_width": 48, - "blend_height": 8, - "blend_width": 8, - "tile_latent_stride_height": 24, - "tile_latent_stride_width": 24, - }, - ) - - result = encode_tile_merge(vae, coord_tensor_map, grid_spec) - - # Output should be (1, 16, 2, 48, 48) - assert result.shape == (1, 16, 2, 48, 48) diff --git a/tests/diffusion/distributed/test_cfg_parallel.py b/tests/diffusion/distributed/test_cfg_parallel.py index bf709618de2..79dbe9e6dd6 100644 --- a/tests/diffusion/distributed/test_cfg_parallel.py +++ b/tests/diffusion/distributed/test_cfg_parallel.py @@ -2,9 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Unit tests for CFG (Classifier-Free Guidance) parallel functionality. 
-This test verifies that predict_noise_maybe_with_cfg and -predict_noise_with_multi_branch_cfg produce numerically equivalent results -with and without CFG parallel using fixed random inputs. +This test verifies that predict_noise_maybe_with_cfg produces numerically +equivalent results with and without CFG parallel using fixed random inputs. """ import os @@ -430,340 +429,3 @@ def test_predict_noise_without_cfg(dtype: torch.dtype): assert noise_pred.shape == (1, 4, 16, 16) print(f"✓ Test passed: predict_noise without CFG (dtype={dtype})") - - -class MultiBranchTestPipeline(CFGParallelMixin): - """Test pipeline with custom 3-branch combine logic (like OmniGen2).""" - - def __init__(self, in_channels: int = 4, hidden_dim: int = 128, seed: int = 42): - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - - self.transformer = SimpleTransformer(in_channels, hidden_dim) - - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - for param in self.transformer.parameters(): - torch.nn.init.normal_(param, mean=0.0, std=0.02) - - def combine_multi_branch_cfg_noise(self, predictions, true_cfg_scale, cfg_normalize=False): - """N-branch combine with weighted sum for testing. 
- - - 2-branch: standard CFG formula (true_cfg_scale is float) - - 3-branch: OmniGen2-style dual guidance scale (true_cfg_scale is dict) - - 4-branch: DreamID-style weighted sum (true_cfg_scale is dict) - """ - if len(predictions) == 4: - text_scale = true_cfg_scale["text"] - image_scale = true_cfg_scale["image"] - vid_ref_scale = true_cfg_scale["vid_ref"] - pos, neg, vid_neg, audio_neg = predictions - combined = ( - audio_neg - + vid_ref_scale * (vid_neg - audio_neg) - + image_scale * (neg - vid_neg) - + text_scale * (pos - neg) - ) - elif len(predictions) == 3: - text_scale = true_cfg_scale["text"] - image_scale = true_cfg_scale["image"] - pos, ref, uncond = predictions - combined = uncond + image_scale * (ref - uncond) + text_scale * (pos - ref) - else: - pos, neg = predictions[0], predictions[1] - combined = neg + true_cfg_scale * (pos - neg) - - if cfg_normalize: - combined = self.cfg_normalize_function(pos, combined) - return combined - - -def _test_multi_branch_parallel_worker( - local_rank: int, - world_size: int, - cfg_parallel_size: int, - dtype: torch.dtype, - test_config: dict, - result_queue: torch.multiprocessing.Queue, -): - """Worker function for multi-branch CFG parallel test.""" - device = torch.device(f"{current_omni_platform.device_type}:{local_rank}") - current_omni_platform.set_device(device) - - update_environment_variables( - { - "RANK": str(local_rank), - "LOCAL_RANK": str(local_rank), - "WORLD_SIZE": str(world_size), - "MASTER_ADDR": "localhost", - "MASTER_PORT": "29504", - } - ) - - init_distributed_environment() - initialize_model_parallel(cfg_parallel_size=cfg_parallel_size) - - cfg_rank = get_classifier_free_guidance_rank() - cfg_world_size = get_classifier_free_guidance_world_size() - assert cfg_world_size == cfg_parallel_size - - pipeline = MultiBranchTestPipeline( - in_channels=test_config["channels"], - hidden_dim=test_config["hidden_dim"], - seed=test_config["model_seed"], - ) - pipeline.transformer = 
pipeline.transformer.to(device=device, dtype=dtype) - pipeline.transformer.eval() - - n_branches = test_config["n_branches"] - batch_size = test_config["batch_size"] - channels = test_config["channels"] - height = test_config["height"] - width = test_config["width"] - - # Create N branch inputs with distinct seeds - branches_kwargs = [] - for b in range(n_branches): - torch.manual_seed(test_config["input_seed"] + b) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(test_config["input_seed"] + b) - x = torch.randn(batch_size, channels, height, width, dtype=dtype, device=device) - branches_kwargs.append({"x": x}) - - with torch.no_grad(): - noise_pred = pipeline.predict_noise_with_multi_branch_cfg( - do_true_cfg=True, - true_cfg_scale=test_config["cfg_scale"], - branches_kwargs=branches_kwargs, - cfg_normalize=test_config["cfg_normalize"], - ) - - assert noise_pred is not None - result_queue.put((cfg_rank, noise_pred.cpu())) - - destroy_distributed_env() - - -def _test_multi_branch_sequential_worker( - local_rank: int, - world_size: int, - dtype: torch.dtype, - test_config: dict, - result_queue: torch.multiprocessing.Queue, -): - """Worker function for sequential multi-branch CFG test (baseline).""" - device = torch.device(f"{current_omni_platform.device_type}:{local_rank}") - current_omni_platform.set_device(device) - - update_environment_variables( - { - "RANK": str(local_rank), - "LOCAL_RANK": str(local_rank), - "WORLD_SIZE": str(world_size), - "MASTER_ADDR": "localhost", - "MASTER_PORT": "29505", - } - ) - - init_distributed_environment() - initialize_model_parallel(cfg_parallel_size=1) - - cfg_world_size = get_classifier_free_guidance_world_size() - assert cfg_world_size == 1 - - pipeline = MultiBranchTestPipeline( - in_channels=test_config["channels"], - hidden_dim=test_config["hidden_dim"], - seed=test_config["model_seed"], - ) - pipeline.transformer = pipeline.transformer.to(device=device, dtype=dtype) - pipeline.transformer.eval() - - n_branches = 
test_config["n_branches"] - batch_size = test_config["batch_size"] - channels = test_config["channels"] - height = test_config["height"] - width = test_config["width"] - - branches_kwargs = [] - for b in range(n_branches): - torch.manual_seed(test_config["input_seed"] + b) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(test_config["input_seed"] + b) - x = torch.randn(batch_size, channels, height, width, dtype=dtype, device=device) - branches_kwargs.append({"x": x}) - - with torch.no_grad(): - noise_pred = pipeline.predict_noise_with_multi_branch_cfg( - do_true_cfg=True, - true_cfg_scale=test_config["cfg_scale"], - branches_kwargs=branches_kwargs, - cfg_normalize=test_config["cfg_normalize"], - ) - - assert noise_pred is not None - result_queue.put(noise_pred.cpu()) - - destroy_distributed_env() - - -@pytest.mark.parametrize( - "cfg_parallel_size,n_branches", - [ - (2, 2), # 2 branches on 2 GPUs: [[0],[1]] - (2, 3), # 3 branches on 2 GPUs: [[0,2],[1]] - (3, 3), # 3 branches on 3 GPUs: [[0],[1],[2]] - (2, 4), # 4 branches on 2 GPUs: [[0,2],[1,3]] - ], -) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("cfg_normalize", [False, True]) -def test_predict_noise_with_multi_branch_cfg( - cfg_parallel_size: int, - n_branches: int, - dtype: torch.dtype, - batch_size: int, - cfg_normalize: bool, -): - """ - Test that predict_noise_with_multi_branch_cfg produces identical results - with and without CFG parallel for N-branch models. 
- - Args: - cfg_parallel_size: Number of GPUs for CFG parallel - n_branches: Number of CFG branches - dtype: Data type for computation - batch_size: Batch size for testing - cfg_normalize: Whether to normalize CFG output - """ - available_gpus = current_omni_platform.get_device_count() - if available_gpus < cfg_parallel_size: - pytest.skip(f"Test requires {cfg_parallel_size} GPUs but only {available_gpus} available") - - if n_branches == 2: - cfg_scale = 5.0 - elif n_branches == 3: - cfg_scale = {"text": 5.0, "image": 2.0} - else: - cfg_scale = {"text": 5.0, "image": 2.0, "vid_ref": 1.5} - - test_config = { - "batch_size": batch_size, - "channels": 4, - "height": 16, - "width": 16, - "hidden_dim": 128, - "cfg_scale": cfg_scale, - "cfg_normalize": cfg_normalize, - "model_seed": 42, - "input_seed": 123, - "n_branches": n_branches, - } - - mp_context = torch.multiprocessing.get_context("spawn") - manager = mp_context.Manager() - baseline_queue = manager.Queue() - cfg_parallel_queue = manager.Queue() - - # Run baseline (sequential, cfgp=1) - torch.multiprocessing.spawn( - _test_multi_branch_sequential_worker, - args=(1, dtype, test_config, baseline_queue), - nprocs=1, - ) - - # Run CFG parallel - torch.multiprocessing.spawn( - _test_multi_branch_parallel_worker, - args=(cfg_parallel_size, cfg_parallel_size, dtype, test_config, cfg_parallel_queue), - nprocs=cfg_parallel_size, - ) - - baseline_output = baseline_queue.get() - cfg_parallel_outputs = [cfg_parallel_queue.get() for _ in range(cfg_parallel_size)] - cfg_parallel_outputs.sort(key=lambda item: item[0]) - cfg_parallel_output = cfg_parallel_outputs[0][1] - - # All ranks should produce identical output - for cfg_rank, rank_output in cfg_parallel_outputs[1:]: - torch.testing.assert_close( - rank_output, - cfg_parallel_output, - rtol=0, - atol=0, - msg=f"Multi-branch CFG parallel ranks differ (rank 0 vs rank {cfg_rank})", - ) - - assert baseline_output.shape == cfg_parallel_output.shape, ( - f"Shape mismatch: baseline 
{baseline_output.shape} vs CFG parallel {cfg_parallel_output.shape}" - ) - - if dtype == torch.float32: - rtol, atol = 1e-5, 1e-5 - elif dtype == torch.bfloat16: - rtol, atol = 1e-2, 1e-2 - else: - rtol, atol = 1e-3, 1e-3 - - torch.testing.assert_close( - cfg_parallel_output, - baseline_output, - rtol=rtol, - atol=atol, - msg=( - f"Multi-branch CFG parallel output differs from sequential\n" - f" n_branches={n_branches}, cfg_parallel_size={cfg_parallel_size}\n" - f" dtype={dtype}, cfg_normalize={cfg_normalize}\n" - f" Max diff: {(cfg_parallel_output - baseline_output).abs().max().item():.6e}" - ), - ) - - print( - f"✓ Test passed: multi_branch n_branches={n_branches}, " - f"cfg_size={cfg_parallel_size}, dtype={dtype}, cfg_normalize={cfg_normalize}" - ) - - -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -def test_multi_branch_without_cfg(dtype: torch.dtype): - """ - Test predict_noise_with_multi_branch_cfg when do_true_cfg=False. - - When CFG is disabled, only the first branch (positive) should be computed. - This test runs on a single GPU without distributed environment. 
- """ - available_gpus = current_omni_platform.get_device_count() - if available_gpus < 1: - pytest.skip("Test requires at least 1 GPU") - - device = torch.device(f"{current_omni_platform.device_type}:0") - current_omni_platform.set_device(device) - - pipeline = MultiBranchTestPipeline(in_channels=4, hidden_dim=128, seed=42) - pipeline.transformer = pipeline.transformer.to(device=device, dtype=dtype) - pipeline.transformer.eval() - - # Create 3 branch inputs (only first should be used) - branches_kwargs = [] - for b in range(3): - torch.manual_seed(123 + b) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(123 + b) - x = torch.randn(1, 4, 16, 16, dtype=dtype, device=device) - branches_kwargs.append({"x": x}) - - with torch.no_grad(): - noise_pred = pipeline.predict_noise_with_multi_branch_cfg( - do_true_cfg=False, # No CFG - true_cfg_scale=5.0, - branches_kwargs=branches_kwargs, - cfg_normalize=False, - ) - - assert noise_pred is not None - assert noise_pred.shape == (1, 4, 16, 16) - - print(f"✓ Test passed: multi_branch predict_noise without CFG (dtype={dtype})") diff --git a/tests/diffusion/distributed/test_distributed_vae_executor.py b/tests/diffusion/distributed/test_distributed_vae_executor.py index b2ee7c10d33..42e9f3300bc 100644 --- a/tests/diffusion/distributed/test_distributed_vae_executor.py +++ b/tests/diffusion/distributed/test_distributed_vae_executor.py @@ -1,4 +1,4 @@ -from types import SimpleNamespace +from unittest.mock import MagicMock, patch import pytest import torch @@ -11,8 +11,6 @@ TileTask, ) -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - class E2EOperator: """tiles with (2, 3) -- (H,W)""" @@ -61,31 +59,40 @@ def merge(self, coord_tensor_map, grid_spec): class DummyMixin(DistributedVaeMixin): def __init__(self): self.use_tiling = True - self.distributed_executor = SimpleNamespace(parallel_size=2, group=None) + self.distributed_decoder = MagicMock() + self.distributed_decoder.parallel_size = 2 + 
self.distributed_decoder.group = None @pytest.fixture(autouse=True) -def mock_dist(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr(dist, "get_world_size", lambda *args, **kwargs: 2) - monkeypatch.setattr(dist, "get_rank", lambda *args, **kwargs: 0) - monkeypatch.setattr(dist, "is_initialized", lambda: True) - monkeypatch.setattr(dist, "all_reduce", lambda *args, **kwargs: None) - monkeypatch.setattr(dist, "gather", lambda *args, **kwargs: None) - monkeypatch.setattr(dist, "broadcast", lambda *args, **kwargs: None) +def mock_dist(): + with ( + patch.object(dist, "get_world_size", return_value=2), + patch.object(dist, "get_rank", return_value=0), + patch.object(dist, "is_initialized", return_value=True), + patch.object(dist, "all_reduce", return_value=None), + patch.object(dist, "gather", return_value=None), + patch.object(dist, "broadcast", return_value=None), + ): + yield @pytest.fixture(autouse=True) -def mock_dit_group(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr( +def mock_dit_group(): + with patch( "vllm_omni.diffusion.distributed.autoencoders.distributed_vae_executor.get_dit_group", - lambda: None, - ) + new=MagicMock(return_value=None), + ): + yield @pytest.fixture(autouse=True) -def mock_dist_vae_executor(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr(DistributedVaeExecutor, "gather_tensors", lambda self, x: [x]) - monkeypatch.setattr(DistributedVaeExecutor, "broadcast_tensor", lambda self, x: x) +def mock_dist_vae_executor(): + with ( + patch.object(DistributedVaeExecutor, "gather_tensors", side_effect=lambda x: [x]), + patch.object(DistributedVaeExecutor, "broadcast_tensor", side_effect=lambda x: x), + ): + yield # ============================ diff --git a/tests/diffusion/distributed/test_ulysses_uaa_perf.py b/tests/diffusion/distributed/test_ulysses_uaa_perf.py index 2a16a9ae578..c8b07ba152a 100644 --- a/tests/diffusion/distributed/test_ulysses_uaa_perf.py +++ b/tests/diffusion/distributed/test_ulysses_uaa_perf.py @@ -17,7 
+17,6 @@ import torch import torch.distributed as dist -from tests.helpers.mark import hardware_test from vllm_omni.diffusion.attention.parallel.ulysses import ( _all_gather_int, _ulysses_all_to_all_any_o, @@ -70,8 +69,6 @@ def world_size(self) -> int: @pytest.mark.parametrize("case", PERF_CASES) -@pytest.mark.core_model -@hardware_test(res={"cuda": "L4"}, num_cards=4) def test_ulysses_advanced_uaa_comm_overhead(case: _PerfCase) -> None: available_gpus = current_omni_platform.get_device_count() if available_gpus < case.world_size: diff --git a/tests/diffusion/hooks/test_hook_registry.py b/tests/diffusion/hooks/test_hook_registry.py deleted file mode 100644 index 6c8535cfec4..00000000000 --- a/tests/diffusion/hooks/test_hook_registry.py +++ /dev/null @@ -1,164 +0,0 @@ -""" -Tests for hook registry. - -NOTE: The hook registry is also tested indirectly through a lot of -other tests, e.g., tests/diffusion/distributed/test_sp_plan_hooks.py -""" - -from typing import Any - -import pytest -from torch import nn - -from vllm_omni.diffusion.hooks.base import HookRegistry, ModelHook - -DEFAULT_OUT = "ECHO" -OVERRIDE_OUT = "OVERRIDE" -INPUT_KWARG = "inp" - - -class EchoModule(nn.Module): - """Just echo the input.""" - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - - def forward(self, *args, **kwargs): - input_val = kwargs[INPUT_KWARG] - return input_val + DEFAULT_OUT - - -class AppendHook(ModelHook): - """Append an echo value to the input string on pre / post forward.""" - - def __init__(self, echo_val: str): - self.echo_val = echo_val - - def pre_forward(self, module: nn.Module, *args, **kwargs): - input_val = kwargs[INPUT_KWARG] - return (), {INPUT_KWARG: input_val + self.echo_val} - - def post_forward(self, module: nn.Module, output): - return output + self.echo_val - - -class OverrideAppendHook(AppendHook): - """Same as AppendHook, but replace the forward call with a different string.""" - - def new_forward(self, module: 
nn.Module, *args, **kwargs): - return kwargs[INPUT_KWARG] + OVERRIDE_OUT - - -def test_register_no_fwd_override_hooks(): - """Ensure registration is correct with no forward hooks.""" - mod = EchoModule() - registry = HookRegistry.get_or_create(mod) - first_hook = AppendHook("1") - second_hook = AppendHook("2") - sorted_no_fwd_hooks = [first_hook, second_hook] - - # Will add and sort the hook by key - registry.register_hook(name="b", hook=second_hook) - registry.register_hook(name="a", hook=first_hook) - - assert len(registry._hooks) == 2 - assert len(registry._sorted_hooks) == 2 - assert registry._new_fwd_impl_hook is None - # Ensure registering a new hook sorting alphabetically - for actual_hook, expected_hook in zip(registry._sorted_hooks, sorted_no_fwd_hooks): - assert actual_hook is expected_hook - - -def test_register_with_forward_hooks(): - """Ensure registration is correct with a forward hooks.""" - mod = EchoModule() - registry = HookRegistry.get_or_create(mod) - first_hook = AppendHook("1") - second_hook = AppendHook("2") - exec_hook = OverrideAppendHook("3") - sorted_no_fwd_hooks = [first_hook, second_hook] - - # Will add and sort the hook by key - registry.register_hook(name="b", hook=second_hook) - registry.register_hook(name="a", hook=first_hook) - registry.register_hook(name="c", hook=exec_hook) - - assert len(registry._hooks) == 3 - assert len(registry._sorted_hooks) == 3 - assert registry._new_fwd_impl_hook is exec_hook - # Ensure registering a new hook sorting alphabetically - for actual_hook, expected_hook in zip(registry._sorted_hooks, sorted_no_fwd_hooks): - assert actual_hook is expected_hook - - -def test_register_fails_with_multiple_forward_hooks(): - """Ensure registration only allows one hook overriding new_forward""" - mod = EchoModule() - registry = HookRegistry.get_or_create(mod) - - registry.register_hook(name="foo", hook=OverrideAppendHook("1")) - with pytest.raises(RuntimeError): - registry.register_hook(name="bar", 
hook=OverrideAppendHook("2")) - - -def test_remove_hooks(): - """Ensure removal sorts hooks.""" - mod = EchoModule() - registry = HookRegistry.get_or_create(mod) - - first_hook = AppendHook("1") - second_hook = AppendHook("2") - exec_hook = OverrideAppendHook("3") - - registry.register_hook(name="b", hook=second_hook) - registry.register_hook(name="a", hook=first_hook) - registry.register_hook(name="c", hook=exec_hook) - # Explicitly reorder our hooks to be in the wrong order, since register - # forces them to be sorted too. Ensure that remove the hook will also - # enforce the sorted order. - registry._sorted_hooks = [second_hook, first_hook] - - assert registry._new_fwd_impl_hook is exec_hook - registry.remove_hook("c") - assert registry._new_fwd_impl_hook is None - - sorted_no_fwd_hooks = [first_hook, second_hook] - for actual_hook, expected_hook in zip(registry._sorted_hooks, sorted_no_fwd_hooks): - assert actual_hook is expected_hook - - -def test_dispatch_no_fwd_override_hooks(): - """Ensure dispatch runs hooks in deterministic sorted order.""" - mod = EchoModule() - registry = HookRegistry.get_or_create(mod) - - first_hook = AppendHook("1") - second_hook = AppendHook("2") - - # Register will sort the hooks, so hook 1 will run first - # on preprocess and last in post process - registry.register_hook(name="2", hook=second_hook) - registry.register_hook(name="1", hook=first_hook) - res = registry.dispatch(inp="") - assert isinstance(res, str) - assert res == f"12{DEFAULT_OUT}21" - - -def test_dispatch_with_fwd_hooks(): - """Ensure dispatch runs hooks in deterministic sorted order.""" - mod = EchoModule() - registry = HookRegistry.get_or_create(mod) - - first_hook = AppendHook("1") - second_hook = AppendHook("2") - exec_hook = OverrideAppendHook("3") - - # Register will sort the hooks, so hook 1 will run first on preprocess and last in - # post process. 
Since the override hook mutates forward, it will run last even - # though the name of the exec_hook is alphabetically before the second hook. - registry.register_hook(name="c", hook=second_hook) - registry.register_hook(name="a", hook=first_hook) - registry.register_hook(name="b", hook=exec_hook) - res = registry.dispatch(inp="") - assert isinstance(res, str) - assert res == f"123{OVERRIDE_OUT}321" diff --git a/tests/diffusion/layers/test_norm.py b/tests/diffusion/layers/test_norm.py deleted file mode 100644 index e420415285d..00000000000 --- a/tests/diffusion/layers/test_norm.py +++ /dev/null @@ -1,453 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for LayerNorm and RMSNorm custom ops in diffusion layers.""" - -import pytest -import torch - -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] - - -# ── Import tests ── - - -def test_layernorm_import(): - """Verify LayerNorm can be imported from the norm module.""" - from vllm_omni.diffusion.layers.norm import LayerNorm # noqa: F401 - - -def test_rmsnorm_import(): - """Verify RMSNorm can be imported from the norm module.""" - from vllm_omni.diffusion.layers.norm import RMSNorm # noqa: F401 - - -# ── LayerNorm tests ── - - -def test_layernorm_forward_shape(): - """LayerNorm produces correct output shapes.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - batch = 2 - seq_len = 4 - norm = LayerNorm(dim) - - x = torch.randn(batch, seq_len, dim) - out = norm(x) - - assert out.shape == (batch, seq_len, dim) - - -def test_layernorm_forward_shape_2d(): - """LayerNorm works with 2D input tensors.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - batch = 2 - norm = LayerNorm(dim) - - x = torch.randn(batch, dim) - out = norm(x) - - assert out.shape == (batch, dim) - - -def test_layernorm_preserves_dtype_fp32(): - """LayerNorm preserves float32 dtype.""" - from 
vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - norm = LayerNorm(dim) - - x = torch.randn(2, 4, dim, dtype=torch.float32) - out = norm(x) - - assert out.dtype == torch.float32 - - -def test_layernorm_preserves_dtype_fp16(): - """LayerNorm preserves float16 dtype.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - norm = LayerNorm(dim) - - x = torch.randn(2, 4, dim, dtype=torch.float16) - out = norm(x) - - assert out.dtype == torch.float16 - - -def test_layernorm_preserves_dtype_bf16(): - """LayerNorm preserves bfloat16 dtype.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - norm = LayerNorm(dim) - - x = torch.randn(2, 4, dim, dtype=torch.bfloat16) - out = norm(x) - - assert out.dtype == torch.bfloat16 - - -def test_layernorm_without_elementwise_affine(): - """LayerNorm works without elementwise_affine (no learned parameters).""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - norm = LayerNorm(dim, elementwise_affine=False) - - assert norm.weight is None - assert norm.bias is None - - x = torch.randn(2, 4, dim) - out = norm(x) - - assert out.shape == (2, 4, dim) - - -def test_layernorm_custom_eps(): - """LayerNorm accepts custom epsilon value.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - eps = 1e-5 - norm = LayerNorm(dim, eps=eps) - - assert norm.eps == eps - - -def test_layernorm_has_learnable_parameters(): - """LayerNorm has learnable weight and bias by default.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - norm = LayerNorm(dim) - - assert norm.weight is not None - assert norm.bias is not None - assert norm.weight.shape == (dim,) - assert norm.bias.shape == (dim,) - - -def test_layernorm_matches_fp32_reference(): - """Verify LayerNorm produces identical output to FP32 nn.LayerNorm.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - eps = 1e-6 - torch.manual_seed(42) - - ours = LayerNorm(dim, eps=eps) 
- ref = torch.nn.LayerNorm(dim, eps=eps) - - # Copy weights - ref.weight.data.copy_(ours.weight.data) - ref.bias.data.copy_(ours.bias.data) - - x = torch.randn(2, 4, dim) - - out_ours = ours(x) - out_ref = ref(x.float()).to(x.dtype) - - torch.testing.assert_close(out_ours, out_ref, atol=1e-5, rtol=1e-5) - - -def test_layernorm_matches_diffusers_fp32layernorm(): - """Verify LayerNorm produces identical output to diffusers FP32LayerNorm.""" - from diffusers.models.normalization import FP32LayerNorm - - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - eps = 1e-6 - torch.manual_seed(42) - - ours = LayerNorm(dim, eps=eps) - ref = FP32LayerNorm(dim, eps=eps) - - # Copy weights - ref.weight.data.copy_(ours.weight.data) - ref.bias.data.copy_(ours.bias.data) - - # Test with fp16 input to verify FP32 computation - x = torch.randn(2, 4, dim, dtype=torch.float16) - - out_ours = ours(x) - out_ref = ref(x) - - torch.testing.assert_close(out_ours, out_ref, atol=1e-3, rtol=1e-3) - - -# ── RMSNorm tests ── - - -def test_rmsnorm_forward_shape(): - """RMSNorm produces correct output shapes.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - batch = 2 - seq_len = 4 - norm = RMSNorm(hidden_size) - - x = torch.randn(batch, seq_len, hidden_size) - out = norm(x) - - assert out.shape == (batch, seq_len, hidden_size) - - -def test_rmsnorm_forward_shape_2d(): - """RMSNorm works with 2D input tensors.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - batch = 2 - norm = RMSNorm(hidden_size) - - x = torch.randn(batch, hidden_size) - out = norm(x) - - assert out.shape == (batch, hidden_size) - - -def test_rmsnorm_preserves_dtype_fp32(): - """RMSNorm preserves float32 dtype.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - norm = RMSNorm(hidden_size) - - x = torch.randn(2, 4, hidden_size, dtype=torch.float32) - out = norm(x) - - assert out.dtype == torch.float32 - - -def 
test_rmsnorm_preserves_dtype_fp16(): - """RMSNorm preserves float16 dtype.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - norm = RMSNorm(hidden_size) - - x = torch.randn(2, 4, hidden_size, dtype=torch.float16) - out = norm(x) - - assert out.dtype == torch.float16 - - -def test_rmsnorm_preserves_dtype_bf16(): - """RMSNorm preserves bfloat16 dtype.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - norm = RMSNorm(hidden_size) - - x = torch.randn(2, 4, hidden_size, dtype=torch.bfloat16) - out = norm(x) - - assert out.dtype == torch.bfloat16 - - -def test_rmsnorm_custom_eps(): - """RMSNorm accepts custom epsilon value.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - eps = 1e-5 - norm = RMSNorm(hidden_size, eps=eps) - - assert norm.variance_epsilon == eps - - -def test_rmsnorm_has_weight_parameter(): - """RMSNorm has learnable weight parameter initialized to ones.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - norm = RMSNorm(hidden_size) - - assert norm.weight is not None - assert norm.weight.shape == (hidden_size,) - torch.testing.assert_close(norm.weight, torch.ones(hidden_size)) - - -def test_rmsnorm_numerical_correctness(): - """Verify RMSNorm produces numerically correct output.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - eps = 1e-6 - torch.manual_seed(42) - - norm = RMSNorm(hidden_size, eps=eps) - x = torch.randn(2, 4, hidden_size) - - # Compute expected output manually - x_fp32 = x.to(torch.float32) - variance = x_fp32.pow(2).mean(-1, keepdim=True) - expected = x_fp32 * torch.rsqrt(variance + eps) - expected = norm.weight.to(torch.float32) * expected - expected = expected.to(x.dtype) - - out = norm(x) - - torch.testing.assert_close(out, expected, atol=1e-5, rtol=1e-5) - - -def test_rmsnorm_matches_reference_implementation(): - """Verify RMSNorm matches a reference implementation.""" - from 
vllm_omni.diffusion.layers.norm import RMSNorm - - def reference_rmsnorm(x, weight, eps): - """Reference RMSNorm implementation.""" - input_dtype = x.dtype - x = x.to(torch.float32) - variance = x.pow(2).mean(-1, keepdim=True) - out = x * torch.rsqrt(variance + eps) - out = weight.to(torch.float32) * out - return out.to(input_dtype) - - hidden_size = 128 - eps = 1e-6 - torch.manual_seed(123) - - norm = RMSNorm(hidden_size, eps=eps) - - # Test with various dtypes - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - x = torch.randn(4, 8, hidden_size, dtype=dtype) - expected = reference_rmsnorm(x, norm.weight, eps) - out = norm(x) - torch.testing.assert_close(out, expected, atol=1e-3, rtol=1e-3) - - -# ── CustomOp dispatch tests ── - - -def test_layernorm_inherits_from_customop(): - """LayerNorm inherits from CustomOp for platform dispatch.""" - from vllm_omni.diffusion.layers.custom_op import CustomOp - from vllm_omni.diffusion.layers.norm import LayerNorm - - norm = LayerNorm(64) - assert isinstance(norm, CustomOp) - - -def test_rmsnorm_inherits_from_customop(): - """RMSNorm inherits from CustomOp for platform dispatch.""" - from vllm_omni.diffusion.layers.custom_op import CustomOp - from vllm_omni.diffusion.layers.norm import RMSNorm - - norm = RMSNorm(64) - assert isinstance(norm, CustomOp) - - -def test_layernorm_has_platform_methods(): - """LayerNorm has forward methods for each platform.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - norm = LayerNorm(64) - - assert hasattr(norm, "forward_cuda") - assert hasattr(norm, "forward_hip") - assert hasattr(norm, "forward_xpu") - assert hasattr(norm, "forward_npu") - assert hasattr(norm, "forward_native") - - -def test_rmsnorm_has_platform_methods(): - """RMSNorm has forward methods for each platform.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - norm = RMSNorm(64) - - assert hasattr(norm, "forward_cuda") - assert hasattr(norm, "forward_hip") - assert hasattr(norm, 
"forward_xpu") - assert hasattr(norm, "forward_npu") - assert hasattr(norm, "forward_native") - - -def test_layernorm_forward_native_directly(): - """LayerNorm.forward_native can be called directly.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - norm = LayerNorm(dim) - x = torch.randn(2, 4, dim) - - out = norm.forward_native(x) - - assert out.shape == (2, 4, dim) - - -def test_rmsnorm_forward_native_directly(): - """RMSNorm.forward_native can be called directly.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - norm = RMSNorm(hidden_size) - x = torch.randn(2, 4, hidden_size) - - out = norm.forward_native(x) - - assert out.shape == (2, 4, hidden_size) - - -# ── Edge case tests ── - - -def test_layernorm_with_large_dim(): - """LayerNorm works with large hidden dimensions.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 4096 - norm = LayerNorm(dim) - x = torch.randn(1, 16, dim) - - out = norm(x) - - assert out.shape == (1, 16, dim) - - -def test_rmsnorm_with_large_dim(): - """RMSNorm works with large hidden dimensions.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 4096 - norm = RMSNorm(hidden_size) - x = torch.randn(1, 16, hidden_size) - - out = norm(x) - - assert out.shape == (1, 16, hidden_size) - - -def test_layernorm_with_single_element_batch(): - """LayerNorm works with batch size of 1.""" - from vllm_omni.diffusion.layers.norm import LayerNorm - - dim = 64 - norm = LayerNorm(dim) - x = torch.randn(1, 1, dim) - - out = norm(x) - - assert out.shape == (1, 1, dim) - - -def test_rmsnorm_with_single_element_batch(): - """RMSNorm works with batch size of 1.""" - from vllm_omni.diffusion.layers.norm import RMSNorm - - hidden_size = 64 - norm = RMSNorm(hidden_size) - x = torch.randn(1, 1, hidden_size) - - out = norm(x) - - assert out.shape == (1, 1, hidden_size) diff --git a/tests/diffusion/layers/test_rotary_emb_equivalence.py 
b/tests/diffusion/layers/test_rotary_emb_equivalence.py deleted file mode 100644 index 2fbb7a31f5a..00000000000 --- a/tests/diffusion/layers/test_rotary_emb_equivalence.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Numerical equivalence tests for rotary embedding implementations (#2436). - -Verifies that the optimized stack+flatten RoPE produces bit-identical results -to the original strided-slice implementation across various tensor shapes and -dtypes, ensuring the refactor is safe. -""" - -from __future__ import annotations - -import pytest -import torch - - -def _apply_rotary_emb_helios_original( - hidden_states: torch.Tensor, - freqs_cis: torch.Tensor, -) -> torch.Tensor: - """Original Helios RoPE using strided slice assignment (pre-#2436).""" - x_1, x_2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1) - cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1) - out = torch.empty_like(hidden_states) - out[..., 0::2] = x_1 * cos[..., 0::2] - x_2 * sin[..., 1::2] - out[..., 1::2] = x_1 * sin[..., 1::2] + x_2 * cos[..., 0::2] - return out.type_as(hidden_states) - - -def _apply_rotary_emb_helios_optimized( - hidden_states: torch.Tensor, - freqs_cis: torch.Tensor, -) -> torch.Tensor: - """Optimized Helios RoPE using stack+flatten (post-#2436).""" - x_1, x_2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1) - cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1) - rotated = torch.stack( - ( - x_1 * cos[..., 0::2] - x_2 * sin[..., 1::2], - x_1 * sin[..., 1::2] + x_2 * cos[..., 0::2], - ), - dim=-1, - ) - return rotated.flatten(-2, -1).type_as(hidden_states) - - -def _make_inputs( - batch: int, - seq_len: int, - num_heads: int, - head_dim: int, - dtype: torch.dtype = torch.float32, -) -> tuple[torch.Tensor, torch.Tensor]: - """Generate random hidden_states and freqs_cis for testing.""" - torch.manual_seed(42) - hidden_states = torch.randn(batch, seq_len, num_heads, 
head_dim, dtype=dtype) - # freqs_cis: [B, seq, head_dim*2] — cos and sin concatenated along last dim - freqs_cis = torch.randn(batch, seq_len, head_dim * 2, dtype=dtype) - return hidden_states, freqs_cis - - -class TestHeliosRoPEEquivalence: - """Verify optimized Helios RoPE is numerically identical to original.""" - - @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) - def test_equivalence_across_dtypes(self, dtype: torch.dtype) -> None: - """Optimized output must be bit-identical to original across dtypes.""" - hidden, freqs = _make_inputs(2, 16, 8, 64, dtype=dtype) - original = _apply_rotary_emb_helios_original(hidden, freqs) - optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) - torch.testing.assert_close(optimized, original, atol=0, rtol=0) - - @pytest.mark.parametrize( - "batch,seq_len,num_heads,head_dim", - [ - (1, 8, 1, 32), # minimal: single batch, single head - (2, 16, 8, 64), # typical transformer config - (1, 8192, 4, 64), # video-scale patch tokens (720p DiT) - (4, 32, 16, 128), # large head_dim - ], - ) - def test_equivalence_across_shapes(self, batch: int, seq_len: int, num_heads: int, head_dim: int) -> None: - """Equivalence must hold across different tensor shapes.""" - hidden, freqs = _make_inputs(batch, seq_len, num_heads, head_dim) - original = _apply_rotary_emb_helios_original(hidden, freqs) - optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) - torch.testing.assert_close(optimized, original, atol=0, rtol=0) - - def test_output_contiguous(self) -> None: - """Optimized output should be contiguous in memory.""" - hidden, freqs = _make_inputs(2, 16, 8, 64) - optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) - assert optimized.is_contiguous() - - def test_output_shape_preserved(self) -> None: - """Output shape must match input shape.""" - hidden, freqs = _make_inputs(2, 16, 8, 64) - optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) - assert optimized.shape == hidden.shape 
- - def test_output_dtype_preserved(self) -> None: - """Output dtype must match input dtype.""" - hidden, freqs = _make_inputs(2, 16, 8, 64, dtype=torch.float16) - optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) - assert optimized.dtype == hidden.dtype - - def test_odd_head_dim_raises(self) -> None: - """Odd head_dim should fail at unflatten (not a valid RoPE config).""" - hidden = torch.randn(1, 4, 2, 63) - freqs = torch.randn(1, 4, 126) - with pytest.raises(RuntimeError): - _apply_rotary_emb_helios_optimized(hidden, freqs) diff --git a/tests/diffusion/lora/helpers.py b/tests/diffusion/lora/helpers.py deleted file mode 100644 index 8b9b1ef4d20..00000000000 --- a/tests/diffusion/lora/helpers.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Shared test helpers for diffusion LoRA tests.""" - -from __future__ import annotations - -import torch -from vllm.model_executor.layers.linear import LinearBase - - -class FakeLinearBase(LinearBase): - """Minimal LinearBase stub for LoRA layer discovery.""" - - def __init__(self): - torch.nn.Module.__init__(self) - - -class DummyBaseLayerWithLoRA(torch.nn.Module): - """Fake LoRA wrapper that records set/reset/create calls.""" - - def __init__(self, base_layer: torch.nn.Module): - super().__init__() - self.base_layer = base_layer - - self.set_calls: list[ - tuple[list[torch.Tensor | None] | torch.Tensor, list[torch.Tensor | None] | torch.Tensor] - ] = [] - self.reset_calls: int = 0 - self.create_calls: int = 0 - - def set_lora(self, index: int, lora_a, lora_b): - assert index == 0 - self.set_calls.append((lora_a, lora_b)) - - def reset_lora(self, index: int): - assert index == 0 - self.reset_calls += 1 - - def create_lora_weights(self, max_loras, lora_config, model_config): - self.create_calls += 1 - - -def fake_replace_submodule( - root: torch.nn.Module, - module_name: str, - submodule: torch.nn.Module, - replace_calls: 
list[str] | None = None, -) -> None: - """Replace a submodule by traversing dotted paths correctly.""" - if replace_calls is not None: - replace_calls.append(module_name) - parts = module_name.split(".") - parent = root - for attr in parts[:-1]: - parent = getattr(parent, attr) - setattr(parent, parts[-1], submodule) diff --git a/tests/diffusion/lora/test_lora_manager.py b/tests/diffusion/lora/test_lora_manager.py index 785f5d84217..8d4a1487fd0 100644 --- a/tests/diffusion/lora/test_lora_manager.py +++ b/tests/diffusion/lora/test_lora_manager.py @@ -7,12 +7,8 @@ import torch from vllm.lora.lora_weights import LoRALayerWeights from vllm.lora.utils import get_supported_lora_modules +from vllm.model_executor.layers.linear import LinearBase -from tests.diffusion.lora.helpers import ( - DummyBaseLayerWithLoRA, - FakeLinearBase, - fake_replace_submodule, -) from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager from vllm_omni.lora.request import LoRARequest @@ -37,9 +33,35 @@ def reset_lora(self, index: int): self.reset_calls += 1 -# Aliases for backward compatibility within this file -_FakeLinearBase = FakeLinearBase -_DummyBaseLayerWithLoRA = DummyBaseLayerWithLoRA +class _FakeLinearBase(LinearBase): + def __init__(self): + torch.nn.Module.__init__(self) + + +class _DummyBaseLayerWithLoRA(torch.nn.Module): + def __init__(self, base_layer: torch.nn.Module): + super().__init__() + self.base_layer = base_layer + + self.set_calls: list[ + tuple[list[torch.Tensor | None] | torch.Tensor, list[torch.Tensor | None] | torch.Tensor] + ] = [] + self.reset_calls: int = 0 + self.create_calls: int = 0 + + def set_lora(self, index: int, lora_a, lora_b): + assert index == 0 + self.set_calls.append((lora_a, lora_b)) + + def reset_lora(self, index: int): + assert index == 0 + self.reset_calls += 1 + + def create_lora_weights(self, max_loras, lora_config, model_config): + # Needs to be callable for scale test when rank changes, but not + # actually used since we mock 
everything and check everything based + # on set calls. + self.create_calls += 1 class _DummyPipeline(torch.nn.Module): @@ -533,45 +555,3 @@ def _fake_load(_req: LoRARequest): req1 = _dummy_lora_request(1) with pytest.raises(ValueError): manager.add_adapter(req1) - - -def test_lora_manager_discovers_bagel_component(monkeypatch): - """Verify that _replace_layers_with_lora finds layers under 'bagel'.""" - import vllm_omni.diffusion.lora.manager as manager_mod - - monkeypatch.setattr(manager_mod, "BaseLayerWithLoRA", _DummyBaseLayerWithLoRA) - - def _fake_from_layer_diffusion(*, layer: torch.nn.Module, **_kwargs): - if isinstance(layer, _FakeLinearBase): - return _DummyBaseLayerWithLoRA(layer) - return layer - - replace_calls: list[str] = [] - - monkeypatch.setattr(manager_mod, "from_layer_diffusion", _fake_from_layer_diffusion) - monkeypatch.setattr( - manager_mod, - "replace_submodule", - lambda root, name, sub: fake_replace_submodule(root, name, sub, replace_calls), - ) - - # Pipeline with a 'bagel' component (no 'transformer') - pipeline = torch.nn.Module() - pipeline.bagel = torch.nn.Module() - pipeline.bagel.language_model = torch.nn.Module() - pipeline.bagel.language_model.qkv_proj = _FakeLinearBase() - - manager = DiffusionLoRAManager( - pipeline=pipeline, - device=torch.device("cpu"), - dtype=torch.bfloat16, - max_cached_adapters=1, - ) - - peft_helper = type("_PH", (), {"r": 1})() - manager._replace_layers_with_lora(peft_helper) - - assert "language_model.qkv_proj" in replace_calls - assert "bagel.language_model.qkv_proj" in manager._lora_modules - # Verify the module was actually replaced in the tree (not just recorded) - assert isinstance(pipeline.bagel.language_model.qkv_proj, _DummyBaseLayerWithLoRA) diff --git a/tests/diffusion/models/bagel/test_bagel_lora.py b/tests/diffusion/models/bagel/test_bagel_lora.py deleted file mode 100644 index c285758fe86..00000000000 --- a/tests/diffusion/models/bagel/test_bagel_lora.py +++ /dev/null @@ -1,248 +0,0 @@ -# 
SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for BAGEL LoRA support across Stage 0 (Thinker) and Stage 1 (DiT).""" - -from __future__ import annotations - -import json -from pathlib import Path - -import pytest -import torch -from safetensors.torch import save_file - -from tests.diffusion.lora.helpers import ( - DummyBaseLayerWithLoRA, - FakeLinearBase, - fake_replace_submodule, -) -from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager -from vllm_omni.lora.request import LoRARequest - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - -_FakeLinearBase = FakeLinearBase - - -# --------------------------------------------------------------------------- -# Stage 0 (Thinker / AR) -- packed_modules_mapping on the AR model class -# --------------------------------------------------------------------------- - - -class TestStage0ThinkerLoRA: - """Validate that OmniBagelForConditionalGeneration declares correct LoRA metadata.""" - - def test_omni_bagel_supports_lora(self): - from vllm_omni.model_executor.models.bagel.bagel import ( - OmniBagelForConditionalGeneration, - ) - - assert getattr(OmniBagelForConditionalGeneration, "supports_lora", False) is True - - def test_omni_bagel_packed_modules_mapping_complete(self): - from vllm_omni.model_executor.models.bagel.bagel import ( - OmniBagelForConditionalGeneration, - ) - - mapping = OmniBagelForConditionalGeneration.packed_modules_mapping - # Standard Qwen2 projections - assert mapping["qkv_proj"] == ["q_proj", "k_proj", "v_proj"] - assert mapping["gate_up_proj"] == ["gate_proj", "up_proj"] - # MoE generation-mode projections - assert mapping["qkv_proj_moe_gen"] == [ - "q_proj_moe_gen", - "k_proj_moe_gen", - "v_proj_moe_gen", - ] - assert mapping["mlp_moe_gen.gate_up_proj"] == [ - "mlp_moe_gen.gate_proj", - "mlp_moe_gen.up_proj", - ] - - -# --------------------------------------------------------------------------- -# Stage 1 (DiT / 
Diffusion) -- DiffusionLoRAManager with bagel component -# --------------------------------------------------------------------------- - - -class TestStage1DiTLoRA: - """Validate DiffusionLoRAManager discovers BAGEL's packed modules.""" - - def test_diffusion_lora_manager_discovers_bagel_packed_modules(self): - """Manager should derive packed→sublayer mapping from stacked_params_mapping.""" - pipeline = torch.nn.Module() - pipeline.bagel = torch.nn.Module() - - # Simulate a submodule that exposes stacked_params_mapping - # (as Bagel does after load_weights()) - language_model = torch.nn.Module() - language_model.stacked_params_mapping = [ - (".qkv_proj_moe_gen", ".q_proj_moe_gen", "q"), - (".qkv_proj_moe_gen", ".k_proj_moe_gen", "k"), - (".qkv_proj_moe_gen", ".v_proj_moe_gen", "v"), - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - pipeline.bagel.language_model = language_model - - manager = DiffusionLoRAManager( - pipeline=pipeline, - device=torch.device("cpu"), - dtype=torch.bfloat16, - max_cached_adapters=1, - ) - - mapping = manager._packed_modules_mapping - assert mapping["qkv_proj"] == ["q_proj", "k_proj", "v_proj"] - assert mapping["qkv_proj_moe_gen"] == [ - "q_proj_moe_gen", - "k_proj_moe_gen", - "v_proj_moe_gen", - ] - assert mapping["gate_up_proj"] == ["gate_proj", "up_proj"] - - def test_diffusion_lora_manager_replaces_bagel_packed_layer_via_sublayer_target(self, monkeypatch): - """Targeting sublayer 'q_proj' should replace the fused 'qkv_proj' under bagel.""" - import vllm_omni.diffusion.lora.manager as manager_mod - - monkeypatch.setattr(manager_mod, "BaseLayerWithLoRA", DummyBaseLayerWithLoRA) - - def _fake_from_layer_diffusion(*, layer, **_kwargs): - return DummyBaseLayerWithLoRA(layer) - - replace_calls: list[str] = [] - - monkeypatch.setattr(manager_mod, "from_layer_diffusion", _fake_from_layer_diffusion) - 
monkeypatch.setattr( - manager_mod, - "replace_submodule", - lambda root, name, sub: fake_replace_submodule(root, name, sub, replace_calls), - ) - - # Build pipeline with bagel component - pipeline = torch.nn.Module() - pipeline.bagel = torch.nn.Module() - lm = torch.nn.Module() - lm.stacked_params_mapping = [ - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - ] - lm.attn = torch.nn.Module() - lm.attn.qkv_proj = _FakeLinearBase() - pipeline.bagel.language_model = lm - - manager = DiffusionLoRAManager( - pipeline=pipeline, - device=torch.device("cpu"), - dtype=torch.bfloat16, - max_cached_adapters=1, - ) - - # Treat qkv_proj as 3-slice packed layer - monkeypatch.setattr(manager, "_get_packed_modules_list", lambda _module: ["q", "k", "v"]) - - # Target sublayer "q_proj" -- manager should replace the packed "qkv_proj" - peft_helper = type("_PH", (), {"r": 1, "target_modules": ["q_proj"]})() - manager._replace_layers_with_lora(peft_helper) - - assert "language_model.attn.qkv_proj" in replace_calls - assert "bagel.language_model.attn.qkv_proj" in manager._lora_modules - # Verify the module was actually replaced in the tree (not just recorded) - assert isinstance(pipeline.bagel.language_model.attn.qkv_proj, DummyBaseLayerWithLoRA) - - -# --------------------------------------------------------------------------- -# Round-trip: synthetic checkpoint → set_active_adapter → verify weights -# --------------------------------------------------------------------------- - - -def _write_synthetic_lora( - adapter_dir: Path, - module_name: str, - rank: int, - in_dim: int, - out_dim: int, -) -> str: - """Write a minimal LoRA adapter (safetensors + config) to *adapter_dir*.""" - adapter_dir.mkdir(parents=True, exist_ok=True) - lora_a = torch.ones((rank, in_dim), dtype=torch.float32) - lora_b = torch.ones((out_dim, rank), dtype=torch.float32) * 2.0 - save_file( - { - f"base_model.model.{module_name}.lora_A.weight": lora_a, - 
f"base_model.model.{module_name}.lora_B.weight": lora_b, - }, - str(adapter_dir / "adapter_model.safetensors"), - ) - (adapter_dir / "adapter_config.json").write_text( - json.dumps({"r": rank, "lora_alpha": rank, "target_modules": [module_name]}), - encoding="utf-8", - ) - return str(adapter_dir) - - -class TestBagelLoRARoundTrip: - """End-to-end: synthetic checkpoint → load → activate → verify weights in fused layer.""" - - def test_set_active_adapter_loads_and_activates_bagel_lora(self, tmp_path, monkeypatch): - """Full round-trip through set_active_adapter for a bagel component module.""" - import vllm_omni.diffusion.lora.manager as manager_mod - - monkeypatch.setattr(manager_mod, "BaseLayerWithLoRA", DummyBaseLayerWithLoRA) - - # Build pipeline with bagel.language_model.foo (simple non-packed layer) - pipeline = torch.nn.Module() - pipeline.bagel = torch.nn.Module() - lm = torch.nn.Module() - lm.foo = _FakeLinearBase() - pipeline.bagel.language_model = lm - - def _fake_from_layer(*, layer, **_kwargs): - if isinstance(layer, FakeLinearBase): - return DummyBaseLayerWithLoRA(layer) - return layer - - monkeypatch.setattr(manager_mod, "from_layer_diffusion", _fake_from_layer) - monkeypatch.setattr( - manager_mod, - "replace_submodule", - lambda root, name, sub: fake_replace_submodule(root, name, sub), - ) - - manager = DiffusionLoRAManager( - pipeline=pipeline, - device=torch.device("cpu"), - dtype=torch.bfloat16, - max_cached_adapters=1, - ) - - # Write synthetic adapter targeting bagel.language_model.foo - module_name = "bagel.language_model.foo" - rank = 2 - in_dim = 4 - out_dim = 4 - lora_dir = _write_synthetic_lora(tmp_path / "lora", module_name, rank, in_dim, out_dim) - - lora_request = LoRARequest( - lora_name="test_bagel", - lora_int_id=42, - lora_path=lora_dir, - ) - - # Full round-trip: load from disk → replace layer → activate weights - manager.set_active_adapter(lora_request, lora_scale=0.5) - - # Verify the layer was replaced and weights were set - 
replaced_layer = pipeline.bagel.language_model.foo - assert isinstance(replaced_layer, DummyBaseLayerWithLoRA), "Layer should be wrapped with LoRA" - assert len(replaced_layer.set_calls) == 1, "set_lora should have been called once" - - lora_a, lora_b = replaced_layer.set_calls[0] - # A weights should be ones (as written) - assert torch.all(lora_a == 1.0), f"lora_a should be all ones, got {lora_a}" - # B weights should be 2.0 * scale(0.5) = 1.0 - assert torch.allclose(lora_b, torch.ones_like(lora_b)), f"lora_b should be 2.0 * 0.5 = 1.0, got {lora_b}" diff --git a/tests/diffusion/models/bagel/test_trajectory_recording.py b/tests/diffusion/models/bagel/test_trajectory_recording.py deleted file mode 100644 index 345eac10784..00000000000 --- a/tests/diffusion/models/bagel/test_trajectory_recording.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for BAGEL trajectory recording in the denoising loop.""" - -import types -from dataclasses import dataclass - -import pytest -import torch -from pytest_mock import MockerFixture - -from vllm_omni.diffusion.models.bagel.bagel_transformer import ( - Bagel, - NaiveCache, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - -NUM_TOKENS = 8 -HIDDEN_DIM = 16 -NUM_TIMESTEPS = 5 -# generate_image uses timesteps[:-1], so actual steps = NUM_TIMESTEPS - 1 -EXPECTED_STEPS = NUM_TIMESTEPS - 1 - - -def _make_mock_bagel(mocker: MockerFixture): - """Create a mock Bagel with forward returning constant velocity.""" - mock = mocker.MagicMock(spec=Bagel) - mock._sp_size = 1 - - # forward returns a small constant velocity so x_t changes each step - def fake_forward(self, x_t, **kwargs): - return torch.ones_like(x_t) * 0.1 - - mock.forward = types.MethodType(fake_forward, mock) - # _merge_naive_caches is called in the batched CFG path - mock._merge_naive_caches = types.MethodType(lambda self, caches: NaiveCache(1), mock) - - # Bind 
the real generate_image to our mock - mock.generate_image = types.MethodType(Bagel.generate_image, mock) - return mock - - -def _make_generate_args(num_tokens=NUM_TOKENS, hidden_dim=HIDDEN_DIM, cfg=False): - """Tensor arguments for generate_image. - - Args: - cfg: If True, enable batched CFG path (cfg_text_scale > 1.0). - """ - seq_len = num_tokens + 2 # packed_seqlens includes 2 extra tokens - base = dict( - packed_text_ids=torch.zeros(2, dtype=torch.long), - packed_text_indexes=torch.tensor([0, 1], dtype=torch.long), - packed_init_noises=torch.randn(num_tokens, hidden_dim), - packed_vae_position_ids=torch.arange(num_tokens, dtype=torch.long), - packed_vae_token_indexes=torch.arange(2, seq_len, dtype=torch.long), - packed_seqlens=torch.tensor([seq_len], dtype=torch.int), - packed_position_ids=torch.arange(seq_len, dtype=torch.long), - packed_indexes=torch.arange(seq_len, dtype=torch.long), - past_key_values=NaiveCache(1), - key_values_lens=torch.tensor([0], dtype=torch.int), - packed_key_value_indexes=torch.zeros(0, dtype=torch.long), - num_timesteps=NUM_TIMESTEPS, - timestep_shift=1.0, - cfg_text_scale=1.0, - cfg_img_scale=1.0, - ) - if cfg: - base |= dict( - cfg_text_scale=4.0, - cfg_text_packed_query_indexes=torch.arange(seq_len, dtype=torch.long), - cfg_text_packed_position_ids=torch.arange(seq_len, dtype=torch.long), - cfg_text_past_key_values=NaiveCache(1), - cfg_text_key_values_lens=torch.tensor([0], dtype=torch.int), - cfg_text_packed_key_value_indexes=torch.zeros(0, dtype=torch.long), - ) - return base - - -@pytest.fixture(params=[False, True], ids=["no_cfg", "batched_cfg"]) -def bagel_and_args( - request, - monkeypatch: pytest.MonkeyPatch, - mocker: MockerFixture, -): - """Mock Bagel instance and generate_image arguments. - - Parametrized over CFG mode so every test runs on both the no-CFG - and batched-CFG code paths. 
- """ - cfg = request.param - monkeypatch.setattr( - "vllm_omni.diffusion.models.bagel.bagel_transformer.get_classifier_free_guidance_world_size", - lambda: 1, - ) - yield _make_mock_bagel(mocker), _make_generate_args(cfg=cfg) - - -class TestTrajectoryRecording: - """Tests for trajectory latent/timestep recording in generate_image.""" - - def test_trajectory_disabled_returns_none(self, bagel_and_args): - bagel, args = bagel_and_args - - unpacked, trajectory_latents, trajectory_timesteps, trajectory_log_probs = bagel.generate_image( - **args, return_trajectory_latents=False - ) - - assert isinstance(unpacked, (list, tuple)) - assert len(unpacked) == 1 # one sequence - assert trajectory_latents is None - assert trajectory_timesteps is None - assert trajectory_log_probs is None - - def test_trajectory_enabled_returns_correct_count(self, bagel_and_args): - bagel, args = bagel_and_args - - _, trajectory_latents, trajectory_timesteps, trajectory_log_probs = bagel.generate_image( - **args, return_trajectory_latents=True - ) - - assert trajectory_latents is not None - assert trajectory_timesteps is not None - assert len(trajectory_latents) == EXPECTED_STEPS - assert len(trajectory_timesteps) == EXPECTED_STEPS - # log_probs is None without a scheduler (default ODE path) - assert trajectory_log_probs is None - - def test_trajectory_latents_shape_matches_input(self, bagel_and_args): - bagel, args = bagel_and_args - expected_shape = args["packed_init_noises"].shape - - _, trajectory_latents, *_ = bagel.generate_image(**args, return_trajectory_latents=True) - - for i, lat in enumerate(trajectory_latents): - assert lat.shape == expected_shape, f"Step {i}: expected {expected_shape}, got {lat.shape}" - - def test_trajectory_latents_are_distinct(self, bagel_and_args): - bagel, args = bagel_and_args - - _, trajectory_latents, *_ = bagel.generate_image(**args, return_trajectory_latents=True) - - for i in range(1, len(trajectory_latents)): - assert not 
torch.equal(trajectory_latents[i], trajectory_latents[i - 1]), ( - f"Steps {i - 1} and {i} should differ" - ) - - def test_trajectory_timesteps_are_decreasing(self, bagel_and_args): - bagel, args = bagel_and_args - - _, _, trajectory_timesteps, _ = bagel.generate_image(**args, return_trajectory_latents=True) - - for i in range(1, len(trajectory_timesteps)): - assert trajectory_timesteps[i] < trajectory_timesteps[i - 1], ( - f"Timestep {i} ({trajectory_timesteps[i]:.4f}) should be less than " - f"timestep {i - 1} ({trajectory_timesteps[i - 1]:.4f})" - ) - - def test_trajectory_final_latent_matches_output(self, bagel_and_args): - bagel, args = bagel_and_args - - unpacked, trajectory_latents, *_ = bagel.generate_image(**args, return_trajectory_latents=True) - - # Reconstruct the full final latent from unpacked pieces - final_latent = torch.cat(unpacked, dim=0) - assert torch.allclose(trajectory_latents[-1], final_latent, atol=1e-6), ( - "Last trajectory latent should match the final output" - ) - - -# --------------------------------------------------------------------------- -# Mock scheduler for log-prob tests -# --------------------------------------------------------------------------- - - -@dataclass -class _MockStepOutput: - prev_sample: torch.Tensor - log_prob: torch.Tensor - - -class _MockScheduler: - """Minimal scheduler: Euler step + constant log-prob per step.""" - - def step(self, model_output, sigma, sample, dt, **kwargs): - prev_sample = sample - model_output * dt - log_prob = torch.tensor(-1.0) - return _MockStepOutput(prev_sample=prev_sample, log_prob=log_prob) - - -class TestTrajectoryLogProbs: - """Tests for log-prob recording when a scheduler is provided.""" - - @pytest.fixture() - def bagel_scheduler_args( - self, - monkeypatch: pytest.MonkeyPatch, - mocker: MockerFixture, - ): - monkeypatch.setattr( - "vllm_omni.diffusion.models.bagel.bagel_transformer.get_classifier_free_guidance_world_size", - lambda: 1, - ) - yield _make_mock_bagel(mocker), 
_make_generate_args(), _MockScheduler() - - def test_log_probs_recorded_with_scheduler(self, bagel_scheduler_args): - bagel, args, scheduler = bagel_scheduler_args - - _, _, _, trajectory_log_probs = bagel.generate_image( - **args, return_trajectory_latents=True, scheduler=scheduler - ) - - assert trajectory_log_probs is not None - assert len(trajectory_log_probs) == EXPECTED_STEPS - - def test_log_probs_are_finite(self, bagel_scheduler_args): - bagel, args, scheduler = bagel_scheduler_args - - _, _, _, trajectory_log_probs = bagel.generate_image( - **args, return_trajectory_latents=True, scheduler=scheduler - ) - - for i, lp in enumerate(trajectory_log_probs): - assert torch.isfinite(lp).all(), f"Step {i}: log_prob is not finite" - - def test_log_probs_none_without_scheduler(self, bagel_scheduler_args): - bagel, args, _ = bagel_scheduler_args - - _, _, _, trajectory_log_probs = bagel.generate_image(**args, return_trajectory_latents=True, scheduler=None) - - assert trajectory_log_probs is None - - def test_scheduler_updates_latents(self, bagel_scheduler_args): - """Verify the scheduler's prev_sample is used (not the raw Euler step).""" - bagel, args, scheduler = bagel_scheduler_args - - _, traj_with_sched, *_ = bagel.generate_image(**args, return_trajectory_latents=True, scheduler=scheduler) - _, traj_without, *_ = bagel.generate_image(**args, return_trajectory_latents=True, scheduler=None) - - # Mock scheduler does the same Euler step, so latents should match - for i in range(len(traj_with_sched)): - assert torch.allclose(traj_with_sched[i], traj_without[i], atol=1e-5), ( - f"Step {i}: scheduler and ODE paths should produce same latents" - ) diff --git a/tests/diffusion/models/dmd2/__init__.py b/tests/diffusion/models/dmd2/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/diffusion/models/dmd2/test_dmd2_request_sanitization.py b/tests/diffusion/models/dmd2/test_dmd2_request_sanitization.py deleted file mode 100644 index 
e270390bd99..00000000000 --- a/tests/diffusion/models/dmd2/test_dmd2_request_sanitization.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from unittest.mock import MagicMock, patch - -import pytest -import torch - -from vllm_omni.diffusion.models.ltx2.pipeline_ltx2 import LTX2Pipeline, LTX2T2VDMD2Pipeline -from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_image2video import LTX2I2VDMD2Pipeline, LTX2ImageToVideoPipeline -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import Wan22Pipeline, WanT2VDMD2Pipeline -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_i2v import Wan22I2VPipeline, WanI2VDMD2Pipeline -from vllm_omni.diffusion.request import OmniDiffusionRequest, OmniDiffusionSamplingParams - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - -# DMD2 subclass → immediate base pipeline whose __init__ loads model weights (mocked in tests). -_DMD2_BASE = { - WanT2VDMD2Pipeline: Wan22Pipeline, - WanI2VDMD2Pipeline: Wan22I2VPipeline, - LTX2T2VDMD2Pipeline: LTX2Pipeline, - LTX2I2VDMD2Pipeline: LTX2ImageToVideoPipeline, -} - - -def _make_pipeline(cls): - """Run the DMD2 __init__ with the base pipeline mocked out (no model weights loaded).""" - - base = _DMD2_BASE[cls] - od_config = MagicMock() - od_config.model = "/nonexistent" - - def _mock_base_init(self, *a, **kw): - self.od_config = od_config - - with patch.object(base, "__init__", _mock_base_init): - pipeline = object.__new__(cls) - torch.nn.Module.__init__(pipeline) - cls.__init__(pipeline, od_config=od_config) - return pipeline - - -def _make_request(prompts=None, **sp_kwargs) -> OmniDiffusionRequest: - sp = OmniDiffusionSamplingParams(**sp_kwargs) - return OmniDiffusionRequest( - prompts=prompts or [{"prompt": "a cat dancing"}], - sampling_params=sp, - ) - - -@pytest.fixture( - params=list(_DMD2_BASE.keys()), - ids=["wan_t2v", "wan_i2v", "ltx2_t2v", "ltx2_i2v"], -) -def pipeline(request): - 
return _make_pipeline(request.param) - - -# --------------------------------------------------------------------------- -# num_inference_steps -# --------------------------------------------------------------------------- - - -def test_num_inference_steps_forced_to_dmd2_value(pipeline): - req = _make_request(num_inference_steps=40) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.num_inference_steps == pipeline.num_inference_steps - - -def test_num_inference_steps_already_correct(pipeline): - req = _make_request(num_inference_steps=pipeline.num_inference_steps) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.num_inference_steps == pipeline.num_inference_steps - - -# --------------------------------------------------------------------------- -# guidance_scale -# --------------------------------------------------------------------------- - - -def test_guidance_scale_forced_to_one(pipeline): - req = _make_request(guidance_scale=5.0, guidance_scale_provided=True) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.guidance_scale == pipeline.dmd2_guidance_scale - assert req.sampling_params.guidance_scale_provided is False - - -def test_guidance_scale_already_correct(pipeline): - req = _make_request(guidance_scale=pipeline.dmd2_guidance_scale, guidance_scale_provided=False) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.guidance_scale == pipeline.dmd2_guidance_scale - - -def test_guidance_scale_provided_flag_cleared(pipeline): - """guidance_scale_provided=True must be cleared even if scale is already dmd2_guidance_scale.""" - req = _make_request(guidance_scale=pipeline.dmd2_guidance_scale, guidance_scale_provided=True) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.guidance_scale_provided is False - - -def test_guidance_scale_2_cleared(pipeline): - req = _make_request(guidance_scale_2=3.0) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.guidance_scale_2 is 
None - - -def test_guidance_scale_2_unset_unchanged(pipeline): - req = _make_request() - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.guidance_scale_2 is None - - -def test_true_cfg_scale_cleared(pipeline): - req = _make_request(true_cfg_scale=2.0) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.true_cfg_scale is None - - -def test_do_classifier_free_guidance_forced_false(pipeline): - req = _make_request(do_classifier_free_guidance=True) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.do_classifier_free_guidance is False - - -def test_is_cfg_negative_forced_false(pipeline): - req = _make_request(is_cfg_negative=True) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.is_cfg_negative is False - - -def test_negative_prompt_stripped_from_prompt_dict(pipeline): - req = _make_request(prompts=[{"prompt": "a cat", "negative_prompt": "blurry"}]) - pipeline._sanitize_dmd2_request(req) - assert "negative_prompt" not in req.prompts[0] - assert req.prompts[0]["prompt"] == "a cat" - - -def test_no_negative_prompt_unchanged(pipeline): - req = _make_request(prompts=[{"prompt": "a cat"}]) - pipeline._sanitize_dmd2_request(req) - assert req.prompts[0] == {"prompt": "a cat"} - - -def test_string_prompt_not_mutated(pipeline): - """String prompts (not dicts) must pass through unchanged.""" - req = _make_request(prompts=["a cat dancing"]) - pipeline._sanitize_dmd2_request(req) - assert req.prompts == ["a cat dancing"] - - -def test_multiple_prompts_all_sanitized(pipeline): - req = _make_request( - prompts=[ - {"prompt": "a cat", "negative_prompt": "blurry"}, - {"prompt": "a dog", "negative_prompt": "ugly"}, - ] - ) - pipeline._sanitize_dmd2_request(req) - for p in req.prompts: - assert "negative_prompt" not in p - - -# --------------------------------------------------------------------------- -# Clean request — nothing changes -# --------------------------------------------------------------------------- - - 
-def test_clean_request_no_changes(pipeline): - req = _make_request( - guidance_scale=pipeline.dmd2_guidance_scale, - guidance_scale_provided=False, - do_classifier_free_guidance=False, - is_cfg_negative=False, - ) - pipeline._sanitize_dmd2_request(req) - assert req.sampling_params.guidance_scale == pipeline.dmd2_guidance_scale - assert req.sampling_params.guidance_scale_provided is False - assert req.sampling_params.guidance_scale_2 is None - assert req.sampling_params.true_cfg_scale is None - assert req.sampling_params.do_classifier_free_guidance is False - assert req.sampling_params.is_cfg_negative is False diff --git a/tests/diffusion/models/dmd2/test_dmd2_scheduler.py b/tests/diffusion/models/dmd2/test_dmd2_scheduler.py deleted file mode 100644 index 32d00dbf18e..00000000000 --- a/tests/diffusion/models/dmd2/test_dmd2_scheduler.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from unittest.mock import MagicMock, patch - -import pytest -import torch - -from vllm_omni.diffusion.models.ltx2.pipeline_ltx2 import LTX2Pipeline, LTX2T2VDMD2Pipeline -from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_image2video import LTX2I2VDMD2Pipeline, LTX2ImageToVideoPipeline -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import Wan22Pipeline, WanT2VDMD2Pipeline -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_i2v import Wan22I2VPipeline, WanI2VDMD2Pipeline -from vllm_omni.diffusion.request import OmniDiffusionRequest, OmniDiffusionSamplingParams - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - -_DMD2_TIMESTEPS = [999, 937, 833, 624] - -# DMD2 subclass → immediate base pipeline whose __init__ loads model weights (mocked in tests). 
-_DMD2_BASE = { - WanT2VDMD2Pipeline: Wan22Pipeline, - WanI2VDMD2Pipeline: Wan22I2VPipeline, - LTX2T2VDMD2Pipeline: LTX2Pipeline, - LTX2I2VDMD2Pipeline: LTX2ImageToVideoPipeline, -} - - -def _make_pipeline(cls): - """Run the DMD2 __init__ (including __init_dmd2__) with the base pipeline mocked.""" - - base = _DMD2_BASE[cls] - od_config = MagicMock() - od_config.model = "/nonexistent" - - def _mock_base_init(self, *a, **kw): - self.od_config = od_config # __init_dmd2__ needs this - - with patch.object(base, "__init__", _mock_base_init): - pipeline = object.__new__(cls) - torch.nn.Module.__init__(pipeline) - cls.__init__(pipeline, od_config=od_config) - return pipeline - - -def _make_request(**sp_kwargs) -> OmniDiffusionRequest: - sp = OmniDiffusionSamplingParams(**sp_kwargs) - return OmniDiffusionRequest(prompts=[{"prompt": "a cat"}], sampling_params=sp) - - -@pytest.fixture( - params=list(_DMD2_BASE.keys()), - ids=["wan_t2v", "wan_i2v", "ltx2_t2v", "ltx2_i2v"], -) -def pipeline(request): - return _make_pipeline(request.param) - - -# --------------------------------------------------------------------------- -# forward() timestep injection -# --------------------------------------------------------------------------- - - -def _fake_parent_forward(self, req, *args, num_inference_steps=40, **kwargs): - """Stub that calls set_timesteps as the real parent does.""" - self.scheduler.set_timesteps(num_inference_steps, device="cpu") - return MagicMock() - - -def test_forward_timesteps_match_dmd2_schedule(pipeline): - """After forward() runs, scheduler.timesteps must equal the DMD2 training schedule.""" - parent = _DMD2_BASE[type(pipeline)] - - # Baseline: calling set_timesteps(40) without the DMD2 override gives a different schedule - pipeline.scheduler.set_timesteps(40, device="cpu") - default_timesteps = pipeline.scheduler.timesteps.long().tolist() - assert default_timesteps == _DMD2_TIMESTEPS, ( - "DMD2EulerScheduler should always return DMD2 timesteps regardless of 
num_steps" - ) - - with patch.object(parent, "forward", _fake_parent_forward): - pipeline.forward(_make_request()) - - assert pipeline.scheduler.timesteps.long().tolist() == _DMD2_TIMESTEPS - - -def test_forward_timesteps_idempotent_across_calls(pipeline): - """Successive forward() calls must not cause scheduler state to drift.""" - parent = _DMD2_BASE[type(pipeline)] - - with patch.object(parent, "forward", _fake_parent_forward): - pipeline.forward(_make_request()) - pipeline.forward(_make_request()) - - assert pipeline.scheduler.timesteps.long().tolist() == _DMD2_TIMESTEPS diff --git a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py index c613bb0b4c8..a2d1fe6abd3 100644 --- a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py +++ b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py @@ -1,8 +1,8 @@ +from unittest.mock import MagicMock, patch + import pytest import torch -from pytest_mock import MockerFixture -from tests.helpers.mark import hardware_test from vllm_omni.diffusion.models.flux2.flux2_transformer import ( Flux2PosEmbed, Flux2Transformer2DModel, @@ -11,24 +11,19 @@ # Initialize TP group before tests @pytest.fixture(scope="function", autouse=True) -def setup_tp_group(mocker: MockerFixture): +def setup_tp_group(): """Set up TP group for each test function""" - mocker.patch( - "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", - return_value=2, - ) - mock_get_tp_group = mocker.patch("vllm.distributed.parallel_state.get_tp_group") - mock_tp_group = mocker.MagicMock() - mock_tp_group.world_size = 2 - mock_get_tp_group.return_value = mock_tp_group - yield + with patch("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", return_value=2): + with patch("vllm.distributed.parallel_state.get_tp_group") as mock_get_tp_group: + mock_tp_group = MagicMock() + mock_tp_group.world_size = 2 + mock_get_tp_group.return_value = mock_tp_group + yield 
class TestFlux2TransformerWeightLoading: """Test Flux2Transformer weight loading functionality""" - @pytest.mark.core_model - @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_weight_loading_tp2(self, setup_tp_group): """Verify weights load correctly with TP=2""" # Prepare test data @@ -83,8 +78,6 @@ def test_weight_loading_tp2(self, setup_tp_group): class TestFlux2RopePositionEmbedding: """Test Flux2 RoPE position embedding functionality""" - @pytest.mark.core_model - @pytest.mark.cpu def test_rope_position_embedding(self): """Verify RoPE produces correct embeddings for 4D coordinates""" # Prepare test data - use model default configuration @@ -139,8 +132,6 @@ def test_rope_position_embedding(self): class TestFlux2PackedModuleMapping: """Test Flux2 packed module mapping functionality""" - @pytest.mark.core_model - @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_packed_module_mapping(self, setup_tp_group): """Verify to_qkv packing matches HF checkpoint""" model = Flux2Transformer2DModel( @@ -217,8 +208,6 @@ def test_packed_module_mapping(self, setup_tp_group): f"add_kv_proj weight dimension should be {expected_add_kv_shape}, got {attn_block.add_kv_proj.weight.shape}" ) - @pytest.mark.core_model - @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_packed_mapping_edge_cases(self, setup_tp_group): """Test edge cases for packed mapping""" model = Flux2Transformer2DModel( diff --git a/tests/diffusion/models/glm_image/test_glm_image_sp.py b/tests/diffusion/models/glm_image/test_glm_image_sp.py deleted file mode 100644 index 06a1a116dff..00000000000 --- a/tests/diffusion/models/glm_image/test_glm_image_sp.py +++ /dev/null @@ -1,132 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for GLM-Image Sequence Parallelism support.""" - -import pytest - -from vllm_omni.diffusion.data import DiffusionParallelConfig - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - 
-@pytest.fixture(scope="function", autouse=True) -def setup_sp_groups(mocker): - """Set up SP and TP groups for each test function.""" - mock_get_sp_group = mocker.patch("vllm_omni.diffusion.distributed.parallel_state.get_sp_group") - mocker.patch("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", return_value=1) - mock_get_tp_group = mocker.patch("vllm.distributed.parallel_state.get_tp_group") - - mock_sp_group = mocker.MagicMock() - mock_sp_group.world_size = 4 - mock_get_sp_group.return_value = mock_sp_group - - mock_tp_group = mocker.MagicMock() - mock_tp_group.world_size = 1 - mock_get_tp_group.return_value = mock_tp_group - yield - - -def test_glm_image_sp_plan_defined(): - """Test that _sp_plan is properly defined on GlmImageTransformer2DModel.""" - from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( - GlmImageTransformer2DModel, - ) - - assert hasattr(GlmImageTransformer2DModel, "_sp_plan") - plan = GlmImageTransformer2DModel._sp_plan - assert plan is not None - - # Verify plan structure - assert "prepare" in plan - assert "proj_out" in plan - - -def test_glm_image_sp_plan_valid(): - """Validate _sp_plan structure.""" - from vllm_omni.diffusion.distributed.sp_plan import validate_sp_plan - from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( - GlmImageTransformer2DModel, - ) - - plan = GlmImageTransformer2DModel._sp_plan - validate_sp_plan(plan) - - -def test_glm_image_prepare_module_exists(): - """Test that GlmImagePrepare module exists.""" - from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( - GlmImagePrepare, - ) - - assert GlmImagePrepare is not None - - -def test_glm_image_attention_accepts_parallel_config(): - """Test that GlmImageAttention accepts parallel_config parameter.""" - from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( - GlmImageAttention, - ) - - parallel_config = DiffusionParallelConfig( - ulysses_degree=2, - ring_degree=2, - 
tensor_parallel_size=1, - sequence_parallel_size=4, - ) - - attn = GlmImageAttention( - dim=2560, - num_heads=64, - head_dim=40, - parallel_config=parallel_config, - ) - - assert attn.parallel_config is not None - assert attn.parallel_config.sequence_parallel_size == 4 - - -def test_glm_image_transformer_block_accepts_parallel_config(): - """Test that GlmImageTransformerBlock accepts parallel_config parameter.""" - from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( - GlmImageTransformerBlock, - ) - - parallel_config = DiffusionParallelConfig( - ulysses_degree=2, - ring_degree=2, - tensor_parallel_size=1, - sequence_parallel_size=4, - ) - - block = GlmImageTransformerBlock( - dim=2560, - num_attention_heads=64, - attention_head_dim=40, - time_embed_dim=512, - parallel_config=parallel_config, - ) - - assert block.attn1.parallel_config is not None - assert block.attn1.parallel_config.sequence_parallel_size == 4 - - -def test_glm_image_has_sp_support(): - """Test that GLM-Image has SP support implemented.""" - from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( - GlmImageTransformer2DModel, - ) - - # Check that the model has parallel_config support - assert hasattr(GlmImageTransformer2DModel, "__init__") - - # Verify the model can be instantiated with SP config - - # This test just verifies the structure exists - # Actual SP testing requires multi-GPU setup - - -@pytest.mark.cuda -@pytest.mark.sp -def test_glm_image_sp_inference(): - """Test SP inference (requires multi-GPU setup).""" - pytest.skip("Requires multi-GPU SP setup") diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py deleted file mode 100644 index 51f6a85f580..00000000000 --- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py +++ /dev/null @@ -1,190 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project -"""Unit tests for HunyuanImage3 AR sampler logic (stage transitions, -ratio restriction, comprehension blocking).""" - -import pytest -import torch - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - -# Fake token IDs for testing (avoid importing the real model). -END_OF_THINK = 100 -RECAPTION = 101 -END_OF_RECAPTION = 102 -ANSWER = 103 -BOI = 104 -SIZE_TOKEN = 105 -EOS = 106 -RATIO_START = 200 -RATIO_END = 210 -RATIO_OTHER_START = 220 -RATIO_OTHER_END = 223 - - -class FakeSamplerModel: - """Minimal stub that replicates the sampler-relevant attributes of - HunyuanImage3ForConditionalGeneration without loading real weights.""" - - def __init__(self, *, is_comprehension: bool = False): - self._is_comprehension = is_comprehension - self._eos_token_id = EOS - self._end_of_think_id = END_OF_THINK - self._recaption_id = RECAPTION - self._end_of_recaption_id = END_OF_RECAPTION - self._answer_id = ANSWER - self._mrope_boi_token_id = BOI - self._size_token_id = SIZE_TOKEN - self._start_ratio_id = RATIO_START - self._end_ratio_id = RATIO_END - self._ratio_other_slices = [(RATIO_OTHER_START, RATIO_OTHER_END + 1)] - self._all_ratio_ids = set(range(RATIO_START, RATIO_END + 1)) - self._all_ratio_ids.update(range(RATIO_OTHER_START, RATIO_OTHER_END + 1)) - - self._stage_transitions: dict[int, list[int]] = {} - if not is_comprehension: - self._stage_transitions[END_OF_THINK] = [RECAPTION] - self._stage_transitions[END_OF_RECAPTION] = [ANSWER, BOI, SIZE_TOKEN] - - self._blocked_token_ids: set[int] = set() - if is_comprehension: - self._blocked_token_ids.update([BOI, SIZE_TOKEN]) - self._blocked_token_ids.update(self._all_ratio_ids) - - # Bind the real methods from the model class. 
- from vllm_omni.model_executor.models.hunyuan_image3.hunyuan_image3 import ( - HunyuanImage3ForConditionalGeneration as _Real, - ) - - _get_forced_token = _Real._get_forced_token - _apply_ratio_restriction = _Real._apply_ratio_restriction - - -class TestGetForcedToken: - """Tests for the stateless _get_forced_token method.""" - - def setup_method(self): - self.model = FakeSamplerModel(is_comprehension=False) - - def test_no_trigger_returns_none(self): - assert self.model._get_forced_token([1, 2, 3]) is None - - def test_empty_history_returns_none(self): - assert self.model._get_forced_token([]) is None - - def test_end_of_think_forces_recaption(self): - assert self.model._get_forced_token([END_OF_THINK]) == RECAPTION - - def test_end_of_think_completed(self): - assert self.model._get_forced_token([END_OF_THINK, RECAPTION]) is None - - def test_end_of_recaption_forces_answer(self): - tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION] - assert self.model._get_forced_token(tokens) == ANSWER - - def test_end_of_recaption_forces_boi_after_answer(self): - tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION, ANSWER] - assert self.model._get_forced_token(tokens) == BOI - - def test_end_of_recaption_forces_size_after_boi(self): - tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION, ANSWER, BOI] - assert self.model._get_forced_token(tokens) == SIZE_TOKEN - - def test_full_sequence_complete(self): - tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION, ANSWER, BOI, SIZE_TOKEN] - assert self.model._get_forced_token(tokens) is None - - def test_diverged_history_returns_none(self): - tokens = [END_OF_RECAPTION, 999] # 999 != ANSWER - assert self.model._get_forced_token(tokens) is None - - def test_later_trigger_takes_precedence(self): - tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION] - assert self.model._get_forced_token(tokens) == ANSWER - - def test_trigger_with_extra_tokens_before(self): - tokens = [1, 2, 3, END_OF_THINK] - assert self.model._get_forced_token(tokens) 
== RECAPTION - - -class TestComprehensionBlocking: - """Tests for comprehension mode token blocking.""" - - def test_blocked_tokens_masked(self): - model = FakeSamplerModel(is_comprehension=True) - vocab_size = 300 - logits = torch.zeros(1, vocab_size) - logits[0, BOI] = 5.0 - logits[0, SIZE_TOKEN] = 3.0 - logits[0, RATIO_START] = 2.0 - min_score = torch.finfo(logits.dtype).min - - for tid in model._blocked_token_ids: - if tid < vocab_size: - logits[0, tid] = min_score - - assert logits[0, BOI].item() == min_score - assert logits[0, SIZE_TOKEN].item() == min_score - assert logits[0, RATIO_START].item() == min_score - - def test_non_blocked_tokens_preserved(self): - model = FakeSamplerModel(is_comprehension=True) - vocab_size = 300 - logits = torch.zeros(1, vocab_size) - logits[0, 50] = 7.0 - min_score = torch.finfo(logits.dtype).min - - for tid in model._blocked_token_ids: - if tid < vocab_size: - logits[0, tid] = min_score - - assert logits[0, 50].item() == 7.0 - - -class TestRatioRestriction: - """Tests for _apply_ratio_restriction (greedy: only argmax ratio survives).""" - - def test_greedy_selects_single_ratio_token(self): - model = FakeSamplerModel(is_comprehension=False) - vocab_size = 300 - logits = torch.zeros(1, vocab_size) - logits[0, RATIO_START + 3] = 10.0 - logits[0, RATIO_START + 1] = 5.0 - logits[0, 50] = 20.0 # non-ratio, should be masked - min_score = torch.finfo(logits.dtype).min - - model._apply_ratio_restriction(logits, 0, min_score) - - assert logits[0, RATIO_START + 3].item() == 0 - assert logits[0, RATIO_START + 1].item() == min_score - assert logits[0, 50].item() == min_score - - def test_extra_ratio_slices_considered(self): - model = FakeSamplerModel(is_comprehension=False) - vocab_size = 300 - logits = torch.zeros(1, vocab_size) - logits[0, RATIO_OTHER_START] = 15.0 - logits[0, RATIO_START] = 5.0 - min_score = torch.finfo(logits.dtype).min - - model._apply_ratio_restriction(logits, 0, min_score) - - assert logits[0, 
RATIO_OTHER_START].item() == 0 - assert logits[0, RATIO_START].item() == min_score - - -class TestForceEosAfterRatio: - """Tests that a ratio token as last_token forces EOS.""" - - def test_ratio_token_forces_eos(self): - model = FakeSamplerModel(is_comprehension=False) - vocab_size = 300 - logits = torch.randn(1, vocab_size) - min_score = torch.finfo(logits.dtype).min - - logits[0].fill_(min_score) - logits[0, model._eos_token_id] = 0 - - assert logits[0, EOS].item() == 0 - non_eos_max = logits[0, :EOS].max().item() - assert non_eos_max == min_score diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py b/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py similarity index 85% rename from tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py rename to tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py index 626f78eed9c..2cda9116c7d 100644 --- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py +++ b/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py @@ -12,7 +12,7 @@ class TestSetForwardContextNumTokens: def test_sets_num_tokens_when_context_available(self, mocker): """num_tokens should be set on ForwardContext when available.""" - import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe mock_ctx = mocker.MagicMock() del mock_ctx.in_profile_run # simulate missing attr @@ -26,7 +26,7 @@ def test_sets_num_tokens_when_context_available(self, mocker): def test_sets_in_profile_run_only_if_missing(self, mocker): """in_profile_run should not be overwritten if already set.""" - import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe mock_ctx = mocker.MagicMock() mock_ctx.in_profile_run = True # already set @@ -40,7 +40,7 @@ def 
test_sets_in_profile_run_only_if_missing(self, mocker): def test_noop_when_context_unavailable(self, mocker): """Should do nothing when ForwardContext is not available.""" - import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe mocker.patch.object(hunyuan_moe._vllm_fc, "is_forward_context_available", return_value=False) mock_get = mocker.patch.object(hunyuan_moe._vllm_fc, "get_forward_context") @@ -55,11 +55,11 @@ class TestHunyuanFusedMoEPlatformDispatch: def test_default_platform_uses_default_impl_qualname(self, mocker): """HunyuanFusedMoE should resolve the impl class from the platform hook.""" - import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe mock_platform = mocker.MagicMock() mock_platform.get_diffusion_model_impl_qualname.return_value = ( - "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" ) mocker.patch.object( @@ -71,7 +71,7 @@ def test_default_platform_uses_default_impl_qualname(self, mocker): mock_impl = mocker.MagicMock() mock_resolve.return_value = mock_impl - from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) @@ -80,7 +80,7 @@ def test_default_platform_uses_default_impl_qualname(self, mocker): mock_platform.prepare_diffusion_op_runtime.assert_called_once_with("hunyuan_fused_moe") mock_platform.get_diffusion_model_impl_qualname.assert_called_once_with("hunyuan_fused_moe") mock_resolve.assert_called_once_with( - "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" ) 
mock_impl.assert_called_once_with(prefix="") @@ -90,7 +90,7 @@ class TestHunyuanFusedMoEFactory: def test_new_delegates_to_impl_class(self, mocker): """HunyuanFusedMoE(prefix=..., **kwargs) should instantiate and return impl instance.""" - import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe class MockImpl: def __init__(self, *, prefix: str = "", **kwargs): @@ -104,7 +104,7 @@ def __init__(self, *, prefix: str = "", **kwargs): mock_impl_class = mocker.MagicMock(return_value=MockImpl(prefix="test", a=1)) mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=mock_impl_class) - from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) @@ -119,7 +119,7 @@ def __init__(self, *, prefix: str = "", **kwargs): def test_make_expert_params_mapping_delegates_to_impl(self, mocker): """make_expert_params_mapping should delegate to impl class method.""" - import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe expected_mapping = [("a", "b", 0, "c")] mock_platform = mocker.MagicMock() @@ -130,7 +130,7 @@ def test_make_expert_params_mapping_delegates_to_impl(self, mocker): mock_impl_class.make_expert_params_mapping = mocker.MagicMock(return_value=expected_mapping) mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=mock_impl_class) - from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) diff --git a/tests/diffusion/models/ltx2/test_ltx2_3_pipeline.py b/tests/diffusion/models/ltx2/test_ltx2_3_pipeline.py deleted file mode 100644 index 665126df737..00000000000 --- 
a/tests/diffusion/models/ltx2/test_ltx2_3_pipeline.py +++ /dev/null @@ -1,230 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Unit tests for LTX-2.3 pipeline integration. - -These tests verify: -- Pipeline is properly registered in the diffusion registry -- Post-process function is registered -- Cache-DiT enablers are registered -- Pipeline does NOT inherit from LTX2Pipeline -- Vocoder sample rate detection logic -- Re-export module works correctly -""" - -import json -import os -import tempfile - -import pytest - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -class TestPipelineIndependence: - """Verify LTX23Pipeline is fully independent from LTX2Pipeline.""" - - def test_ltx23_pipeline_does_not_inherit_from_ltx2(self): - """LTX23Pipeline must NOT inherit from LTX2Pipeline.""" - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2 import LTX2Pipeline - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import LTX23Pipeline - - assert not issubclass(LTX23Pipeline, LTX2Pipeline), ( - "LTX23Pipeline should be fully independent and not inherit from LTX2Pipeline" - ) - - def test_ltx23_pipeline_is_nn_module(self): - """LTX23Pipeline must be an nn.Module.""" - import torch.nn as nn - - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import LTX23Pipeline - - assert issubclass(LTX23Pipeline, nn.Module) - - def test_ltx23_pipeline_has_progress_bar(self): - """LTX23Pipeline must mix in ProgressBarMixin.""" - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import LTX23Pipeline - from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin - - assert issubclass(LTX23Pipeline, ProgressBarMixin) - - -class TestRegistryIntegration: - """Verify all LTX-2.3 pipeline variants are registered.""" - - def test_pipeline_models_registered(self): - """LTX-2.3 pipeline variants must be in _DIFFUSION_MODELS.""" - from vllm_omni.diffusion.registry import _DIFFUSION_MODELS - - expected = 
[ - "LTX23Pipeline", - "LTX23ImageToVideoPipeline", - ] - for name in expected: - assert name in _DIFFUSION_MODELS, f"{name} not found in _DIFFUSION_MODELS" - - def test_pipeline_module_paths(self): - """Registry entries must point to the correct modules.""" - from vllm_omni.diffusion.registry import _DIFFUSION_MODELS - - # T2V -> pipeline_ltx2_3 - assert _DIFFUSION_MODELS["LTX23Pipeline"] == ("ltx2", "pipeline_ltx2_3", "LTX23Pipeline") - - # I2V -> pipeline_ltx2_3_image2video - assert _DIFFUSION_MODELS["LTX23ImageToVideoPipeline"] == ( - "ltx2", - "pipeline_ltx2_3_image2video", - "LTX23ImageToVideoPipeline", - ) - - def test_post_process_funcs_registered(self): - """Pipeline variants must map to get_ltx2_post_process_func.""" - from vllm_omni.diffusion.registry import _DIFFUSION_POST_PROCESS_FUNCS - - expected = [ - "LTX23Pipeline", - "LTX23ImageToVideoPipeline", - ] - for name in expected: - assert name in _DIFFUSION_POST_PROCESS_FUNCS, f"{name} not in _DIFFUSION_POST_PROCESS_FUNCS" - assert _DIFFUSION_POST_PROCESS_FUNCS[name] == "get_ltx2_post_process_func" - - def test_cache_dit_enablers_registered(self): - """Pipeline variants must be registered in CUSTOM_DIT_ENABLERS.""" - from vllm_omni.diffusion.cache.cache_dit_backend import CUSTOM_DIT_ENABLERS - - expected = [ - "LTX23Pipeline", - "LTX23ImageToVideoPipeline", - ] - for name in expected: - assert name in CUSTOM_DIT_ENABLERS, f"{name} not in CUSTOM_DIT_ENABLERS" - - -class TestVocoderSampleRateDetection: - """Test _detect_vocoder_output_sample_rate logic.""" - - def test_detects_48khz_from_config(self): - """Should detect output_sampling_rate=48000 from vocoder/config.json.""" - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import _detect_vocoder_output_sample_rate - - with tempfile.TemporaryDirectory() as tmpdir: - vocoder_dir = os.path.join(tmpdir, "vocoder") - os.makedirs(vocoder_dir) - with open(os.path.join(vocoder_dir, "config.json"), "w") as f: - json.dump({"output_sampling_rate": 48000, 
"input_sampling_rate": 16000}, f) - - result = _detect_vocoder_output_sample_rate(tmpdir) - assert result == 48000 - - def test_returns_none_for_no_output_sr(self): - """Should return None if vocoder config has no output_sampling_rate.""" - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import _detect_vocoder_output_sample_rate - - with tempfile.TemporaryDirectory() as tmpdir: - vocoder_dir = os.path.join(tmpdir, "vocoder") - os.makedirs(vocoder_dir) - with open(os.path.join(vocoder_dir, "config.json"), "w") as f: - json.dump({"sampling_rate": 16000}, f) - - result = _detect_vocoder_output_sample_rate(tmpdir) - assert result is None - - def test_returns_none_for_missing_directory(self): - """Should return None if vocoder directory doesn't exist.""" - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import _detect_vocoder_output_sample_rate - - result = _detect_vocoder_output_sample_rate("/nonexistent/path") - assert result is None - - -class TestPostProcessFunction: - """Test the post-process function factory.""" - - def test_post_process_includes_audio_sample_rate(self): - """Post-process func should include audio_sample_rate when detected.""" - import torch - - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import get_ltx2_post_process_func - - with tempfile.TemporaryDirectory() as tmpdir: - vocoder_dir = os.path.join(tmpdir, "vocoder") - os.makedirs(vocoder_dir) - with open(os.path.join(vocoder_dir, "config.json"), "w") as f: - json.dump({"output_sampling_rate": 48000}, f) - - # Create a minimal od_config mock - class MockConfig: - model = tmpdir - - func = get_ltx2_post_process_func(MockConfig()) - - video = torch.zeros(1, 3, 4, 64, 64) - audio = torch.zeros(1, 1, 48000) - result = func((video, audio)) - - assert isinstance(result, dict) - assert "video" in result - assert "audio" in result - assert result["audio_sample_rate"] == 48000 - - def test_post_process_without_vocoder_config(self): - """Post-process func should work without vocoder 
config (no audio_sample_rate key).""" - import torch - - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import get_ltx2_post_process_func - - class MockConfig: - model = "/nonexistent/path" - - func = get_ltx2_post_process_func(MockConfig()) - - video = torch.zeros(1, 3, 4, 64, 64) - audio = torch.zeros(1, 1, 16000) - result = func((video, audio)) - - assert isinstance(result, dict) - assert "video" in result - assert "audio" in result - assert "audio_sample_rate" not in result - - -class TestReExportModule: - """Test that pipeline_ltx2_3_image2video.py correctly re-exports.""" - - def test_i2v_classes_importable(self): - """I2V classes must be importable from the re-export module.""" - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3_image2video import LTX23ImageToVideoPipeline - - assert LTX23ImageToVideoPipeline is not None - - def test_post_process_func_importable(self): - """get_ltx2_post_process_func must be importable from re-export module.""" - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3_image2video import get_ltx2_post_process_func - - assert callable(get_ltx2_post_process_func) - - def test_i2v_classes_are_same_as_direct_import(self): - """Re-exported classes must be the same objects as direct imports.""" - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3 import LTX23ImageToVideoPipeline as Direct - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3_image2video import ( - LTX23ImageToVideoPipeline as ReExported, - ) - - assert Direct is ReExported - - -class TestInitExports: - """Test that __init__.py exports all LTX-2.3 classes.""" - - def test_all_ltx23_classes_exported(self): - """All LTX-2.3 pipeline classes must be in the ltx2 package __all__.""" - from vllm_omni.diffusion.models import ltx2 - - expected_classes = [ - "LTX23Pipeline", - "LTX23ImageToVideoPipeline", - ] - for name in expected_classes: - assert hasattr(ltx2, name), f"{name} not exported from ltx2 package" - assert name in ltx2.__all__, f"{name} not in 
ltx2.__all__" diff --git a/tests/diffusion/models/ltx2/test_ltx2_cfg_parallel_adaptation.py b/tests/diffusion/models/ltx2/test_ltx2_cfg_parallel_adaptation.py deleted file mode 100644 index bbfe63dfa58..00000000000 --- a/tests/diffusion/models/ltx2/test_ltx2_cfg_parallel_adaptation.py +++ /dev/null @@ -1,58 +0,0 @@ -from types import SimpleNamespace - -import pytest -import torch - -from vllm_omni.diffusion.models.ltx2.pipeline_ltx2 import LTX2Pipeline - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -def _make_pipeline(sequence_parallel_size: int = 1) -> LTX2Pipeline: - pipeline = object.__new__(LTX2Pipeline) - torch.nn.Module.__init__(pipeline) - pipeline.audio_vae_temporal_compression_ratio = 4 - pipeline.audio_vae_mel_compression_ratio = 4 - pipeline.od_config = SimpleNamespace(parallel_config=SimpleNamespace(sequence_parallel_size=sequence_parallel_size)) - # Mock audio_vae with identity normalization (mean=0, std=1) so - # _normalize_audio_latents is a no-op and test values are preserved. 
- pipeline.audio_vae = SimpleNamespace( - latents_mean=torch.tensor(0.0), - latents_std=torch.tensor(1.0), - ) - return pipeline - - -def test_prepare_audio_latents_pads_packed_sequence_dim_for_provided_latents(): - pipeline = _make_pipeline(sequence_parallel_size=4) - latents = torch.arange(40, dtype=torch.float32).view(1, 10, 4) - - padded, original_num_frames, padded_num_frames = pipeline.prepare_audio_latents( - batch_size=1, - num_channels_latents=2, - num_mel_bins=8, - audio_latent_length=10, - dtype=torch.float32, - device=torch.device("cpu"), - latents=latents, - ) - - assert original_num_frames == 10 - assert padded_num_frames == 12 - assert padded.shape == (1, 12, 4) - torch.testing.assert_close(padded[:, :10], latents) - torch.testing.assert_close(padded[:, 10:], torch.zeros(1, 2, 4)) - - -def test_unpad_audio_latents_restores_original_frames_before_unpack(): - pipeline = _make_pipeline() - original = torch.arange(40, dtype=torch.float32).view(1, 10, 4) - padded = torch.cat([original, torch.full((1, 2, 4), 999.0)], dim=1) - - unpadded = pipeline._unpad_audio_latents(padded, 10) - unpacked = pipeline._unpack_audio_latents(unpadded, latent_length=10, num_mel_bins=2) - expected = pipeline._unpack_audio_latents(original, latent_length=10, num_mel_bins=2) - - assert unpacked.shape == (1, 2, 10, 2) - assert not (unpacked == 999.0).any() - torch.testing.assert_close(unpacked, expected) diff --git a/tests/diffusion/models/ltx2/test_ltx2_hsdp.py b/tests/diffusion/models/ltx2/test_ltx2_hsdp.py deleted file mode 100644 index 4dd07e1bf82..00000000000 --- a/tests/diffusion/models/ltx2/test_ltx2_hsdp.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -import torch.nn as nn - -from vllm_omni.diffusion.models.ltx2.ltx2_transformer import LTX2VideoTransformer3DModel - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -def test_ltx2_exposes_hsdp_shard_conditions_for_transformer_blocks(): - model = object.__new__(LTX2VideoTransformer3DModel) - 
nn.Module.__init__(model) - model.transformer_blocks = nn.ModuleList([nn.Linear(4, 4) for _ in range(2)]) - model.norm_out = nn.LayerNorm(4) - - conditions = getattr(model, "_hsdp_shard_conditions", None) - - assert conditions is not None - assert len(conditions) == 1 - - matched = [] - for name, module in model.named_modules(): - if any(cond(name, module) for cond in conditions): - matched.append(name) - - assert matched == ["transformer_blocks.0", "transformer_blocks.1"] diff --git a/tests/diffusion/models/qwen_image/test_qwen_image_edit_plus.py b/tests/diffusion/models/qwen_image/test_qwen_image_edit_plus.py deleted file mode 100644 index 873b52bf7a6..00000000000 --- a/tests/diffusion/models/qwen_image/test_qwen_image_edit_plus.py +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import json -from pathlib import Path -from types import SimpleNamespace - -import numpy as np -import pytest -from PIL import Image - -from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image_edit_plus import ( - get_qwen_image_edit_plus_pre_process_func, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] - - -def test_qwen_image_edit_plus_rejects_too_many_input_images(tmp_path: Path): - vae_dir = tmp_path / "vae" - vae_dir.mkdir() - # Keep the mock config intentionally minimal: this test only needs the - # fields touched during pre-process initialization. 
- (vae_dir / "config.json").write_text(json.dumps({"z_dim": 16})) - - pre_process = get_qwen_image_edit_plus_pre_process_func(SimpleNamespace(model=str(tmp_path))) - image = Image.fromarray(np.zeros((32, 32, 3), dtype=np.uint8)) - request = SimpleNamespace( - prompts=[ - { - "prompt": "combine", - "multi_modal_data": {"image": [image, image, image, image, image]}, - } - ], - sampling_params=SimpleNamespace(height=None, width=None), - ) - - with pytest.raises(ValueError, match=r"At most 4 images are supported by this model"): - pre_process(request) diff --git a/tests/diffusion/models/qwen_image/test_qwen_image_max_sequence_length.py b/tests/diffusion/models/qwen_image/test_qwen_image_max_sequence_length.py deleted file mode 100644 index f5676a0056f..00000000000 --- a/tests/diffusion/models/qwen_image/test_qwen_image_max_sequence_length.py +++ /dev/null @@ -1,260 +0,0 @@ -import inspect -from types import SimpleNamespace - -import pytest -import torch -from torch import nn - -from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image import ( - QwenImagePipeline, -) -from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image_edit import ( - QwenImageEditPipeline, -) -from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image_edit_plus import ( - QwenImageEditPlusPipeline, -) -from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image_layered import ( - QwenImageLayeredPipeline, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -class _RejectingTextEncoder: - dtype = torch.float32 - - def __call__(self, *args, **kwargs): - raise AssertionError("text encoder should not run for prompts that exceed max_sequence_length") - - -class _FakeModelInputs: - def __init__(self, total_sequence_length: int): - attention_mask = torch.ones((1, total_sequence_length), dtype=torch.long) - self.input_ids = attention_mask.clone() - self.attention_mask = attention_mask - self.pixel_values = None - self.image_grid_thw = None - - def to(self, device): - return 
self - - -class _FakeTokenizer: - def __init__(self, total_sequence_length: int | list[int]): - if isinstance(total_sequence_length, list): - self.total_sequence_lengths = list(total_sequence_length) - else: - self.total_sequence_lengths = [total_sequence_length] - - def __call__(self, *args, **kwargs): - if len(self.total_sequence_lengths) > 1: - total_sequence_length = self.total_sequence_lengths.pop(0) - else: - total_sequence_length = self.total_sequence_lengths[0] - return _FakeModelInputs(total_sequence_length) - - -class _FakeProcessor(_FakeTokenizer): - pass - - -class _FakeScheduler: - def __init__(self): - self.begin_index = None - - def set_begin_index(self, begin_index: int): - self.begin_index = begin_index - - -PIPELINE_CASES = [ - pytest.param(QwenImagePipeline, 34, "tokenizer", id="qwen-image"), - pytest.param(QwenImageLayeredPipeline, 34, "tokenizer", id="qwen-image-layered"), - pytest.param(QwenImageEditPipeline, 64, "processor", id="qwen-image-edit"), - pytest.param(QwenImageEditPlusPipeline, 64, "processor", id="qwen-image-edit-plus"), -] - - -def _make_pipeline( - pipeline_class: type, - *, - total_sequence_length: int, - drop_idx: int, - input_kind: str, -): - pipeline = object.__new__(pipeline_class) - nn.Module.__init__(pipeline) - pipeline.device = torch.device("cpu") - pipeline.text_encoder = _RejectingTextEncoder() - pipeline.tokenizer_max_length = 1024 - pipeline.prompt_template_encode = "{}" - pipeline.prompt_template_encode_start_idx = drop_idx - pipeline.tokenizer = _FakeTokenizer([total_sequence_length, 0]) - if input_kind == "processor": - pipeline.processor = _FakeProcessor(total_sequence_length) - return pipeline - - -@pytest.mark.parametrize(("pipeline_class", "drop_idx", "input_kind"), PIPELINE_CASES) -def test_encode_prompt_rejects_prompt_longer_than_default_max_sequence_length( - pipeline_class: type, - drop_idx: int, - input_kind: str, -): - pipeline = _make_pipeline( - pipeline_class, - total_sequence_length=1025, - 
drop_idx=drop_idx, - input_kind=input_kind, - ) - - with pytest.raises(ValueError, match=r"got 1025 tokens, but `max_sequence_length` is 1024"): - pipeline.encode_prompt(prompt="prompt") - - -@pytest.mark.parametrize(("pipeline_class", "drop_idx", "input_kind"), PIPELINE_CASES) -def test_encode_prompt_rejects_prompt_longer_than_explicit_max_sequence_length( - pipeline_class: type, - drop_idx: int, - input_kind: str, -): - pipeline = _make_pipeline( - pipeline_class, - total_sequence_length=17, - drop_idx=drop_idx, - input_kind=input_kind, - ) - - with pytest.raises(ValueError, match=r"got 17 tokens, but `max_sequence_length` is 16"): - pipeline.encode_prompt(prompt="prompt", max_sequence_length=16) - - -def test_prepare_encode_defaults_to_tokenizer_max_length(): - pipeline = object.__new__(QwenImagePipeline) - nn.Module.__init__(pipeline) - pipeline.tokenizer_max_length = 1024 - pipeline.vae_scale_factor = 8 - pipeline.default_sample_size = 128 - pipeline.scheduler = _FakeScheduler() - pipeline._extract_prompts = lambda prompts: (["prompt"], None) - - captured = {} - - def _fake_prepare_generation_context(**kwargs): - captured["max_sequence_length"] = kwargs["max_sequence_length"] - embeds = torch.ones((1, 1, 1)) - mask = torch.ones((1, 1), dtype=torch.long) - return { - "prompt_embeds": embeds, - "prompt_embeds_mask": mask, - "negative_prompt_embeds": None, - "negative_prompt_embeds_mask": None, - "latents": embeds, - "timesteps": torch.tensor([1]), - "do_true_cfg": False, - "guidance": None, - "img_shapes": [[(1, 1, 1)]], - "txt_seq_lens": [1], - "negative_txt_seq_lens": None, - } - - pipeline._prepare_generation_context = _fake_prepare_generation_context - state = SimpleNamespace( - prompts=["prompt"], - sampling=SimpleNamespace( - height=None, - width=None, - num_inference_steps=None, - sigmas=None, - guidance_scale_provided=False, - num_outputs_per_prompt=0, - generator=None, - true_cfg_scale=None, - max_sequence_length=None, - ), - ) - - 
pipeline.prepare_encode(state) - - assert captured["max_sequence_length"] == 1024 - - -@pytest.mark.parametrize( - ("pipeline_class", "drop_idx"), - [ - pytest.param(QwenImageEditPipeline, 64, id="qwen-image-edit"), - pytest.param(QwenImageEditPlusPipeline, 64, id="qwen-image-edit-plus"), - ], -) -def test_edit_pipelines_validate_text_prompt_length_before_image_token_expansion( - pipeline_class: type, - drop_idx: int, -): - pipeline = object.__new__(pipeline_class) - nn.Module.__init__(pipeline) - pipeline.device = torch.device("cpu") - pipeline.text_encoder = _RejectingTextEncoder() - pipeline.tokenizer_max_length = 1024 - pipeline.prompt_template_encode = "{}" - pipeline.prompt_template_encode_start_idx = drop_idx - pipeline.tokenizer = _FakeTokenizer([8, 0]) - pipeline.processor = _FakeProcessor(drop_idx + 1500) - - with pytest.raises(AssertionError, match="text encoder should not run"): - pipeline.encode_prompt(prompt="short prompt") - - -@pytest.mark.parametrize( - "pipeline_class", - [ - pytest.param(QwenImagePipeline, id="qwen-image"), - pytest.param(QwenImageLayeredPipeline, id="qwen-image-layered"), - ], -) -def test_qwen_generation_validator_excludes_template_suffix_from_budget(pipeline_class: type): - pipeline = object.__new__(pipeline_class) - nn.Module.__init__(pipeline) - pipeline.device = torch.device("cpu") - pipeline.text_encoder = _RejectingTextEncoder() - pipeline.tokenizer_max_length = 1024 - pipeline.prompt_template_encode = "{}" - pipeline.prompt_template_encode_start_idx = 34 - pipeline.tokenizer = _FakeTokenizer([1029, 5]) - - with pytest.raises(AssertionError, match="text encoder should not run"): - pipeline.encode_prompt(prompt="boundary prompt") - - -@pytest.mark.parametrize( - "pipeline_class", - [ - pytest.param(QwenImageEditPipeline, id="qwen-image-edit"), - pytest.param(QwenImageEditPlusPipeline, id="qwen-image-edit-plus"), - ], -) -def test_qwen_edit_validator_excludes_image_placeholders_from_budget(pipeline_class: type): - pipeline 
= object.__new__(pipeline_class) - nn.Module.__init__(pipeline) - pipeline.device = torch.device("cpu") - pipeline.text_encoder = _RejectingTextEncoder() - pipeline.tokenizer_max_length = 1024 - pipeline.prompt_template_encode = "{}" - pipeline.prompt_template_encode_start_idx = 64 - pipeline.tokenizer = _FakeTokenizer([30, 20]) - pipeline.processor = _FakeProcessor(1500) - - with pytest.raises(AssertionError, match="text encoder should not run"): - pipeline.encode_prompt(prompt="short prompt") - - -@pytest.mark.parametrize( - "pipeline_class", - [ - QwenImagePipeline, - QwenImageLayeredPipeline, - QwenImageEditPipeline, - QwenImageEditPlusPipeline, - ], -) -def test_forward_max_sequence_length_default_is_1024(pipeline_class: type): - assert inspect.signature(pipeline_class.forward).parameters["max_sequence_length"].default == 1024 diff --git a/tests/diffusion/models/qwen_image/test_qwen_image_size_utils.py b/tests/diffusion/models/qwen_image/test_qwen_image_size_utils.py deleted file mode 100644 index 7ba8f108a13..00000000000 --- a/tests/diffusion/models/qwen_image/test_qwen_image_size_utils.py +++ /dev/null @@ -1,26 +0,0 @@ -import pytest - -from vllm_omni.diffusion.utils.size_utils import ( - normalize_min_aligned_size, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -@pytest.mark.parametrize( - ("height", "width", "expected"), - [ - (1, 1, (16, 16)), - (15, 15, (16, 16)), - (17, 17, (16, 16)), - (31, 33, (16, 32)), - (64, 80, (64, 80)), - ], -) -def test_normalize_min_aligned_size_clamps_to_minimum_aligned_shape(height, width, expected): - assert normalize_min_aligned_size(height, width, alignment=16) == expected - - -def test_normalize_min_aligned_size_rejects_invalid_alignment(): - with pytest.raises(ValueError, match="positive alignment"): - normalize_min_aligned_size(16, 16, alignment=0) diff --git a/tests/diffusion/models/stable_audio/test_stable_audio_hsdp.py b/tests/diffusion/models/stable_audio/test_stable_audio_hsdp.py deleted file 
mode 100644 index 923b9a86315..00000000000 --- a/tests/diffusion/models/stable_audio/test_stable_audio_hsdp.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -import torch.nn as nn - -from vllm_omni.diffusion.models.stable_audio.stable_audio_transformer import StableAudioDiTModel - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -def test_stable_audio_exposes_hsdp_shard_conditions_for_transformer_blocks(): - model = object.__new__(StableAudioDiTModel) - nn.Module.__init__(model) - model.transformer_blocks = nn.ModuleList([nn.Linear(4, 4) for _ in range(2)]) - model.proj_out = nn.Linear(4, 4) - - conditions = getattr(model, "_hsdp_shard_conditions", None) - - assert conditions is not None - assert len(conditions) == 1 - - matched = [] - for name, module in model.named_modules(): - if any(cond(name, module) for cond in conditions): - matched.append(name) - - assert matched == ["transformer_blocks.0", "transformer_blocks.1"] diff --git a/tests/diffusion/models/t5_encoder/test_t5_encoder_prefix.py b/tests/diffusion/models/t5_encoder/test_t5_encoder_prefix.py deleted file mode 100644 index 039150f096c..00000000000 --- a/tests/diffusion/models/t5_encoder/test_t5_encoder_prefix.py +++ /dev/null @@ -1,164 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for T5EncoderModel prefix handling and weight loading fix.""" - -import pytest -import torch -from transformers import T5Config -from vllm.config import DeviceConfig, VllmConfig, set_current_vllm_config - -from vllm_omni.diffusion.models.t5_encoder.t5_encoder import T5EncoderModel - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -_SMALL_T5_CONFIG = dict( - d_model=64, - d_kv=8, - d_ff=128, - num_heads=8, - num_layers=2, - vocab_size=256, - relative_attention_num_buckets=32, - relative_attention_max_distance=128, - is_gated_act=True, - dense_act_fn="gelu_new", - layer_norm_epsilon=1e-6, - feed_forward_proj="gated-gelu", -) - 
-_T5_MODULE = "vllm_omni.diffusion.models.t5_encoder.t5_encoder" - - -@pytest.fixture -def t5_config() -> T5Config: - return T5Config(**_SMALL_T5_CONFIG) - - -@pytest.fixture(scope="function", autouse=True) -def setup_vllm_config(monkeypatch, mocker): - """Set up VllmConfig and TP=2 mocks for tests.""" - device_config = DeviceConfig(device="cpu") - - monkeypatch.setattr("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", lambda: 2) - monkeypatch.setattr(f"{_T5_MODULE}.get_tensor_model_parallel_world_size", lambda: 2) - monkeypatch.setattr( - "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size", - lambda: 2, - ) - - monkeypatch.setattr(f"{_T5_MODULE}.get_tensor_model_parallel_rank", lambda: 0) - monkeypatch.setattr( - "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank", - lambda: 0, - ) - - mock_tp_group = mocker.MagicMock() - mock_tp_group.world_size = 2 - mocker.patch("vllm.distributed.parallel_state.get_tp_group", return_value=mock_tp_group) - - monkeypatch.setattr(f"{_T5_MODULE}.get_act_fn", lambda _: torch.nn.GELU()) - - with set_current_vllm_config(VllmConfig(device_config=device_config)): - yield - - -class TestT5EncoderModelPrefixHandling: - """Test that T5EncoderModel correctly handles prefix attribute.""" - - def test_prefix_stored_in_model(self, t5_config): - """Test that prefix is stored in the model when provided.""" - prefix = "text_encoder" - model = T5EncoderModel(t5_config, prefix=prefix) - assert hasattr(model, "prefix") - assert model.prefix == prefix - - def test_prefix_empty_by_default(self, t5_config): - """Test that prefix defaults to empty string when not provided.""" - model = T5EncoderModel(t5_config) - assert hasattr(model, "prefix") - assert model.prefix == "" - - -class TestT5EncoderModelWeightLoadingWithPrefix: - """Test weight loading with prefix handling.""" - - def test_load_weights_with_prefix(self, t5_config): - """Test that weights without 
prefix are loaded when model has prefix.""" - config = T5Config(**{**_SMALL_T5_CONFIG, "num_layers": 1}) - model = T5EncoderModel(config, prefix="text_encoder") - - inner_dim = config.num_heads * config.d_kv - - weights = [ - ("encoder.block.0.layer.0.SelfAttention.q.weight", torch.randn(inner_dim, config.d_model)), - ("encoder.block.0.layer.0.SelfAttention.k.weight", torch.randn(inner_dim, config.d_model)), - ("encoder.block.0.layer.0.SelfAttention.v.weight", torch.randn(inner_dim, config.d_model)), - ] - - loaded = model.load_weights(weights) - assert len(loaded) > 0 - - def test_load_weights_embed_tokens_shared_sync(self, t5_config): - """Test that embed_tokens and shared weights are synced.""" - model = T5EncoderModel(t5_config, prefix="text_encoder") - - d_model = t5_config.d_model - vocab_size = t5_config.vocab_size - - embed_weight = torch.randn(vocab_size, d_model) - weights = [ - ("encoder.embed_tokens.weight", embed_weight.clone()), - ] - - model.load_weights(weights) - - shared_param = model.shared.weight - embed_param = model.encoder.embed_tokens.weight - - assert torch.allclose(shared_param, embed_param), ( - "shared and embed_tokens should have the same weights after loading" - ) - - def test_load_weights_shared_without_prefix(self, t5_config): - """Test shared.weight is recognized without relying on dot context.""" - model = T5EncoderModel(t5_config, prefix="text_encoder") - - shared_weight = torch.randn(t5_config.vocab_size, t5_config.d_model) - loaded = model.load_weights([("shared.weight", shared_weight)]) - - assert "shared.weight" in loaded - assert torch.allclose(model.shared.weight, model.encoder.embed_tokens.weight) - - def test_unmatched_weights_are_not_reported_loaded(self, t5_config): - """Test that skipped checkpoint weights are not added to loaded_params.""" - model = T5EncoderModel(t5_config, prefix="text_encoder") - - loaded = model.load_weights( - [ - ( - "text_encoder.encoder.block.0.layer.0.SelfAttention.missing.weight", - 
torch.randn(t5_config.d_model, t5_config.d_model), - ), - ] - ) - - assert loaded == set() - - -class TestT5EncoderModelWeightLoadingWithoutPrefix: - """Test weight loading without prefix.""" - - def test_load_weights_without_prefix(self, t5_config): - """Test that weights without prefix are loaded correctly.""" - config = T5Config(**{**_SMALL_T5_CONFIG, "num_layers": 1}) - model = T5EncoderModel(config) - - inner_dim = config.num_heads * config.d_kv - - weights = [ - ("encoder.block.0.layer.0.SelfAttention.q.weight", torch.randn(inner_dim, config.d_model)), - ] - - loaded = model.load_weights(weights) - assert len(loaded) > 0 diff --git a/tests/diffusion/models/wan2_2/__init__.py b/tests/diffusion/models/wan2_2/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/diffusion/models/wan2_2/conftest.py b/tests/diffusion/models/wan2_2/conftest.py deleted file mode 100644 index f836fa545fd..00000000000 --- a/tests/diffusion/models/wan2_2/conftest.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import annotations - -from contextlib import contextmanager -from types import SimpleNamespace - -import torch -from torch import nn - - -class StubTransformer(nn.Module): - def __init__(self, *, name: str = "transformer", in_channels: int = 4, out_channels: int = 4) -> None: - super().__init__() - self.name = name - self.config = SimpleNamespace( - patch_size=(1, 2, 2), - in_channels=in_channels, - out_channels=out_channels, - image_dim=None, - ) - - @property - def dtype(self) -> torch.dtype: - return torch.float32 - - def forward(self, **kwargs): - hidden_states = kwargs["hidden_states"] - return (torch.zeros_like(hidden_states[:, : self.config.out_channels]),) - - -class StubScheduler: - def __init__(self, timesteps: list[int]) -> None: - self.timesteps = torch.tensor(timesteps, dtype=torch.int64) - self.config = SimpleNamespace(num_train_timesteps=1000) - self.set_timesteps_calls: list[tuple[int, torch.device]] = [] - - def 
set_timesteps(self, num_steps: int, device: torch.device) -> None: - self.set_timesteps_calls.append((num_steps, device)) - - -class StubVAE: - dtype = torch.float32 - - def __init__(self, z_dim: int = 4) -> None: - self.config = SimpleNamespace( - z_dim=z_dim, - scale_factor_temporal=4, - scale_factor_spatial=8, - latents_mean=[0.0] * z_dim, - latents_std=[1.0] * z_dim, - ) - - def encode(self, video: torch.Tensor): - latent_frames = (video.shape[2] + self.config.scale_factor_temporal - 1) // self.config.scale_factor_temporal - latent_height = video.shape[-2] // self.config.scale_factor_spatial - latent_width = video.shape[-1] // self.config.scale_factor_spatial - latents = torch.ones( - video.shape[0], - self.config.z_dim, - latent_frames, - latent_height, - latent_width, - dtype=video.dtype, - device=video.device, - ) - return SimpleNamespace(latents=latents) - - def decode(self, latents: torch.Tensor, return_dict: bool = False): - del return_dict - return (latents,) - - -@contextmanager -def noop_progress_bar(*args, **kwargs): - del args, kwargs - - class Bar: - def update(self) -> None: - return None - - yield Bar() diff --git a/tests/diffusion/models/wan2_2/test_wan22_i2v_pipeline.py b/tests/diffusion/models/wan2_2/test_wan22_i2v_pipeline.py deleted file mode 100644 index 576678e2cf0..00000000000 --- a/tests/diffusion/models/wan2_2/test_wan22_i2v_pipeline.py +++ /dev/null @@ -1,125 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from types import SimpleNamespace - -import pytest -import torch -from PIL import Image -from torch import nn - -from tests.diffusion.models.wan2_2.conftest import StubTransformer, StubVAE, noop_progress_bar -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_i2v import ( - Wan22I2VPipeline, - get_wan22_i2v_pre_process_func, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] - - -def _make_i2v_pipeline(*, expand_timesteps: 
bool) -> Wan22I2VPipeline: - pipeline = object.__new__(Wan22I2VPipeline) - nn.Module.__init__(pipeline) - pipeline.device = torch.device("cpu") - pipeline.transformer = StubTransformer(name="high", in_channels=8, out_channels=4) - pipeline.transformer_2 = StubTransformer(name="low", in_channels=8, out_channels=4) - pipeline.vae = StubVAE(z_dim=4) - pipeline.vae_scale_factor_temporal = 4 - pipeline.vae_scale_factor_spatial = 8 - pipeline.expand_timesteps = expand_timesteps - pipeline.progress_bar = noop_progress_bar - return pipeline - - -def test_i2v_preprocess_requires_image_and_resizes_to_480p_aspect() -> None: - preprocess = get_wan22_i2v_pre_process_func(SimpleNamespace()) - request = SimpleNamespace( - prompts=[{"prompt": "p", "multi_modal_data": {"image": Image.new("RGB", (320, 160), "red")}}], - sampling_params=SimpleNamespace(height=None, width=None), - ) - - result = preprocess(request) - prompt = result.prompts[0] - - assert result.sampling_params.height == 432 - assert result.sampling_params.width == 880 - assert prompt["multi_modal_data"]["image"].size == (880, 432) - - missing_image = SimpleNamespace( - prompts=[{"prompt": "p", "multi_modal_data": {}}], - sampling_params=SimpleNamespace(height=None, width=None), - ) - with pytest.raises(ValueError, match="No image is provided"): - preprocess(missing_image) - - -def test_i2v_diffuse_selects_stage_guidance_and_expands_timesteps() -> None: - pipeline = _make_i2v_pipeline(expand_timesteps=True) - latents = torch.zeros(1, 4, 2, 4, 4) - condition = torch.ones_like(latents) - first_frame_mask = torch.ones(1, 1, 2, 4, 4) - first_frame_mask[:, :, 0] = 0 - timesteps = torch.tensor([900, 100]) - - calls = [] - - def fake_predict_noise_maybe_with_cfg(**kwargs): - positive = kwargs["positive_kwargs"] - calls.append( - { - "model": positive["current_model"].name, - "scale": kwargs["true_cfg_scale"], - "timestep_shape": tuple(positive["timestep"].shape), - "timestep_values": positive["timestep"].clone(), - 
"hidden_states": positive["hidden_states"].clone(), - } - ) - return torch.ones_like(latents) - - pipeline.predict_noise_maybe_with_cfg = fake_predict_noise_maybe_with_cfg # type: ignore[method-assign] - pipeline.scheduler_step_maybe_with_cfg = lambda noise, t, current, cfg: current + noise # type: ignore[method-assign] - - result = pipeline.diffuse( - latents=latents, - timesteps=timesteps, - prompt_embeds=torch.zeros(1, 2, 3), - negative_prompt_embeds=None, - image_embeds=None, - guidance_low=1.0, - guidance_high=2.0, - boundary_timestep=500.0, - dtype=torch.float32, - attention_kwargs={}, - condition=condition, - first_frame_mask=first_frame_mask, - ) - - assert [call["model"] for call in calls] == ["high", "low"] - assert [call["scale"] for call in calls] == [1.0, 2.0] - assert calls[0]["timestep_shape"] == (1, 8) - timestep_dtype = calls[0]["timestep_values"].dtype - torch.testing.assert_close(calls[0]["timestep_values"][0, :4], torch.zeros(4, dtype=timestep_dtype)) - torch.testing.assert_close(calls[0]["timestep_values"][0, 4:], torch.full((4,), 900, dtype=timestep_dtype)) - torch.testing.assert_close(calls[0]["hidden_states"][:, :, 0], torch.ones(1, 4, 4, 4)) - torch.testing.assert_close(result, torch.full_like(latents, 2.0)) - - -def test_i2v_prepare_latents_builds_expand_condition_and_first_frame_mask() -> None: - pipeline = _make_i2v_pipeline(expand_timesteps=True) - latents, condition, first_frame_mask = pipeline.prepare_latents( - image=torch.zeros(1, 3, 16, 16), - batch_size=1, - num_channels_latents=4, - height=16, - width=16, - num_frames=5, - dtype=torch.float32, - device=torch.device("cpu"), - generator=torch.Generator(device="cpu").manual_seed(0), - ) - - assert latents.shape == (1, 4, 2, 2, 2) - assert condition.shape == (1, 4, 1, 2, 2) - assert first_frame_mask.shape == (1, 1, 2, 2, 2) - assert first_frame_mask[:, :, 0].sum() == 0 - assert first_frame_mask[:, :, 1].sum() == 4 diff --git 
a/tests/diffusion/models/wan2_2/test_wan22_pipeline_diffuse.py b/tests/diffusion/models/wan2_2/test_wan22_pipeline_diffuse.py deleted file mode 100644 index 54bb672ef81..00000000000 --- a/tests/diffusion/models/wan2_2/test_wan22_pipeline_diffuse.py +++ /dev/null @@ -1,155 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from contextlib import contextmanager -from types import SimpleNamespace - -import pytest -import torch -from torch import nn - -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import Wan22Pipeline - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] - - -class _StubTransformer(nn.Module): - @property - def dtype(self) -> torch.dtype: - return torch.float32 - - -class _StubScheduler: - def __init__(self, timesteps: list[int]) -> None: - self.timesteps = torch.tensor(timesteps, dtype=torch.int64) - self.config = SimpleNamespace(num_train_timesteps=1000) - self.set_timesteps_calls: list[tuple[int, torch.device]] = [] - - def set_timesteps(self, num_steps: int, device: torch.device) -> None: - self.set_timesteps_calls.append((num_steps, device)) - - -@contextmanager -def _noop_progress_bar(*args, **kwargs): - del args, kwargs - - class _Bar: - def update(self) -> None: - return None - - yield _Bar() - - -def _make_pipeline() -> Wan22Pipeline: - pipeline = object.__new__(Wan22Pipeline) - nn.Module.__init__(pipeline) - pipeline.device = torch.device("cpu") - pipeline.transformer = _StubTransformer() - pipeline.transformer_2 = None - pipeline.transformer_config = SimpleNamespace(patch_size=(1, 2, 2), in_channels=4, out_channels=4) - pipeline.scheduler = _StubScheduler([9, 5]) - pipeline.od_config = SimpleNamespace(flow_shift=5.0) - pipeline._sample_solver = "unipc" - pipeline._flow_shift = 5.0 - pipeline.vae_scale_factor_temporal = 4 - pipeline.vae_scale_factor_spatial = 8 - pipeline.boundary_ratio = 0.875 - pipeline.expand_timesteps = False - 
pipeline._guidance_scale = None - pipeline._guidance_scale_2 = None - pipeline._num_timesteps = None - pipeline._current_timestep = None - pipeline.check_inputs = lambda **kwargs: None - pipeline.prepare_latents = lambda **kwargs: torch.zeros((1, 4, 1, 8, 8), dtype=torch.float32) - pipeline.progress_bar = _noop_progress_bar - return pipeline - - -def test_forward_delegates_denoising_to_diffuse(monkeypatch) -> None: - pipeline = _make_pipeline() - - prompt_embeds = torch.randn(1, 8) - captured: dict[str, object] = {} - - def _fake_diffuse(**kwargs): - captured.update(kwargs) - return kwargs["latents"] + 1 - - pipeline.diffuse = _fake_diffuse # type: ignore[method-assign] - - req = SimpleNamespace( - prompts=["prompt"], - sampling_params=SimpleNamespace( - height=None, - width=None, - num_frames=1, - num_inference_steps=2, - guidance_scale_provided=False, - guidance_scale=None, - guidance_scale_2=None, - boundary_ratio=None, - generator=None, - seed=None, - num_outputs_per_prompt=1, - max_sequence_length=32, - latents=None, - extra_args={}, - ), - ) - - output = pipeline.forward(req, prompt_embeds=prompt_embeds, output_type="latent", guidance_scale=1.0) - - assert torch.equal(output.output, torch.ones((1, 4, 1, 8, 8))) - assert torch.equal(captured["timesteps"], pipeline.scheduler.timesteps) - assert captured["guidance_low"] == 1.0 - assert captured["guidance_high"] == 1.0 - assert captured["boundary_timestep"] == pytest.approx(875.0) - assert captured["latent_condition"] is None - assert captured["first_frame_mask"] is None - assert pipeline.scheduler.set_timesteps_calls == [(2, torch.device("cpu"))] - - -def test_diffuse_runs_prediction_and_scheduler_for_each_timestep() -> None: - pipeline = _make_pipeline() - latents = torch.zeros((1, 1, 1, 2, 2), dtype=torch.float32) - timesteps = torch.tensor([7, 3], dtype=torch.int64) - prompt_embeds = torch.randn(1, 8) - - predict_calls: list[dict[str, object]] = [] - scheduler_calls: list[tuple[float, int, float, bool]] = [] 
- - def _fake_predict_noise_maybe_with_cfg(**kwargs): - predict_calls.append(kwargs) - timestep = kwargs["positive_kwargs"]["timestep"] - assert isinstance(timestep, torch.Tensor) - return torch.full_like(latents, float(timestep[0].item())) - - def _fake_scheduler_step_maybe_with_cfg(noise_pred, t, current_latents, do_true_cfg): - scheduler_calls.append( - (float(noise_pred[0, 0, 0, 0, 0]), int(t.item()), float(current_latents.sum()), do_true_cfg) - ) - return current_latents + noise_pred - - pipeline.predict_noise_maybe_with_cfg = _fake_predict_noise_maybe_with_cfg # type: ignore[method-assign] - pipeline.scheduler_step_maybe_with_cfg = _fake_scheduler_step_maybe_with_cfg # type: ignore[method-assign] - - result = pipeline.diffuse( - latents=latents, - timesteps=timesteps, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=None, - guidance_low=1.0, - guidance_high=2.0, - boundary_timestep=5.0, - dtype=torch.float32, - attention_kwargs={}, - ) - - assert len(predict_calls) == 2 - assert predict_calls[0]["true_cfg_scale"] == 1.0 - assert predict_calls[1]["true_cfg_scale"] == 2.0 - assert scheduler_calls == [ - (7.0, 7, 0.0, False), - (3.0, 3, 28.0, False), - ] - assert torch.equal(result, torch.full_like(latents, 10.0)) diff --git a/tests/diffusion/models/wan2_2/test_wan22_pipeline_helpers.py b/tests/diffusion/models/wan2_2/test_wan22_pipeline_helpers.py deleted file mode 100644 index 31471786976..00000000000 --- a/tests/diffusion/models/wan2_2/test_wan22_pipeline_helpers.py +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import json -from types import SimpleNamespace - -import pytest -import torch - -import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 as wan22_module -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( - create_transformer_from_config, - load_transformer_config, - retrieve_latents, -) - -pytestmark = [pytest.mark.core_model, 
pytest.mark.cpu, pytest.mark.diffusion] - - -class _LatentDist: - def sample(self, generator): - assert isinstance(generator, torch.Generator) - return torch.tensor([1.0]) - - def mode(self): - return torch.tensor([2.0]) - - -def test_retrieve_latents_supports_sample_mode_argmax_and_direct_latents() -> None: - generator = torch.Generator(device="cpu") - - assert retrieve_latents(SimpleNamespace(latent_dist=_LatentDist()), generator).item() == 1.0 - assert retrieve_latents(SimpleNamespace(latent_dist=_LatentDist()), sample_mode="argmax").item() == 2.0 - torch.testing.assert_close(retrieve_latents(SimpleNamespace(latents=torch.tensor([3.0]))), torch.tensor([3.0])) - - -def test_retrieve_latents_rejects_unknown_encoder_output() -> None: - with pytest.raises(AttributeError, match="Could not access latents"): - retrieve_latents(SimpleNamespace()) - - -def test_load_transformer_config_reads_local_subfolder_config(tmp_path) -> None: - config_dir = tmp_path / "transformer_2" - config_dir.mkdir(parents=True) - (config_dir / "config.json").write_text(json.dumps({"patch_size": [1, 2, 2], "num_layers": 2})) - - assert load_transformer_config(str(tmp_path), "transformer_2") == {"patch_size": [1, 2, 2], "num_layers": 2} - assert load_transformer_config(str(tmp_path), "missing") == {} - - -def test_create_transformer_from_config_maps_supported_keys(monkeypatch) -> None: - captured = {} - - class FakeTransformer: - def __init__(self, **kwargs) -> None: - captured.update(kwargs) - - monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) - - transformer = create_transformer_from_config( - { - "patch_size": [1, 2, 2], - "num_attention_heads": 8, - "attention_head_dim": 128, - "in_channels": 16, - "out_channels": 16, - "text_dim": 4096, - "vace_layers": [0], - "ignored": "value", - } - ) - - assert isinstance(transformer, FakeTransformer) - assert captured == { - "patch_size": (1, 2, 2), - "num_attention_heads": 8, - "attention_head_dim": 128, - "in_channels": 16, 
- "out_channels": 16, - "text_dim": 4096, - } diff --git a/tests/diffusion/models/wan2_2/test_wan22_ti2v_pipeline.py b/tests/diffusion/models/wan2_2/test_wan22_ti2v_pipeline.py deleted file mode 100644 index e611c37b6ad..00000000000 --- a/tests/diffusion/models/wan2_2/test_wan22_ti2v_pipeline.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from types import SimpleNamespace - -import pytest -import torch -from PIL import Image -from torch import nn - -from tests.diffusion.models.wan2_2.conftest import StubTransformer, StubVAE, noop_progress_bar -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_ti2v import ( - Wan22TI2VPipeline, - get_wan22_ti2v_pre_process_func, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] - - -def _make_ti2v_pipeline() -> Wan22TI2VPipeline: - pipeline = object.__new__(Wan22TI2VPipeline) - nn.Module.__init__(pipeline) - pipeline.device = torch.device("cpu") - pipeline.transformer = StubTransformer(in_channels=4, out_channels=4) - pipeline.vae = StubVAE(z_dim=4) - pipeline.vae_scale_factor_temporal = 4 - pipeline.vae_scale_factor_spatial = 8 - pipeline.progress_bar = noop_progress_bar - return pipeline - - -def test_ti2v_preprocess_uses_720p_area_for_image_condition() -> None: - preprocess = get_wan22_ti2v_pre_process_func(SimpleNamespace()) - request = SimpleNamespace( - prompts=[{"prompt": "p", "multi_modal_data": {"image": Image.new("RGB", (320, 160), "blue")}}], - sampling_params=SimpleNamespace(height=None, width=None), - ) - - result = preprocess(request) - - assert result.sampling_params.height == 672 - assert result.sampling_params.width == 1344 - assert result.prompts[0]["multi_modal_data"]["image"].size == (1344, 672) - - -def test_ti2v_diffuse_without_image_condition_expands_patch_timesteps() -> None: - pipeline = _make_ti2v_pipeline() - latents = torch.zeros(1, 4, 2, 4, 4) - calls = [] - - def 
fake_predict_noise_maybe_with_cfg(**kwargs): - calls.append(kwargs) - return torch.ones_like(latents) - - pipeline.predict_noise_maybe_with_cfg = fake_predict_noise_maybe_with_cfg # type: ignore[method-assign] - pipeline.scheduler_step_maybe_with_cfg = lambda noise, t, current, cfg: current + noise # type: ignore[method-assign] - - result = pipeline.diffuse( - latents=latents, - timesteps=torch.tensor([7]), - prompt_embeds=torch.zeros(1, 2, 3), - negative_prompt_embeds=torch.zeros(1, 2, 3), - guidance_scale=3.0, - dtype=torch.float32, - attention_kwargs={"a": "b"}, - num_latent_frames=2, - latent_height=4, - latent_width=4, - ) - - positive = calls[0]["positive_kwargs"] - assert calls[0]["do_true_cfg"] is True - assert positive["timestep"].shape == (1, 8) - torch.testing.assert_close(positive["timestep"], torch.full((1, 8), 7, dtype=positive["timestep"].dtype)) - torch.testing.assert_close(positive["hidden_states"], latents) - torch.testing.assert_close(result, torch.ones_like(latents)) - - -def test_ti2v_prepare_i2v_latents_encodes_condition_and_masks_first_frame() -> None: - pipeline = _make_ti2v_pipeline() - latents, latent_condition, first_frame_mask = pipeline.prepare_i2v_latents( - image=torch.zeros(1, 3, 16, 16), - batch_size=1, - num_channels_latents=4, - height=16, - width=16, - num_frames=5, - dtype=torch.float32, - device=torch.device("cpu"), - generator=None, - latents=torch.zeros(1, 4, 2, 2, 2), - ) - - torch.testing.assert_close(latents, torch.zeros(1, 4, 2, 2, 2)) - assert latent_condition.shape == (1, 4, 1, 2, 2) - assert first_frame_mask[:, :, 0].sum() == 0 - assert first_frame_mask[:, :, 1].sum() == 4 diff --git a/tests/diffusion/models/wan2_2/test_wan22_vace_pipeline.py b/tests/diffusion/models/wan2_2/test_wan22_vace_pipeline.py deleted file mode 100644 index 9fa9b67c499..00000000000 --- a/tests/diffusion/models/wan2_2/test_wan22_vace_pipeline.py +++ /dev/null @@ -1,137 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project - -from types import SimpleNamespace - -import pytest -import torch -from PIL import Image -from torch import nn - -from tests.diffusion.models.wan2_2.conftest import StubTransformer, StubVAE, noop_progress_bar -from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace import ( - Wan22VACEPipeline, - create_vace_transformer_from_config, - get_wan22_vace_pre_process_func, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] - - -def _make_vace_pipeline() -> Wan22VACEPipeline: - pipeline = object.__new__(Wan22VACEPipeline) - nn.Module.__init__(pipeline) - pipeline.device = torch.device("cpu") - pipeline.transformer = StubTransformer(in_channels=4, out_channels=4) - pipeline.transformer_config = pipeline.transformer.config - pipeline.vae = StubVAE(z_dim=4) - pipeline.vae_scale_factor_temporal = 4 - pipeline.vae_scale_factor_spatial = 8 - pipeline.progress_bar = noop_progress_bar - return pipeline - - -def test_vace_preprocess_collects_reference_video_and_mask_inputs() -> None: - preprocess = get_wan22_vace_pre_process_func(SimpleNamespace()) - ref = Image.new("RGB", (320, 160), "green") - frame = Image.new("RGB", (64, 64), "black") - mask = Image.new("L", (64, 64), 255) - request = SimpleNamespace( - prompts=[ - { - "prompt": "p", - "multi_modal_data": { - "image": ref, - "video": [frame], - "mask": mask, - }, - } - ], - sampling_params=SimpleNamespace(height=None, width=None), - ) - - result = preprocess(request) - additional_info = result.prompts[0]["additional_information"] - - assert result.sampling_params.height == 432 - assert result.sampling_params.width == 880 - assert additional_info["reference_images"] == [ref] - assert additional_info["source_video"] == [frame] - assert additional_info["mask"] == [mask] - - -def test_create_vace_transformer_from_config_maps_vace_specific_keys(monkeypatch) -> None: - captured = {} - - class FakeVACETransformer: - def __init__(self, **kwargs) -> None: 
- captured.update(kwargs) - - monkeypatch.setattr( - "vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace.WanVACETransformer3DModel", - FakeVACETransformer, - ) - - transformer = create_vace_transformer_from_config( - { - "patch_size": [1, 2, 2], - "in_channels": 96, - "out_channels": 16, - "vace_layers": [0, 1, 2], - "vace_in_channels": 132, - "unknown": "ignored", - } - ) - - assert isinstance(transformer, FakeVACETransformer) - assert captured == { - "patch_size": (1, 2, 2), - "in_channels": 96, - "out_channels": 16, - "vace_layers": [0, 1, 2], - "vace_in_channels": 132, - } - - -def test_vace_prepare_masks_encodes_spatial_stride_and_reference_padding() -> None: - pipeline = _make_vace_pipeline() - mask = torch.ones(1, 3, 5, 16, 16) - reference_images = [[torch.zeros(3, 16, 16), torch.zeros(3, 16, 16)]] - - encoded = pipeline.prepare_masks(mask, reference_images) - - assert encoded.shape == (1, 64, 4, 2, 2) - torch.testing.assert_close(encoded[:, :, :2], torch.zeros(1, 64, 2, 2, 2)) - torch.testing.assert_close(encoded[:, :, 2:], torch.ones(1, 64, 2, 2, 2)) - - -def test_vace_diffuse_passes_context_and_scale_to_cfg_branches() -> None: - pipeline = _make_vace_pipeline() - latents = torch.zeros(1, 4, 1, 2, 2) - vace_context = torch.ones(1, 12, 1, 2, 2) - calls = [] - - def fake_predict_noise_maybe_with_cfg(**kwargs): - calls.append(kwargs) - return torch.ones_like(latents) - - pipeline.predict_noise_maybe_with_cfg = fake_predict_noise_maybe_with_cfg # type: ignore[method-assign] - pipeline.scheduler_step_maybe_with_cfg = lambda noise, t, current, cfg: current + noise # type: ignore[method-assign] - - result = pipeline.diffuse( - latents=latents, - timesteps=torch.tensor([5]), - prompt_embeds=torch.zeros(1, 2, 3), - negative_prompt_embeds=torch.zeros(1, 2, 3), - guidance_scale=4.0, - dtype=torch.float32, - attention_kwargs={}, - vace_context=vace_context, - vace_context_scale=0.75, - ) - - assert calls[0]["do_true_cfg"] is True - assert 
calls[0]["true_cfg_scale"] == 4.0 - assert calls[0]["positive_kwargs"]["vace_context"] is vace_context - assert calls[0]["negative_kwargs"]["vace_context_scale"] == 0.75 - torch.testing.assert_close(result, torch.ones_like(latents)) diff --git a/tests/diffusion/offloader/test_layerwise_backend.py b/tests/diffusion/offloader/test_layerwise_backend.py index 5fd80e75c22..7df3c1bb1a1 100644 --- a/tests/diffusion/offloader/test_layerwise_backend.py +++ b/tests/diffusion/offloader/test_layerwise_backend.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for LayerwiseOffloadHook and LayerWiseOffloadBackend utilities.""" +"""Unit tests for LayerwiseOffloadHook.""" import gc import os @@ -15,7 +15,7 @@ from torch.distributed.tensor import DeviceMesh, DTensor, Replicate import vllm_omni.diffusion.offloader.layerwise_backend as layerwise_backend_module -from vllm_omni.diffusion.offloader.layerwise_backend import LayerWiseOffloadBackend, LayerwiseOffloadHook +from vllm_omni.diffusion.offloader.layerwise_backend import LayerwiseOffloadHook from vllm_omni.platforms import current_omni_platform pytestmark = [pytest.mark.diffusion, pytest.mark.cpu, pytest.mark.core_model] @@ -127,116 +127,3 @@ def test_dtensor_wrapper_is_preserved_across_prefetch_and_offload(self, dist_gro assert current_block.weight.to_local().is_meta assert current_block.weight.to_local().shape == torch.Size([4]) assert not hook.is_materialized - - -class _DummyBlock(nn.Module): - def __init__(self): - super().__init__() - self.weight = nn.Parameter(torch.randn(10, 10)) - - -class _SingleBlockModel(nn.Module): - _layerwise_offload_blocks_attrs = ["blocks"] - - def __init__(self, num_blocks: int = 3): - super().__init__() - self.blocks = nn.ModuleList([_DummyBlock() for _ in range(num_blocks)]) - - -class _MultiBlockModel(nn.Module): - _layerwise_offload_blocks_attrs = ["transformer_blocks", "single_transformer_blocks"] - - 
def __init__(self, num_transformer: int = 2, num_single: int = 2): - super().__init__() - self.transformer_blocks = nn.ModuleList([_DummyBlock() for _ in range(num_transformer)]) - self.single_transformer_blocks = nn.ModuleList([_DummyBlock() for _ in range(num_single)]) - - -class _EmptyBlocksModel(nn.Module): - _layerwise_offload_blocks_attrs = ["blocks"] - - def __init__(self): - super().__init__() - self.blocks = nn.ModuleList([]) - - -class _InvalidAttrModel(nn.Module): - _layerwise_offload_blocks_attrs = ["nonexistent_blocks", "blocks"] - - def __init__(self, num_blocks: int = 2): - super().__init__() - self.blocks = nn.ModuleList([_DummyBlock() for _ in range(num_blocks)]) - - -class _DeprecatedSingleAttrModel(nn.Module): - _layerwise_offload_blocks_attr = "blocks" - - def __init__(self, num_blocks: int = 2): - super().__init__() - self.blocks = nn.ModuleList([_DummyBlock() for _ in range(num_blocks)]) - - -class _NoAttrsModel(nn.Module): - def __init__(self, num_blocks: int = 2): - super().__init__() - self.blocks = nn.ModuleList([_DummyBlock() for _ in range(num_blocks)]) - - -class TestGetBlocksFromDit: - def test_get_blocks_from_dit_single_block_attr(self): - model = _SingleBlockModel(num_blocks=3) - attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) - assert attr_names == ["blocks"] - assert len(blocks) == 3 - assert all(isinstance(b, _DummyBlock) for b in blocks) - - def test_get_blocks_from_dit_multi_block_attrs(self): - model = _MultiBlockModel(num_transformer=2, num_single=3) - attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) - assert set(attr_names) == {"transformer_blocks", "single_transformer_blocks"} - assert len(blocks) == 5 - assert all(isinstance(b, _DummyBlock) for b in blocks) - - def test_get_blocks_from_dit_empty_blocks(self): - model = _EmptyBlocksModel() - attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) - assert attr_names == [] - assert blocks == [] - - def 
test_get_blocks_from_dit_invalid_attr_name(self): - model = _InvalidAttrModel(num_blocks=2) - with pytest.raises( - AttributeError, - match="Attribute 'nonexistent_blocks' declared in _layerwise_offload_blocks_attrs does not exist", - ): - LayerWiseOffloadBackend.get_blocks_from_dit(model) - - def test_get_blocks_from_dit_no_attrs_defined(self): - model = _NoAttrsModel(num_blocks=3) - attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) - assert attr_names == [] - assert blocks == [] - - def test_get_blocks_from_dit_deprecated_single_attr(self): - model = _DeprecatedSingleAttrModel(num_blocks=2) - attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) - assert attr_names == ["blocks"] - assert len(blocks) == 2 - - -class TestGetBlocksAttrNames: - def test_get_blocks_attr_names_new_format(self): - model = _MultiBlockModel() - attrs = LayerWiseOffloadBackend.get_blocks_attr_names(model) - assert attrs == ["transformer_blocks", "single_transformer_blocks"] - - def test_get_blocks_attr_names_no_attrs(self): - model = _NoAttrsModel() - attrs = LayerWiseOffloadBackend.get_blocks_attr_names(model) - assert attrs == [] - - def test_set_blocks_attr_names(self): - model = _NoAttrsModel() - LayerWiseOffloadBackend.set_blocks_attr_names(model, ["new_blocks"]) - assert hasattr(model.__class__, "_layerwise_offload_blocks_attrs") - assert model.__class__._layerwise_offload_blocks_attrs == ["new_blocks"] diff --git a/tests/diffusion/offloader/test_module_collector.py b/tests/diffusion/offloader/test_module_collector.py deleted file mode 100644 index ab15ad8df60..00000000000 --- a/tests/diffusion/offloader/test_module_collector.py +++ /dev/null @@ -1,240 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Unit tests for ModuleDiscovery and SupportsModuleOffload.""" - -from typing import ClassVar - -import pytest -from torch import nn - -from 
vllm_omni.diffusion.models.interface import SupportsModuleOffload -from vllm_omni.diffusion.offloader.module_collector import ModuleDiscovery - -pytestmark = [pytest.mark.diffusion, pytest.mark.cpu, pytest.mark.core_model] - -# NOTE: tests for skipped/warned attributes verify the *behavioral* -# outcome (attribute excluded from results) but do not assert on log -# output. vllm's logger sets propagate=False, preventing caplog from -# capturing records. See https://github.com/pytest-dev/pytest/issues/3697 - - -# --------------------------------------------------------------------------- -# Test pipelines -# --------------------------------------------------------------------------- - - -class FallbackPipeline(nn.Module): - """Pipeline with standard attribute names (no protocol).""" - - def __init__(self): - super().__init__() - self.transformer = nn.Linear(10, 10) - self.text_encoder = nn.Linear(10, 10) - self.text_encoder_2 = nn.Linear(10, 10) - self.vae = nn.Linear(10, 10) - - -class NonModuleAttrPipeline(nn.Module): - """Pipeline where an attribute is not an nn.Module (fallback path).""" - - def __init__(self): - super().__init__() - self.transformer = nn.Linear(10, 10) - self.text_encoder = "not_a_module" - self.vae = nn.Linear(10, 10) - - -class DuplicateAttrPipeline(nn.Module): - """Pipeline where two encoder attrs point to the same module.""" - - def __init__(self): - super().__init__() - self.transformer = nn.Linear(10, 10) - encoder = nn.Linear(10, 10) - self.text_encoder = encoder - self.text_encoder_2 = encoder - self.vae = nn.Linear(10, 10) - - -class ProtocolPipeline(nn.Module, SupportsModuleOffload): - """Pipeline with non-standard names, using the protocol.""" - - _dit_modules: ClassVar[list[str]] = ["gen_transformer"] - _encoder_modules: ClassVar[list[str]] = ["mllm", "vision_model"] - _vae_modules: ClassVar[list[str]] = ["gen_vae"] - - def __init__(self): - super().__init__() - self.gen_transformer = nn.Linear(10, 10) - self.mllm = nn.Linear(10, 10) 
- self.vision_model = nn.Linear(10, 10) - self.gen_vae = nn.Linear(10, 10) - # Standard name present but NOT declared — should be ignored - self.transformer = nn.Linear(10, 10) - - -class MissingAttrPipeline(nn.Module, SupportsModuleOffload): - """Pipeline that declares a non-existent attribute.""" - - _dit_modules: ClassVar[list[str]] = ["transformer"] - _encoder_modules: ClassVar[list[str]] = ["nonexistent_encoder"] - _vae_modules: ClassVar[list[str]] = ["vae"] - - def __init__(self): - super().__init__() - self.transformer = nn.Linear(10, 10) - self.vae = nn.Linear(10, 10) - - -class MissingIntermediatePipeline(nn.Module, SupportsModuleOffload): - """Pipeline with dotted path referencing non-existent intermediate.""" - - _dit_modules: ClassVar[list[str]] = ["nonexistent.transformer"] - _encoder_modules: ClassVar[list[str]] = [] - _vae_modules: ClassVar[list[str]] = [] - - def __init__(self): - super().__init__() - - -class NestedPipeline(nn.Module, SupportsModuleOffload): - """Pipeline with nested modules accessed via dotted paths.""" - - _dit_modules: ClassVar[list[str]] = ["pipe.transformer"] - _encoder_modules: ClassVar[list[str]] = ["pipe.text_encoder"] - _vae_modules: ClassVar[list[str]] = ["vae"] - - def __init__(self): - super().__init__() - self.pipe = nn.Module() - self.pipe.transformer = nn.Linear(10, 10) - self.pipe.text_encoder = nn.Linear(10, 10) - self.vae = nn.Linear(10, 10) - - -class ResidentPipeline(nn.Module, SupportsModuleOffload): - """Pipeline with resident modules that must stay on GPU.""" - - _dit_modules: ClassVar[list[str]] = ["language_model.model"] - _encoder_modules: ClassVar[list[str]] = [] - _vae_modules: ClassVar[list[str]] = ["vae"] - _resident_modules: ClassVar[list[str]] = [ - "bagel.time_embedder", - "bagel.vae2llm", - ] - - def __init__(self): - super().__init__() - self.language_model = nn.Module() - self.language_model.model = nn.Linear(10, 10) - self.bagel = nn.Module() - self.bagel.time_embedder = nn.Linear(10, 10) - 
self.bagel.vae2llm = nn.Linear(10, 10) - self.vae = nn.Linear(10, 10) - - -class MultiVaePipeline(nn.Module, SupportsModuleOffload): - """Pipeline with multiple VAEs.""" - - _dit_modules: ClassVar[list[str]] = ["transformer"] - _encoder_modules: ClassVar[list[str]] = ["text_encoder"] - _vae_modules: ClassVar[list[str]] = ["vae", "audio_vae"] - - def __init__(self): - super().__init__() - self.transformer = nn.Linear(10, 10) - self.text_encoder = nn.Linear(10, 10) - self.vae = nn.Linear(10, 10) - self.audio_vae = nn.Linear(10, 10) - - -# --------------------------------------------------------------------------- -# Tests -# --------------------------------------------------------------------------- - - -class TestFallbackDiscovery: - """Test the fallback attribute scan (no SupportsModuleOffload).""" - - def test_discovers_standard_attrs(self): - pipeline = FallbackPipeline() - result = ModuleDiscovery.discover(pipeline) - - assert not isinstance(pipeline, SupportsModuleOffload) - assert result.dit_names == ["transformer"] - assert result.dits[0] is pipeline.transformer - assert result.encoder_names == ["text_encoder", "text_encoder_2"] - assert result.vaes[0] is pipeline.vae - assert result.resident_modules == [] - - def test_deduplicates_encoders(self): - pipeline = DuplicateAttrPipeline() - result = ModuleDiscovery.discover(pipeline) - - assert len(result.encoders) == 1 - assert result.encoder_names == ["text_encoder"] - - def test_skips_non_module_attr(self): - pipeline = NonModuleAttrPipeline() - result = ModuleDiscovery.discover(pipeline) - - assert len(result.encoders) == 0 - - -class TestProtocolDiscovery: - """Test discovery via SupportsModuleOffload protocol.""" - - def test_discovers_declared_attrs_and_ignores_undeclared(self): - pipeline = ProtocolPipeline() - result = ModuleDiscovery.discover(pipeline) - - assert isinstance(pipeline, SupportsModuleOffload) - assert result.dit_names == ["gen_transformer"] - assert result.encoder_names == ["mllm", 
"vision_model"] - assert len(result.vaes) == 1 - # self.transformer exists but is NOT in _dit_modules - assert "transformer" not in result.dit_names - # No _resident_modules declared — defaults to empty - assert result.resident_modules == [] - - def test_skips_missing_attr(self): - pipeline = MissingAttrPipeline() - result = ModuleDiscovery.discover(pipeline) - - assert len(result.encoders) == 0 - - def test_skips_missing_intermediate(self): - result = ModuleDiscovery.discover(MissingIntermediatePipeline()) - - assert len(result.dits) == 0 - - def test_dotted_path_resolves_nested_modules(self): - pipeline = NestedPipeline() - result = ModuleDiscovery.discover(pipeline) - - assert result.dit_names == ["pipe.transformer"] - assert result.dits[0] is pipeline.pipe.transformer - assert result.encoder_names == ["pipe.text_encoder"] - assert result.encoders[0] is pipeline.pipe.text_encoder - assert result.vaes[0] is pipeline.vae - - def test_resident_modules(self): - pipeline = ResidentPipeline() - result = ModuleDiscovery.discover(pipeline) - - assert result.resident_names == [ - "bagel.time_embedder", - "bagel.vae2llm", - ] - assert result.resident_modules[0] is pipeline.bagel.time_embedder - assert result.resident_modules[1] is pipeline.bagel.vae2llm - assert result.dits[0] is pipeline.language_model.model - - def test_multiple_vaes(self): - pipeline = MultiVaePipeline() - result = ModuleDiscovery.discover(pipeline) - - assert len(result.vaes) == 2 - assert result.vaes[0] is pipeline.vae - assert result.vaes[1] is pipeline.audio_vae diff --git a/tests/diffusion/offloader/test_sequential_backend.py b/tests/diffusion/offloader/test_sequential_backend.py index 2539cc06895..d18637a780e 100644 --- a/tests/diffusion/offloader/test_sequential_backend.py +++ b/tests/diffusion/offloader/test_sequential_backend.py @@ -3,6 +3,8 @@ """Unit tests for SequentialOffloadBackend.""" +from unittest.mock import patch + import pytest import torch from torch import nn @@ -42,7 +44,7 @@ def 
mock(self): class TestMoveParamsPinMemory: - def test_dtensor_skips_pin_memory(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): + def test_dtensor_skips_pin_memory(self, accelerator_device): """DTensor should skip pin_memory to avoid RuntimeError.""" module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() @@ -54,73 +56,73 @@ def fake_isinstance(obj, cls): return True return original_isinstance(obj, cls) - monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) - monkeypatch.setattr("builtins.isinstance", fake_isinstance) - hook = SequentialOffloadHook( - offload_targets=[], - device=accelerator_device, - pin_memory=True, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=True, - ) - assert not tracker["called"], "pin_memory should not be called for DTensor" - - def test_regular_tensor_calls_pin_memory(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): + with patch.object(torch.Tensor, "pin_memory", mock_pin): + with patch("builtins.isinstance", fake_isinstance): + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=True, + use_hsdp=False, + ) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=True, + ) + assert not tracker["called"], "pin_memory should not be called for DTensor" + + def test_regular_tensor_calls_pin_memory(self, accelerator_device): """Regular tensor should call pin_memory when moving to CPU.""" module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() - monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) - hook = SequentialOffloadHook( - offload_targets=[], - device=accelerator_device, - pin_memory=True, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=True, - ) - assert tracker["called"], "pin_memory should be called for regular 
tensors" - - def test_pin_memory_skipped_when_disabled(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): + with patch.object(torch.Tensor, "pin_memory", mock_pin): + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=True, + use_hsdp=False, + ) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=True, + ) + assert tracker["called"], "pin_memory should be called for regular tensors" + + def test_pin_memory_skipped_when_disabled(self, accelerator_device): """pin_memory should not be called when pin_memory=False.""" module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() - monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) - hook = SequentialOffloadHook( - offload_targets=[], - device=accelerator_device, - pin_memory=False, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=False, - ) - assert not tracker["called"], "pin_memory should not be called when disabled" - - def test_pin_memory_skipped_for_non_cpu_target(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): + with patch.object(torch.Tensor, "pin_memory", mock_pin): + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=False, + use_hsdp=False, + ) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=False, + ) + assert not tracker["called"], "pin_memory should not be called when disabled" + + def test_pin_memory_skipped_for_non_cpu_target(self, accelerator_device): """pin_memory should not be called for non-CPU targets.""" module = _create_simple_module().to("cpu") tracker, mock_pin = _track_pin_memory_calls() - monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) - hook = SequentialOffloadHook( - offload_targets=[], - device=torch.device("cpu"), - pin_memory=True, - use_hsdp=False, - ) - hook._move_params(module, 
accelerator_device, non_blocking=False, pin_memory=True) - assert not tracker["called"], "pin_memory should not be called for non-CPU target" + with patch.object(torch.Tensor, "pin_memory", mock_pin): + hook = SequentialOffloadHook( + offload_targets=[], + device=torch.device("cpu"), + pin_memory=True, + use_hsdp=False, + ) + hook._move_params(module, accelerator_device, non_blocking=False, pin_memory=True) + assert not tracker["called"], "pin_memory should not be called for non-CPU target" diff --git a/tests/diffusion/quantization/test_component_routing.py b/tests/diffusion/quantization/test_component_routing.py deleted file mode 100644 index c8b3837e256..00000000000 --- a/tests/diffusion/quantization/test_component_routing.py +++ /dev/null @@ -1,408 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for component routing for quantization.""" - -from unittest.mock import MagicMock - -import pytest -import torch -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, -) -from vllm.model_executor.models.utils import WeightsMapper - -from vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker import ( - PRE_QUANTIZED_METHODS, -) -from vllm_omni.quantization.component_config import ( - ComponentQuantizationConfig, -) -from vllm_omni.quantization.inc_config import OmniINCConfig - -pytestmark = [pytest.mark.core_model] - - -# --------------------------------------------------------------------------- -# Helpers: lightweight mock quant configs -# --------------------------------------------------------------------------- - - -class _MockQuantConfig(QuantizationConfig): - """Minimal mock that only implements get_name().""" - - def __init__(self, name: str, **attrs): - self._name = name - for k, v in attrs.items(): - setattr(self, k, v) - - def get_name(self) -> str: - return self._name - - def get_quant_method(self, layer, prefix): - return MagicMock() - 
- @classmethod - def get_supported_act_dtypes(cls): - return [torch.bfloat16, torch.float16] - - def get_min_capability(self): - return 0 - - @classmethod - def from_config(cls, config): - raise NotImplementedError - - def get_config_filenames(self): - return [] - - -def _make_inc_config(block_names="thinker.model.layers,talker.model.layers", extra_config=None): - """Create a real OmniINCConfig with block_name_to_quantize.""" - return OmniINCConfig( - weight_bits=4, - group_size=128, - sym=True, - block_name_to_quantize=block_names, - extra_config=extra_config or {}, - ) - - -THINKER_MAPPER = WeightsMapper( - orig_to_new_prefix={ - "thinker.lm_head.": "language_model.lm_head.", - "thinker.model.": "language_model.model.", - "thinker.": "", - } -) - -TALKER_MAPPER = WeightsMapper( - orig_to_new_prefix={ - "talker.codec_head.": "language_model.lm_head.", - "talker.model.": "language_model.model.", - "talker.thinker_to_talker_proj.": "thinker_to_talker_proj.", - "talker.": "", - } -) - - -# =================================================================== -# 1. 
OmniINCConfig.apply_vllm_mapper -# =================================================================== - - -class TestApplyVllmMapper: - def test_inc_csv_string_normalized_to_list(self): - """CSV string block_name_to_quantize is split into a list.""" - cfg = _make_inc_config("thinker.model.layers,talker.model.layers") - cfg.apply_vllm_mapper(THINKER_MAPPER) - assert isinstance(cfg.block_name_to_quantize, list) - - def test_thinker_blocks_remapped(self): - """thinker.model.layers -> language_model.model.layers after apply_vllm_mapper.""" - cfg = _make_inc_config("thinker.model.layers,talker.model.layers") - cfg.apply_vllm_mapper(THINKER_MAPPER) - assert any("language_model.model.layers" in b for b in cfg.block_name_to_quantize) - - def test_cross_stage_blocks_kept_unchanged(self): - """Blocks not matching any mapper prefix are kept unchanged (harmless).""" - cfg = _make_inc_config("thinker.model.layers,talker.model.layers") - cfg.apply_vllm_mapper(THINKER_MAPPER) - # talker.model.layers doesn't match any thinker mapper prefix → stays as-is - assert "talker.model.layers" in cfg.block_name_to_quantize - - def test_talker_remap(self): - """talker.model.layers -> language_model.model.layers with talker mapper.""" - cfg = _make_inc_config("thinker.model.layers,talker.model.layers") - cfg.apply_vllm_mapper(TALKER_MAPPER) - assert any("language_model.model.layers" in b for b in cfg.block_name_to_quantize) - # thinker.model.layers doesn't match talker mapper → stays as-is - assert "thinker.model.layers" in cfg.block_name_to_quantize - - def test_extra_config_keys_remapped(self): - """Regex keys in extra_config get their escaped-dot prefixes remapped.""" - extra = { - r".*thinker\.model\.layers\.0\.mlp\.gate.*": {"bits": 16, "data_type": "float"}, - } - cfg = _make_inc_config("thinker.model.layers", extra_config=extra) - cfg.apply_vllm_mapper(THINKER_MAPPER) - # The key should now reference the vLLM runtime path - assert any("language_model" in k for k in cfg.extra_config) - 
# Original thinker\.model prefix should be replaced - assert not any(r"thinker\.model" in k for k in cfg.extra_config) - - def test_single_block_name(self): - """Only one block name (not CSV) still works.""" - cfg = _make_inc_config("thinker.model.layers") - cfg.apply_vllm_mapper(THINKER_MAPPER) - assert any("language_model.model.layers" in b for b in cfg.block_name_to_quantize) - - def test_already_list_block_names(self): - """block_name_to_quantize already a list (not CSV string) works.""" - cfg = _make_inc_config(["thinker.model.layers", "talker.model.layers"]) - cfg.apply_vllm_mapper(THINKER_MAPPER) - assert isinstance(cfg.block_name_to_quantize, list) - assert any("language_model.model.layers" in b for b in cfg.block_name_to_quantize) - - def test_mutates_in_place(self): - """apply_vllm_mapper mutates the config in place (same as upstream INCConfig).""" - cfg = _make_inc_config("thinker.model.layers") - original_id = id(cfg) - cfg.apply_vllm_mapper(THINKER_MAPPER) - assert id(cfg) == original_id - - # -- Stage prefix tests (runtime prefix = container + internal name) -- - - def test_thinker_block_has_stage_prefix(self): - """Mapped block name must start with 'thinker.' so runtime startswith() works.""" - cfg = _make_inc_config("thinker.model.layers,talker.model.layers") - cfg.apply_vllm_mapper(THINKER_MAPPER) - assert "thinker.language_model.model.layers" in cfg.block_name_to_quantize - - def test_talker_block_has_stage_prefix(self): - """Mapped block name must start with 'talker.' 
so runtime startswith() works.""" - cfg = _make_inc_config("thinker.model.layers,talker.model.layers") - cfg.apply_vllm_mapper(TALKER_MAPPER) - assert "talker.language_model.model.layers" in cfg.block_name_to_quantize - - def test_thinker_block_matches_runtime_prefix(self): - """Simulates get_layer_config's startswith() check for FusedMoE layers.""" - cfg = _make_inc_config("thinker.model.layers,talker.model.layers") - cfg.apply_vllm_mapper(THINKER_MAPPER) - runtime_prefix = "thinker.language_model.model.layers.0.mlp.experts" - assert any(runtime_prefix.startswith(b) for b in cfg.block_name_to_quantize) - - def test_talker_block_matches_runtime_prefix(self): - """Simulates get_layer_config's startswith() check for talker FusedMoE.""" - cfg = _make_inc_config("thinker.model.layers,talker.model.layers") - cfg.apply_vllm_mapper(TALKER_MAPPER) - runtime_prefix = "talker.language_model.model.layers.0.mlp.experts" - assert any(runtime_prefix.startswith(b) for b in cfg.block_name_to_quantize) - - def test_extra_config_plain_key_has_stage_prefix(self): - """Plain extra_config keys are remapped with stage prefix.""" - extra = { - "talker.model.layers.0.mlp.shared_expert_gate": {"bits": 16}, - } - cfg = _make_inc_config("talker.model.layers", extra_config=extra) - cfg.apply_vllm_mapper(TALKER_MAPPER) - assert "talker.language_model.model.layers.0.mlp.shared_expert_gate" in cfg.extra_config - - def test_extra_config_regex_key_still_works(self): - """Regex extra_config keys use re.search so no stage prefix needed.""" - import re - - extra = { - r".*thinker\.model\.layers\.0\.mlp\.gate.*": {"bits": 16}, - } - cfg = _make_inc_config("thinker.model.layers", extra_config=extra) - cfg.apply_vllm_mapper(THINKER_MAPPER) - runtime_name = "thinker.language_model.model.layers.0.mlp.gate" - matched = any(re.search(k, runtime_name) for k in cfg.extra_config) - assert matched - - -# =================================================================== -# 2. 
OmniINCConfig upgrade helpers -# =================================================================== - - -class TestOmniINCConfigUpgrade: - def test_maybe_upgrade_none(self): - assert OmniINCConfig.maybe_upgrade(None) is None - - def test_maybe_upgrade_non_inc(self): - """Non-INC configs are passed through unchanged.""" - cfg = _MockQuantConfig("fp8") - assert OmniINCConfig.maybe_upgrade(cfg) is cfg - - def test_maybe_upgrade_already_omni(self): - """Already OmniINCConfig is returned as-is.""" - cfg = _make_inc_config() - assert OmniINCConfig.maybe_upgrade(cfg) is cfg - - def test_maybe_upgrade_vanilla_inc(self): - """Vanilla INCConfig is promoted to OmniINCConfig.""" - from vllm.model_executor.layers.quantization.inc import INCConfig - - vanilla = INCConfig(weight_bits=4, group_size=128, sym=True) - upgraded = OmniINCConfig.maybe_upgrade(vanilla) - assert isinstance(upgraded, OmniINCConfig) - assert upgraded.weight_bits == 4 - assert upgraded.group_size == 128 - - -# =================================================================== -# 2. Three-branch thinker routing (simulated) -# =================================================================== - - -def _simulate_thinker_routing(quant_config): - """Simulate the three-branch routing in thinker __init__. - - Returns (visual_quant_config, language_quant_config, wrapped_vllm_quant). 
- """ - if isinstance(quant_config, ComponentQuantizationConfig): - visual_quant_config = quant_config.resolve("visual") - language_quant_config = quant_config.resolve("language_model") - return visual_quant_config, language_quant_config, quant_config - elif quant_config is not None: - if quant_config.get_name() in PRE_QUANTIZED_METHODS: - return quant_config, quant_config, quant_config - else: - language_quant_config = quant_config - wrapped = ComponentQuantizationConfig( - component_configs={"language_model": quant_config}, - default_config=None, - ) - return None, language_quant_config, wrapped - else: - return None, None, None - - -class TestThinkerRouting: - def test_none(self): - vis, lang, wrapped = _simulate_thinker_routing(None) - assert vis is None - assert lang is None - assert wrapped is None - - @pytest.mark.parametrize("method", ["modelopt", "modelopt_fp4", "modelopt_mxfp8"]) - def test_pre_quantized_all_components(self, method): - """Pre-quantized methods pass config to all components.""" - cfg = _MockQuantConfig(method) - vis, lang, wrapped = _simulate_thinker_routing(cfg) - assert vis is cfg - assert lang is cfg - assert wrapped is cfg - - def test_fp8_dynamic_language_only(self): - """fp8 dynamic: visual=None, language gets original config.""" - cfg = _MockQuantConfig("fp8") - vis, lang, wrapped = _simulate_thinker_routing(cfg) - assert vis is None - assert lang is cfg - assert isinstance(wrapped, ComponentQuantizationConfig) - assert wrapped.resolve("language_model") is cfg - assert wrapped.resolve("visual") is None - - def test_inc_autoround_language_only(self): - """INC/AutoRound: not in _PRE_QUANTIZED_METHODS -> wrapped like fp8.""" - cfg = _MockQuantConfig("inc") - vis, lang, wrapped = _simulate_thinker_routing(cfg) - assert vis is None - assert lang is cfg - assert isinstance(wrapped, ComponentQuantizationConfig) - - def test_component_config_passthrough(self): - """Explicit ComponentQuantizationConfig is used directly.""" - inner_fp8 = 
_MockQuantConfig("fp8") - inner_modelopt = _MockQuantConfig("modelopt") - cqc = ComponentQuantizationConfig( - component_configs={ - "visual": inner_modelopt, - "language_model": inner_fp8, - } - ) - vis, lang, wrapped = _simulate_thinker_routing(cqc) - assert vis is inner_modelopt - assert lang is inner_fp8 - assert wrapped is cqc - - -# =================================================================== -# 3. Talker visual routing (init_multi_modal guard) -# =================================================================== - - -def _simulate_talker_visual_routing(quant_config): - """Simulate the talker init_multi_modal visual routing.""" - if quant_config is not None and quant_config.get_name() in PRE_QUANTIZED_METHODS: - return quant_config - return None - - -class TestTalkerVisualRouting: - def test_none(self): - assert _simulate_talker_visual_routing(None) is None - - @pytest.mark.parametrize("method", ["modelopt", "modelopt_fp4", "modelopt_mxfp8"]) - def test_pre_quantized_passes_through(self, method): - """Pre-quantized methods pass quant config to visual.""" - cfg = _MockQuantConfig(method) - assert _simulate_talker_visual_routing(cfg) is cfg - - def test_fp8_blocked(self): - """fp8 dynamic must NOT be passed to visual.""" - cfg = _MockQuantConfig("fp8") - assert _simulate_talker_visual_routing(cfg) is None - - def test_inc_blocked(self): - """INC/AutoRound must NOT be passed to visual (not in _PRE_QUANTIZED_METHODS).""" - cfg = _MockQuantConfig("inc") - assert _simulate_talker_visual_routing(cfg) is None - - -# =================================================================== -# 4. 
ComponentQuantizationConfig.resolve -# =================================================================== - - -class TestComponentResolve: - def test_longest_prefix_match(self): - a = _MockQuantConfig("a") - b = _MockQuantConfig("b") - cqc = ComponentQuantizationConfig(component_configs={"language_model": a, "language_model.model": b}) - assert cqc.resolve("language_model.model.layers.0") is b - assert cqc.resolve("language_model.lm_head") is a - - def test_no_match_returns_default(self): - a = _MockQuantConfig("a") - default = _MockQuantConfig("default") - cqc = ComponentQuantizationConfig( - component_configs={"language_model": a}, - default_config=default, - ) - assert cqc.resolve("visual") is default - - def test_no_match_no_default_returns_none(self): - a = _MockQuantConfig("a") - cqc = ComponentQuantizationConfig( - component_configs={"language_model": a}, - ) - assert cqc.resolve("visual") is None - - def test_get_name(self): - cqc = ComponentQuantizationConfig(component_configs={}) - assert cqc.get_name() == "component" - - def test_get_quant_method_delegates(self): - """get_quant_method dispatches to the resolved config.""" - inner = _MockQuantConfig("fp8") - cqc = ComponentQuantizationConfig( - component_configs={"language_model": inner}, - ) - layer = MagicMock() - result = cqc.get_quant_method(layer, "language_model.model.layers.0.mlp") - assert result is not None # delegates to inner.get_quant_method - - def test_get_quant_method_returns_none_for_unmatched(self): - """get_quant_method returns None when no config matches.""" - inner = _MockQuantConfig("fp8") - cqc = ComponentQuantizationConfig( - component_configs={"language_model": inner}, - ) - layer = MagicMock() - result = cqc.get_quant_method(layer, "visual.blocks.0.mlp") - assert result is None - - def test_min_capability(self): - a = _MockQuantConfig("a") - a.get_min_capability = lambda: 80 - b = _MockQuantConfig("b") - b.get_min_capability = lambda: 70 - cqc = 
ComponentQuantizationConfig(component_configs={"x": a, "y": b}) - assert cqc.get_min_capability() == 70 - - def test_min_capability_empty(self): - cqc = ComponentQuantizationConfig(component_configs={}) - assert cqc.get_min_capability() == 0 diff --git a/tests/diffusion/quantization/test_fp8_config.py b/tests/diffusion/quantization/test_fp8_config.py index 574af7a6699..9c18c1f551b 100644 --- a/tests/diffusion/quantization/test_fp8_config.py +++ b/tests/diffusion/quantization/test_fp8_config.py @@ -5,7 +5,7 @@ import pytest from torch import nn -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion] def test_build_quant_config_fp8(): diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 875277ece42..d4d5aa5a7fe 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Unit tests for Int8 quantization config.""" +from unittest.mock import MagicMock, patch + import pytest import torch from pytest_mock import MockerFixture @@ -100,7 +102,7 @@ def test_quantization_config_string_and_dict_equivalent(): assert config_str.quantization_config.activation_scheme == config_dict.quantization_config.activation_scheme -def test_get_quant_method(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): +def test_get_quant_method(mocker: MockerFixture): """Test for get_quant_method method for GPU""" from vllm_omni.quantization.int8_config import Int8OnlineLinearMethod @@ -109,16 +111,18 @@ def test_get_quant_method(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch def _fake_init(self, quant_config): pass - layer = mocker.Mock(spec=LinearBase) + layer = MagicMock(spec=LinearBase) mocker.patch.object(Int8OnlineLinearMethod, "__init__", _fake_init) prefix = "test_layer" # Mock the platform 
to be GPU - monkeypatch.setattr(current_omni_platform, "is_cuda", lambda: True) - monkeypatch.setattr(current_omni_platform, "is_npu", lambda: False) - method = config.get_quant_method(layer, prefix) - assert isinstance(method, Int8OnlineLinearMethod) + with ( + patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=True), + patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False), + ): + method = config.get_quant_method(layer, prefix) + assert isinstance(method, Int8OnlineLinearMethod) # Test skipping quantization for a layer config.ignored_layers = [prefix] @@ -126,20 +130,22 @@ def _fake_init(self, quant_config): assert isinstance(method, UnquantizedLinearMethod) -def test_get_npu_quant_method(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): +def test_get_npu_quant_method(): """Test for get_quant_method method for NPU""" from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod config = build_quant_config("int8") - layer = mocker.Mock(spec=LinearBase) + layer = MagicMock(spec=LinearBase) prefix = "test_layer" # Mock the platform to be NPU - monkeypatch.setattr(current_omni_platform, "is_cuda", lambda: False) - monkeypatch.setattr(current_omni_platform, "is_npu", lambda: True) - method = config.get_quant_method(layer, prefix) - assert isinstance(method, NPUInt8OnlineLinearMethod) + with ( + patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=False), + patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True), + ): + method = config.get_quant_method(layer, prefix) + assert isinstance(method, NPUInt8OnlineLinearMethod) # Test skipping quantization for a layer config.ignored_layers = [prefix] @@ -239,7 +245,7 @@ class TestNPUInt8LinearMethod: @pytest.fixture def mock_torch_npu(self, mocker): - torch_npu = mocker.MagicMock() + torch_npu = MagicMock() mocker.patch("vllm_omni.quantization.int8_config.torch_npu", return_value=torch_npu) mocker.patch( diff --git 
a/tests/diffusion/quantization/test_quantization_quality.py b/tests/diffusion/quantization/test_quantization_quality.py index ba6a150c4bb..3d8f1873698 100644 --- a/tests/diffusion/quantization/test_quantization_quality.py +++ b/tests/diffusion/quantization/test_quantization_quality.py @@ -32,7 +32,7 @@ import pytest import torch -from tests.helpers.mark import hardware_marks +from tests.utils import hardware_marks # --------------------------------------------------------------------------- # Configuration — add new quantization methods / models here diff --git a/tests/diffusion/test_diffusers_adapter.py b/tests/diffusion/test_diffusers_adapter.py deleted file mode 100644 index ac2ec2e3fef..00000000000 --- a/tests/diffusion/test_diffusers_adapter.py +++ /dev/null @@ -1,186 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections import namedtuple -from types import SimpleNamespace - -import pytest -import torch -from diffusers import DiffusionPipeline -from PIL import Image - -from vllm_omni.diffusion.data import ( - DiffusionOutput, - DiffusionParallelConfig, - OmniDiffusionConfig, -) -from vllm_omni.diffusion.models.diffusers_adapter import DiffusersAdapterPipeline -from vllm_omni.diffusion.request import OmniDiffusionRequest -from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] - - -def _make_od_config(**overrides) -> OmniDiffusionConfig: - od_config = OmniDiffusionConfig( - model="test/model", - model_class_name="DiffusersAdapterPipeline", - dtype=torch.float16, - diffusion_load_format="diffusers", - diffusers_load_kwargs={}, - diffusers_call_kwargs={}, - output_type="pil", - parallel_config=DiffusionParallelConfig(cfg_parallel_size=1, sequence_parallel_size=1), - cache_backend="none", - ) - for key, value in overrides.items(): - setattr(od_config, key, value) - return od_config - - -def 
_make_request(**overrides) -> OmniDiffusionRequest: - prompt = overrides.pop("prompt", "a test prompt") - negative_prompt = overrides.pop("negative_prompt", None) - prompt_obj: dict[str, str] = {"prompt": prompt} - if negative_prompt is not None: - prompt_obj["negative_prompt"] = negative_prompt - - defaults = { - "prompts": [prompt_obj], - "sampling_params": OmniDiffusionSamplingParams( - num_inference_steps=20, - guidance_scale=7.5, - height=16, - width=16, - num_frames=1, - num_outputs_per_prompt=1, - seed=42, - output_type="pil", - generator_device="cpu", - ), - } - defaults.update(overrides) - return OmniDiffusionRequest(**defaults) - - -class TestDiffusersAdapterPipeline: - def test_adapter_forward_returns_output(self, mocker): - od_config = _make_od_config() - request = _make_request() - stub_image = Image.new("RGB", (request.sampling_params.width, request.sampling_params.height)) # pyright: ignore[reportArgumentType] - - adapter = DiffusersAdapterPipeline(od_config=od_config) - MockPipelineOutput = namedtuple("MockPipelineOutput", ["image"]) - MockPipeline = type("MockPipeline", (DiffusionPipeline,), {}) - adapter._pipeline = MockPipeline() - - mocker.patch.object( - MockPipeline, - "__call__", - return_value=MockPipelineOutput(image=stub_image), - ) - output = adapter.forward(request) - - assert isinstance(output, DiffusionOutput) - assert isinstance(output.output, MockPipelineOutput) - assert output.output.image is stub_image - - @pytest.mark.parametrize( - "feature_id", - ["cfg_parallel", "ulysses", "ring", "teacache", "cache_dit", "enforce_eager", "quantization"], - ) - def test_adapter_guard_unsupported_feature(self, feature_id): - if feature_id == "cfg_parallel": - od_config = _make_od_config( - parallel_config=DiffusionParallelConfig(cfg_parallel_size=2, sequence_parallel_size=1), - cache_backend="none", - ) - elif feature_id == "ulysses": - od_config = _make_od_config( - parallel_config=DiffusionParallelConfig(cfg_parallel_size=1, ulysses_degree=2), 
- cache_backend="none", - ) - elif feature_id == "ring": - od_config = _make_od_config( - parallel_config=DiffusionParallelConfig(cfg_parallel_size=1, ring_degree=2), - cache_backend="none", - ) - elif feature_id == "teacache": - od_config = _make_od_config( - parallel_config=DiffusionParallelConfig(cfg_parallel_size=1, sequence_parallel_size=1), - cache_backend="tea_cache", - ) - elif feature_id == "cache_dit": - od_config = _make_od_config( - parallel_config=DiffusionParallelConfig(cfg_parallel_size=1, sequence_parallel_size=1), - cache_backend="cache_dit", - ) - elif feature_id == "enforce_eager": - od_config = _make_od_config(enforce_eager=True) - elif feature_id == "quantization": - od_config = _make_od_config(quantization_config=SimpleNamespace(quant_method="fp8")) - else: - raise ValueError(f"Unknown feature ID: {feature_id}") - - with pytest.raises(NotImplementedError): - DiffusersAdapterPipeline(od_config=od_config) - - def test_adapter_guard_unknown_output_type(self, mocker): - """Test that the adapter wraps an unknown output type as-is. 
- This is useful when `return_dict=True` and the diffusers pipeline returns an OrderedDict subclass.""" - - adapter = DiffusersAdapterPipeline(od_config=_make_od_config()) - raw_output = {"unexpected": "dict-output"} - - MockPipeline = type("MockPipeline", (DiffusionPipeline,), {}) - adapter._pipeline = MockPipeline() - - mocker.patch.object( - MockPipeline, - "__call__", - return_value=raw_output, - ) - output = adapter.forward(_make_request()) - - assert isinstance(output, DiffusionOutput) - assert output.output == raw_output - - def test_adapter_build_call_kwargs(self): - adapter = DiffusersAdapterPipeline( - od_config=_make_od_config( - diffusers_call_kwargs={ - "guidance_scale": 1.25, - "eta": 0.3, - "output_type": "np", - } - ) - ) - req = _make_request( - prompt="a cat on mars", - negative_prompt="low quality", - sampling_params=OmniDiffusionSamplingParams( - num_inference_steps=9, - guidance_scale=8.0, - height=320, - width=640, - num_frames=8, - num_outputs_per_prompt=2, - seed=123, - output_type="pil", - ), - ) - - kwargs = adapter._build_call_kwargs(req) - - assert kwargs["prompt"] == "a cat on mars" - assert kwargs["negative_prompt"] == "low quality" - assert kwargs["num_inference_steps"] == 9 - assert kwargs["guidance_scale"] == 8.0 - assert kwargs["height"] == 320 - assert kwargs["width"] == 640 - assert kwargs["num_frames"] == 8 - assert kwargs["num_images_per_prompt"] == 2 - assert kwargs["output_type"] == "pil" - assert isinstance(kwargs["generator"], torch.Generator) - assert kwargs["generator"].device.type == "cpu" - assert kwargs["generator"].initial_seed() == 123 diff --git a/tests/diffusion/test_diffusion_model_runner.py b/tests/diffusion/test_diffusion_model_runner.py index b63f6d8887f..88b17147e85 100644 --- a/tests/diffusion/test_diffusion_model_runner.py +++ b/tests/diffusion/test_diffusion_model_runner.py @@ -8,10 +8,9 @@ import torch import vllm_omni.diffusion.worker.diffusion_model_runner as model_runner_module -from tests.helpers.mark 
import hardware_test from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner -pytestmark = [pytest.mark.diffusion] +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] @contextmanager @@ -65,8 +64,6 @@ def _make_runner(cache_backend, cache_backend_name: str, enable_cache_dit_summar return runner -@pytest.mark.core_model -@hardware_test(res={"cuda": "L4"}, num_cards=1) def test_execute_model_skips_cache_summary_without_active_cache_backend(monkeypatch): """Guard cache diagnostics with runtime backend state to avoid stale-config crashes.""" runner = _make_runner(cache_backend=None, cache_backend_name="cache_dit") @@ -87,8 +84,6 @@ def test_execute_model_skips_cache_summary_without_active_cache_backend(monkeypa assert cache_summary_calls == [] -@pytest.mark.core_model -@hardware_test(res={"cuda": "L4"}, num_cards=1) def test_execute_model_emits_cache_summary_with_active_cache_dit_backend(monkeypatch): class _EnabledCacheBackend: def is_enabled(self): @@ -112,8 +107,6 @@ def is_enabled(self): assert cache_summary_calls == [(runner.pipeline, True)] -@pytest.mark.core_model -@pytest.mark.cpu def test_load_model_clears_cache_backend_for_unsupported_pipeline(monkeypatch): class _DummyLoader: def __init__(self, load_config, od_config=None): diff --git a/tests/diffusion/test_diffusion_scheduler.py b/tests/diffusion/test_diffusion_scheduler.py index a64d9920e03..4324ba1e630 100644 --- a/tests/diffusion/test_diffusion_scheduler.py +++ b/tests/diffusion/test_diffusion_scheduler.py @@ -4,10 +4,10 @@ import queue import threading from types import SimpleNamespace +from unittest.mock import Mock, patch import pytest import torch -from pytest_mock import MockerFixture from vllm_omni.diffusion.data import DiffusionOutput, DiffusionRequestAbortedError from vllm_omni.diffusion.diffusion_engine import DiffusionEngine @@ -97,19 +97,19 @@ def initialize(self, od_config) -> None: def add_request(self, request: OmniDiffusionRequest) -> 
str: assert request is self._request - self._state = SimpleNamespace(sched_req_id=self._sched_req_id, req=request) + self._state = Mock(sched_req_id=self._sched_req_id, req=request) return self._sched_req_id def schedule(self): if self._scheduled or self._state is None: - return SimpleNamespace( + return Mock( scheduled_new_reqs=[], scheduled_cached_reqs=CachedRequestData.make_empty(), scheduled_req_ids=[], is_empty=True, ) self._scheduled = True - return SimpleNamespace( + return Mock( scheduled_new_reqs=[NewRequestData.from_state(self._state)], scheduled_cached_reqs=CachedRequestData.make_empty(), scheduled_req_ids=[self._state.sched_req_id], @@ -153,7 +153,7 @@ def close(self) -> None: class TestRequestScheduler: def setup_method(self) -> None: self.scheduler: RequestScheduler = RequestScheduler() - self.scheduler.initialize(SimpleNamespace()) + self.scheduler.initialize(Mock()) def test_single_request_success_lifecycle(self) -> None: req_id = self.scheduler.add_request(_make_request("a")) @@ -276,23 +276,23 @@ def test_request_id_mapping_lifecycle(self) -> None: class TestDiffusionEngine: - def test_add_req_and_wait_for_response_single_path(self, mocker: MockerFixture) -> None: + def test_add_req_and_wait_for_response_single_path(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(SimpleNamespace()) + engine.scheduler.initialize(Mock()) engine._rpc_lock = threading.RLock() engine.abort_queue = queue.Queue() request = _make_request("engine") runner_output = _make_request_output("engine") - engine.execute_fn = mocker.Mock(return_value=runner_output) + engine.execute_fn = Mock(return_value=runner_output) output = engine.add_req_and_wait_for_response(request) assert output is runner_output.result engine.execute_fn.assert_called_once() - def test_supports_scheduler_interface_injection(self, mocker: MockerFixture) -> None: + def test_supports_scheduler_interface_injection(self) -> None: 
request = _make_request("engine_iface") runner_output = _make_request_output("engine_iface") scheduler = _StubScheduler(request, runner_output) @@ -301,45 +301,33 @@ def test_supports_scheduler_interface_injection(self, mocker: MockerFixture) -> engine.scheduler = scheduler engine._rpc_lock = threading.RLock() engine.abort_queue = queue.Queue() - engine.execute_fn = mocker.Mock(return_value=runner_output) + engine.execute_fn = Mock(return_value=runner_output) output = engine.add_req_and_wait_for_response(request) assert output is runner_output.result engine.execute_fn.assert_called_once() - def test_initializes_injected_scheduler( - self, - monkeypatch: pytest.MonkeyPatch, - mocker: MockerFixture, - ) -> None: + def test_initializes_injected_scheduler(self) -> None: request = _make_request("init") scheduler = _StubScheduler(request, DiffusionOutput(output=None)) - od_config = SimpleNamespace(model_class_name="mock_model") - fake_executor_cls = mocker.Mock(return_value=mocker.Mock()) + od_config = Mock(model_class_name="mock_model") + fake_executor_cls = Mock(return_value=Mock()) - monkeypatch.setattr( - "vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", - lambda *args, **kwargs: None, - ) - monkeypatch.setattr( - "vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", - lambda *args, **kwargs: None, - ) - monkeypatch.setattr( - "vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", - lambda *args, **kwargs: fake_executor_cls, - ) - monkeypatch.setattr(DiffusionEngine, "_dummy_run", lambda self: None) - - DiffusionEngine(od_config, scheduler=scheduler) + with ( + patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", return_value=None), + patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", return_value=None), + patch("vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", return_value=fake_executor_cls), + patch.object(DiffusionEngine, "_dummy_run", 
return_value=None), + ): + DiffusionEngine(od_config, scheduler=scheduler) assert scheduler.initialized_with is od_config fake_executor_cls.assert_called_once_with(od_config) def test_scheduler_alias_keeps_default_request_scheduler(self) -> None: scheduler = Scheduler() - scheduler.initialize(SimpleNamespace()) + scheduler.initialize(Mock()) req_id = scheduler.add_request(_make_request("alias")) sched_output = scheduler.schedule() @@ -348,10 +336,10 @@ def test_scheduler_alias_keeps_default_request_scheduler(self) -> None: assert req_id in finished assert scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_COMPLETED - def test_step_raises_aborted_error(self, mocker: MockerFixture) -> None: + def test_step_raises_aborted_error(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.pre_process_func = None - engine.add_req_and_wait_for_response = mocker.Mock( + engine.add_req_and_wait_for_response = Mock( return_value=DiffusionOutput(aborted=True, abort_message="Request req-abort aborted.") ) @@ -361,7 +349,7 @@ def test_step_raises_aborted_error(self, mocker: MockerFixture) -> None: def test_abort_queue_marks_request_finished_aborted(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(SimpleNamespace()) + engine.scheduler.initialize(Mock()) engine.abort_queue = queue.Queue() req_id = engine.scheduler.add_request(_make_request("req-abort")) @@ -373,7 +361,7 @@ def test_abort_queue_marks_request_finished_aborted(self) -> None: def test_finalize_finished_request_returns_aborted_output(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(SimpleNamespace()) + engine.scheduler.initialize(Mock()) req_id = engine.scheduler.add_request(_make_request("req-finalize")) engine.scheduler.finish_requests(req_id, DiffusionRequestStatus.FINISHED_ABORTED) @@ -383,40 +371,29 @@ def 
test_finalize_finished_request_returns_aborted_output(self) -> None: assert output.aborted is True assert output.abort_message == "Request req-finalize aborted." - def test_initializes_step_scheduler_when_step_execution_enabled( - self, - monkeypatch: pytest.MonkeyPatch, - mocker: MockerFixture, - ) -> None: - od_config = SimpleNamespace(model_class_name="mock_model") + def test_initializes_step_scheduler_when_step_execution_enabled(self) -> None: + od_config = Mock(model_class_name="mock_model") od_config.step_execution = True - fake_executor = mocker.Mock() - fake_executor_cls = mocker.Mock(return_value=fake_executor) + fake_executor = Mock() + fake_executor_cls = Mock(return_value=fake_executor) - monkeypatch.setattr( - "vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", - lambda *args, **kwargs: None, - ) - monkeypatch.setattr( - "vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", - lambda *args, **kwargs: None, - ) - monkeypatch.setattr( - "vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", - lambda *args, **kwargs: fake_executor_cls, - ) - monkeypatch.setattr(DiffusionEngine, "_dummy_run", lambda self: None) - engine = DiffusionEngine(od_config) + with ( + patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", return_value=None), + patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", return_value=None), + patch("vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", return_value=fake_executor_cls), + patch.object(DiffusionEngine, "_dummy_run", return_value=None), + ): + engine = DiffusionEngine(od_config) assert isinstance(engine.scheduler, StepScheduler) assert engine.execute_fn is fake_executor.execute_step fake_executor_cls.assert_called_once_with(od_config) - def test_dummy_run_raises_on_output_error(self, mocker: MockerFixture) -> None: + def test_dummy_run_raises_on_output_error(self) -> None: engine = 
DiffusionEngine.__new__(DiffusionEngine) - engine.od_config = SimpleNamespace(model_class_name="mock_model") + engine.od_config = Mock(model_class_name="mock_model") engine.pre_process_func = None - engine.add_req_and_wait_for_response = mocker.Mock(return_value=DiffusionOutput(error="boom")) + engine.add_req_and_wait_for_response = Mock(return_value=DiffusionOutput(error="boom")) with pytest.raises(RuntimeError, match="Dummy run failed: boom"): engine._dummy_run() @@ -425,7 +402,7 @@ def test_dummy_run_raises_on_output_error(self, mocker: MockerFixture) -> None: class TestStepScheduler: def setup_method(self) -> None: self.scheduler: StepScheduler = StepScheduler() - self.scheduler.initialize(SimpleNamespace()) + self.scheduler.initialize(Mock()) def test_single_request_step_lifecycle(self) -> None: request = _make_step_request("step", num_inference_steps=3) diff --git a/tests/diffusion/test_diffusion_step_pipeline.py b/tests/diffusion/test_diffusion_step_pipeline.py index 06f8cd14dc8..68aba9ba3bf 100644 --- a/tests/diffusion/test_diffusion_step_pipeline.py +++ b/tests/diffusion/test_diffusion_step_pipeline.py @@ -7,13 +7,13 @@ import threading from contextlib import contextmanager from types import SimpleNamespace +from unittest.mock import Mock import pytest import torch -from pytest_mock import MockerFixture import vllm_omni.diffusion.worker.diffusion_model_runner as model_runner_module -from tests.helpers.mark import hardware_test +from tests.utils import hardware_test from vllm_omni.diffusion.data import DiffusionOutput from vllm_omni.diffusion.diffusion_engine import DiffusionEngine from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin @@ -542,11 +542,11 @@ def test_rejects_lora_requests_in_step_mode(self): class TestExecutor: """MultiprocDiffusionExecutor.execute_step""" - def test_execute_step_passes_through_runner_output(self, mocker: MockerFixture): + def test_execute_step_passes_through_runner_output(self): executor = 
object.__new__(MultiprocDiffusionExecutor) executor._ensure_open = lambda: None expected = RunnerOutput(req_id="req-step", step_index=1, finished=False, result=None) - executor.collective_rpc = mocker.Mock(return_value=expected) + executor.collective_rpc = Mock(return_value=expected) request = _make_engine_request("req-step", num_inference_steps=2) scheduler_output = _make_scheduler_output(request, sched_req_id="req-step") @@ -578,9 +578,9 @@ class TestEngine: ), ], ) - def test_step_engine_returns_error(self, execute_fn, expected_error, mocker: MockerFixture): + def test_step_engine_returns_error(self, execute_fn, expected_error): scheduler = StepScheduler() - scheduler.initialize(mocker.Mock()) + scheduler.initialize(Mock()) engine = _make_engine(scheduler, execute_fn=execute_fn) output = engine.add_req_and_wait_for_response(_make_engine_request("req-error", num_inference_steps=2)) @@ -588,9 +588,9 @@ def test_step_engine_returns_error(self, execute_fn, expected_error, mocker: Moc assert output.output is None assert expected_error in output.error - def test_step_execution_completes(self, mocker: MockerFixture): + def test_step_execution_completes(self): scheduler = StepScheduler() - scheduler.initialize(mocker.Mock()) + scheduler.initialize(Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-step", num_inference_steps=2) @@ -614,9 +614,9 @@ def execute_fn(_): assert output.error is None assert torch.equal(output.output, torch.tensor([2.0])) - def test_step_abort_stops_rescheduling_after_first_step(self, mocker: MockerFixture): + def test_step_abort_stops_rescheduling_after_first_step(self): scheduler = StepScheduler() - scheduler.initialize(mocker.Mock()) + scheduler.initialize(Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-stop", num_inference_steps=4) @@ -639,9 +639,9 @@ def execute_fn(_): assert step["n"] == 1 _assert_aborted_output(output, "req-stop") - def 
test_step_abort_after_reschedule_returns_aborted_output(self, mocker: MockerFixture): + def test_step_abort_after_reschedule_returns_aborted_output(self): scheduler = StepScheduler() - scheduler.initialize(mocker.Mock()) + scheduler.initialize(Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-mid", num_inference_steps=4) @@ -666,9 +666,9 @@ def execute_fn(sched_output): assert step["n"] == 2 _assert_aborted_output(output, "req-mid") - def test_finished_step_without_result_returns_error(self, mocker: MockerFixture): + def test_finished_step_without_result_returns_error(self): scheduler = StepScheduler() - scheduler.initialize(mocker.Mock()) + scheduler.initialize(Mock()) engine = _make_engine( scheduler, execute_fn=lambda _: RunnerOutput( diff --git a/tests/diffusion/test_diffusion_worker.py b/tests/diffusion/test_diffusion_worker.py index fc08c5f7f03..e2bd7ef8a32 100644 --- a/tests/diffusion/test_diffusion_worker.py +++ b/tests/diffusion/test_diffusion_worker.py @@ -16,7 +16,7 @@ from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.gpu] +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] @pytest.fixture @@ -81,31 +81,17 @@ def test_load_weights_empty_iterable(self, mocker: MockerFixture, mock_gpu_worke class TestDiffusionWorkerSleep: """Test DiffusionWorker.sleep method.""" - @pytest.fixture(autouse=True) - def setup_allocator(self, mocker: MockerFixture): - """ - Unified interception of Allocators, and provision of default security values. 
- """ - self.mock_allocator_class = mocker.patch("vllm.device_allocator.cumem.CuMemAllocator") - self.mock_allocator = mocker.Mock() - self.mock_allocator_class.get_instance.return_value = self.mock_allocator - self.mock_allocator.get_current_usage.return_value = 4 * 1024**3 - self.mock_allocator.sleep = mocker.Mock() - def test_sleep_level_1(self, mocker: MockerFixture, mock_gpu_worker): """Test sleep mode level 1 (offload weights only).""" mock_allocator_class = mocker.patch("vllm.device_allocator.cumem.CuMemAllocator") - mock_platform = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.current_omni_platform") - mock_platform.get_free_memory.side_effect = [10 * 1024**3, 12 * 1024**3] - mock_platform.get_device_total_memory.return_value = 80 * 1024**3 + mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.current_omni_platform") mock_get_process_memory = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.get_process_gpu_memory") # Setup process-scoped memory mocks # Before sleep: 3GB used # After sleep: 1GB used (freed 2GB) - initial_usage = 3 * 1024**3 mock_get_process_memory.side_effect = [ - initial_usage, + 3 * 1024**3, 1 * 1024**3, ] @@ -113,29 +99,25 @@ def test_sleep_level_1(self, mocker: MockerFixture, mock_gpu_worker): mock_allocator = mocker.Mock() mock_allocator_class.get_instance = mocker.Mock(return_value=mock_allocator) mock_allocator.sleep = mocker.Mock() - mock_allocator.get_current_usage.return_value = initial_usage # Call sleep with level 1 result = mock_gpu_worker.sleep(level=1) # Verify sleep was called with correct tags mock_allocator.sleep.assert_called_once_with(offload_tags=("weights",)) - assert bool(result) is True + assert result is True # Verify buffers were NOT saved (level 1 doesn't save buffers) assert len(mock_gpu_worker._sleep_saved_buffers) == 0 def test_sleep_level_2(self, mocker: MockerFixture, mock_gpu_worker): """Test sleep mode level 2 (offload all, save buffers).""" mock_allocator_class = 
mocker.patch("vllm.device_allocator.cumem.CuMemAllocator") - mock_platform = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.current_omni_platform") - mock_platform.get_free_memory.side_effect = [5 * 1024**3, 10 * 1024**3] - mock_platform.get_device_total_memory.return_value = 80 * 1024**3 + mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.current_omni_platform") mock_get_process_memory = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.get_process_gpu_memory") # Setup process-scoped memory mocks - initial_usage = 5 * 1024**3 mock_get_process_memory.side_effect = [ - initial_usage, # Before sleep + 5 * 1024**3, # Before sleep 1 * 1024**3, # After sleep (freed 4GB) ] @@ -143,7 +125,6 @@ def test_sleep_level_2(self, mocker: MockerFixture, mock_gpu_worker): mock_allocator = mocker.Mock() mock_allocator_class.get_instance = mocker.Mock(return_value=mock_allocator) mock_allocator.sleep = mocker.Mock() - mock_allocator.get_current_usage.return_value = initial_usage # Mock pipeline buffers mock_buffer1 = torch.randn(10, 10) @@ -160,7 +141,7 @@ def test_sleep_level_2(self, mocker: MockerFixture, mock_gpu_worker): # Verify sleep was called with empty tags (offload all) mock_allocator.sleep.assert_called_once_with(offload_tags=tuple()) - assert bool(result) is True + assert result is True # Verify buffers were saved assert len(mock_gpu_worker._sleep_saved_buffers) == 2 @@ -170,26 +151,22 @@ def test_sleep_level_2(self, mocker: MockerFixture, mock_gpu_worker): def test_sleep_memory_freed_validation(self, mocker: MockerFixture, mock_gpu_worker): """Test that sleep validates memory was actually freed.""" mock_allocator_class = mocker.patch("vllm.device_allocator.cumem.CuMemAllocator") - mock_platform = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.current_omni_platform") - mock_platform.get_free_memory.return_value = 10 * 1024**3 - mock_platform.get_device_total_memory.return_value = 80 * 1024**3 + 
mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.current_omni_platform") mock_get_process_memory = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.get_process_gpu_memory") # Simulate process memory increase (should trigger assertion error) - initial_usage = 1 * 1024**3 mock_get_process_memory.side_effect = [ - initial_usage, # Before sleep: 1GB used + 1 * 1024**3, # Before sleep: 1GB used 3 * 1024**3, # After sleep: 3GB used (negative freed) ] mock_allocator = mocker.Mock() mock_allocator_class.get_instance = mocker.Mock(return_value=mock_allocator) mock_allocator.sleep = mocker.Mock() - mock_allocator.get_current_usage.return_value = initial_usage # This should raise an assertion error - result = mock_gpu_worker.sleep(level=1) - assert result == initial_usage + with pytest.raises(AssertionError, match="Memory usage increased after sleeping"): + mock_gpu_worker.sleep(level=1) def test_sleep_falls_back_to_device_memory_when_nvml_unavailable(self, mocker: MockerFixture, mock_gpu_worker): """Test sleep uses device-scoped fallback when NVML is unavailable.""" @@ -207,12 +184,11 @@ def test_sleep_falls_back_to_device_memory_when_nvml_unavailable(self, mocker: M mock_allocator = mocker.Mock() mock_allocator_class.get_instance = mocker.Mock(return_value=mock_allocator) mock_allocator.sleep = mocker.Mock() - mock_allocator.get_current_usage.return_value = 2 * 1024**3 result = mock_gpu_worker.sleep(level=1) mock_allocator.sleep.assert_called_once_with(offload_tags=("weights",)) - assert bool(result) is True + assert result is True class TestDiffusionWorkerWakeUp: @@ -226,7 +202,6 @@ def test_wake_up_without_buffers(self, mocker: MockerFixture, mock_gpu_worker): mock_allocator = mocker.Mock() mock_allocator_class.get_instance = mocker.Mock(return_value=mock_allocator) mock_allocator.wake_up = mocker.Mock() - mock_allocator.get_current_usage.return_value = 10 * 1024**3 # Ensure no saved buffers mock_gpu_worker._sleep_saved_buffers = {} @@ -236,7 +211,7 @@ 
def test_wake_up_without_buffers(self, mocker: MockerFixture, mock_gpu_worker): # Verify allocator.wake_up was called mock_allocator.wake_up.assert_called_once_with(["weights"]) - assert bool(result) is True + assert result is True def test_wake_up_with_buffers(self, mocker: MockerFixture, mock_gpu_worker): """Test wake_up with saved buffers (level 2 sleep).""" @@ -246,7 +221,6 @@ def test_wake_up_with_buffers(self, mocker: MockerFixture, mock_gpu_worker): mock_allocator = mocker.Mock() mock_allocator_class.get_instance = mocker.Mock(return_value=mock_allocator) mock_allocator.wake_up = mocker.Mock() - mock_allocator.get_current_usage.return_value = 10 * 1024**3 # Create saved buffers saved_buffer1 = torch.randn(10, 10) @@ -281,7 +255,7 @@ def test_wake_up_with_buffers(self, mocker: MockerFixture, mock_gpu_worker): # Verify saved buffers were cleared assert len(mock_gpu_worker._sleep_saved_buffers) == 0 - assert bool(result) is True + assert result is True def test_wake_up_partial_buffer_restore(self, mocker: MockerFixture, mock_gpu_worker): """Test wake_up only restores buffers that were saved.""" @@ -291,7 +265,6 @@ def test_wake_up_partial_buffer_restore(self, mocker: MockerFixture, mock_gpu_wo mock_allocator = mocker.Mock() mock_allocator_class.get_instance = mocker.Mock(return_value=mock_allocator) mock_allocator.wake_up = mocker.Mock() - mock_allocator.get_current_usage.return_value = 10 * 1024**3 # Only save buffer1, not buffer2 saved_buffer1 = torch.randn(10, 10) @@ -320,4 +293,4 @@ def test_wake_up_partial_buffer_restore(self, mocker: MockerFixture, mock_gpu_wo # buffer2 should NOT be restored since it wasn't saved mock_buffer2.data.copy_.assert_not_called() - assert bool(result) is True + assert result is True diff --git a/tests/diffusion/test_diffusion_worker_cuda_profiler.py b/tests/diffusion/test_diffusion_worker_cuda_profiler.py deleted file mode 100644 index 4a3b22c212e..00000000000 --- a/tests/diffusion/test_diffusion_worker_cuda_profiler.py +++ 
/dev/null @@ -1,101 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -from pytest_mock import MockerFixture - -from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker - -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] - - -@pytest.fixture -def mock_od_config(mocker: MockerFixture): - """Create a mock OmniDiffusionConfig with a CUDA profiler backend.""" - config = mocker.Mock() - config.profiler_config = mocker.Mock() - config.profiler_config.profiler = "cuda" - config.diffusion_load_format = "default" - return config - - -@pytest.fixture -def mock_diffusion_worker_dependencies(mocker: MockerFixture): - """Patch heavy worker dependencies for focused profiler tests.""" - mocker.patch.object(DiffusionWorker, "init_device") - mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.DiffusionModelRunner") - - -class TestDiffusionWorkerCudaProfiler: - def test_creates_cuda_profiler_wrapper( - self, - mocker: MockerFixture, - mock_od_config, - mock_diffusion_worker_dependencies, - ): - fake_profiler = mocker.Mock() - cuda_profiler = mocker.patch( - "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", - return_value=fake_profiler, - ) - create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") - - worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) - - cuda_profiler.assert_called_once_with(mock_od_config.profiler_config) - create_omni_profiler.assert_not_called() - assert worker.profiler is fake_profiler - - def test_profile_start_stop_delegates_to_cuda_profiler( - self, - mocker: MockerFixture, - mock_od_config, - mock_diffusion_worker_dependencies, - ): - fake_profiler = mocker.Mock() - fake_profiler.start = mocker.Mock() - fake_profiler.stop = mocker.Mock() - mocker.patch( - 
"vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", - return_value=fake_profiler, - ) - - worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) - - assert worker.profile(is_start=True) is None - assert worker.profile(is_start=False) is None - - fake_profiler.start.assert_called_once_with() - fake_profiler.stop.assert_called_once_with() - - def test_returns_none_when_profiler_config_is_missing( - self, - mocker: MockerFixture, - mock_od_config, - mock_diffusion_worker_dependencies, - ): - mock_od_config.profiler_config = None - cuda_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper") - create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") - - worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) - - cuda_profiler.assert_not_called() - create_omni_profiler.assert_not_called() - assert worker.profiler is None - - def test_cuda_backend_does_not_use_torch_profiler_factory( - self, - mocker: MockerFixture, - mock_od_config, - mock_diffusion_worker_dependencies, - ): - mocker.patch( - "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", - return_value=mocker.Mock(), - ) - create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") - - DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) - - create_omni_profiler.assert_not_called() diff --git a/tests/diffusion/test_inline_stage_diffusion_client.py b/tests/diffusion/test_inline_stage_diffusion_client.py deleted file mode 100644 index 385f39b1240..00000000000 --- a/tests/diffusion/test_inline_stage_diffusion_client.py +++ /dev/null @@ -1,96 +0,0 @@ -from __future__ import annotations - -import asyncio -from unittest.mock import MagicMock, patch - -import pytest - -from vllm_omni.diffusion.data import OmniDiffusionConfig -from 
vllm_omni.diffusion.inline_stage_diffusion_client import InlineStageDiffusionClient -from vllm_omni.engine.stage_init_utils import StageMetadata -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -@pytest.fixture -def mock_engine(): - with patch("vllm_omni.diffusion.inline_stage_diffusion_client.DiffusionEngine") as mock: - engine_instance = MagicMock() - mock.make_engine.return_value = engine_instance - yield engine_instance - - -@pytest.fixture -def client(mock_engine): - metadata = StageMetadata( - stage_id=0, - stage_type="diffusion", - engine_output_type="image", - is_comprehension=False, - requires_multimodal_data=False, - engine_input_source="prompt", - final_output=True, - final_output_type="image", - default_sampling_params={}, - custom_process_input_func=None, - model_stage=None, - runtime_cfg=None, - ) - with patch.object(InlineStageDiffusionClient, "_enrich_config"): - od_config = MagicMock(spec=OmniDiffusionConfig) - c = InlineStageDiffusionClient(model="test_model", od_config=od_config, metadata=metadata, batch_size=1) - yield c - c.shutdown() - - -@pytest.mark.asyncio -async def test_inline_dispatch_request_success(client, mock_engine): - # Setup mock engine step to return a successful result - mock_result = OmniRequestOutput.from_diffusion(request_id="req-1", images=[MagicMock()]) - mock_engine.step.return_value = [mock_result] - - sampling_params = OmniDiffusionSamplingParams() - await client.add_request_async("req-1", "A test prompt", sampling_params) - - # Wait for the task to be processed - for _ in range(10): - output = client.get_diffusion_output_nowait() - if output is not None: - break - await asyncio.sleep(0.01) - - assert output is not None - assert output.request_id == "req-1" - mock_engine.step.assert_called_once() - - -@pytest.mark.asyncio -async def test_inline_dispatch_request_error(client, mock_engine): - # Setup 
mock engine step to raise an exception - mock_engine.step.side_effect = RuntimeError("Engine failure") - - sampling_params = OmniDiffusionSamplingParams() - await client.add_request_async("req-err", "A test prompt", sampling_params) - - for _ in range(10): - output = client.get_diffusion_output_nowait() - if output is not None: - break - await asyncio.sleep(0.01) - - assert output is not None - assert output.request_id == "req-err" - assert output.error == "Engine failure" - assert not output.images - - -def test_inline_shutdown(client, mock_engine): - assert not client._shutting_down - - # Shutting down should cleanly cancel anything queued and close engine - client.shutdown() - - assert client._shutting_down - mock_engine.close.assert_called_once() diff --git a/tests/diffusion/test_multiproc_engine_concurrency.py b/tests/diffusion/test_multiproc_engine_concurrency.py index 9ec06e8107d..517f98ddaa9 100644 --- a/tests/diffusion/test_multiproc_engine_concurrency.py +++ b/tests/diffusion/test_multiproc_engine_concurrency.py @@ -1,25 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import asyncio -import multiprocessing as mp import queue import threading -import time -from types import SimpleNamespace -from unittest.mock import MagicMock, Mock +from unittest.mock import Mock, patch import pytest import torch -import zmq -from vllm.v1.engine.exceptions import EngineDeadError from vllm_omni.diffusion.data import DiffusionOutput from vllm_omni.diffusion.diffusion_engine import DiffusionEngine from vllm_omni.diffusion.executor.multiproc_executor import MultiprocDiffusionExecutor from vllm_omni.diffusion.sched import RequestScheduler -from vllm_omni.diffusion.stage_diffusion_proc import StageDiffusionProc -from vllm_omni.outputs import OmniRequestOutput pytestmark = [pytest.mark.diffusion, pytest.mark.core_model, pytest.mark.cpu] @@ -32,9 +24,11 @@ def _tagged_output(tag: str) -> DiffusionOutput: return 
DiffusionOutput(output=torch.tensor([0]), error=tag) -def _mock_request(tag: str): - """Return a lightweight request object identifiable by *tag*.""" - return SimpleNamespace(request_ids=[tag]) +def _mock_request(tag: str) -> Mock: + """Return a mock ``OmniDiffusionRequest`` identifiable by *tag*.""" + req = Mock() + req.request_ids = [tag] + return req def _make_executor(num_gpus: int = 1): @@ -42,25 +36,25 @@ def _make_executor(num_gpus: int = 1): Returns ``(executor, request_queue, result_queue)``. """ - od_cfg = SimpleNamespace(num_gpus=num_gpus) - monkeypatch = pytest.MonkeyPatch() - monkeypatch.setattr(MultiprocDiffusionExecutor, "_init_executor", lambda self: None) - executor = MultiprocDiffusionExecutor(od_cfg) - monkeypatch.undo() + od_cfg = Mock() + od_cfg.num_gpus = num_gpus + + with patch.object(MultiprocDiffusionExecutor, "_init_executor"): + executor = MultiprocDiffusionExecutor(od_cfg) req_q: queue.Queue = queue.Queue() res_q: queue.Queue = queue.Queue() - mock_broadcast_mq = SimpleNamespace(enqueue=req_q.put) + mock_broadcast_mq = Mock() + mock_broadcast_mq.enqueue = req_q.put - mock_rmq = SimpleNamespace(dequeue=lambda timeout=None: res_q.get(timeout=timeout if timeout is not None else 10)) + mock_rmq = Mock() + mock_rmq.dequeue = lambda timeout=None: res_q.get(timeout=timeout if timeout is not None else 10) executor._broadcast_mq = mock_broadcast_mq executor._result_mq = mock_rmq executor._closed = False executor._processes = [] - executor.is_failed = False - executor._failure_callbacks = [] return executor, req_q, res_q @@ -69,7 +63,7 @@ def _make_engine(num_gpus: int = 1): executor, req_q, res_q = _make_executor(num_gpus) engine = DiffusionEngine.__new__(DiffusionEngine) sched = RequestScheduler() - sched.initialize(SimpleNamespace()) + sched.initialize(Mock()) engine.scheduler = sched engine.executor = executor engine._rpc_lock = threading.RLock() @@ -344,9 +338,8 @@ def test_collective_rpc_closed_executor_raises(self): class 
TestCollectiveRpcTimeoutWhileLockHeld: """``collective_rpc(timeout=...)`` must honour its timeout even when - another thread holds ``engine._rpc_lock`` indefinitely (e.g. request - execution stalled on ``add_req_and_wait_for_response`` → ``execute_fn`` - → ``collective_rpc`` while blocked on an unresponsive worker). + another thread holds ``engine._rpc_lock`` indefinitely (e.g. a stalled + ``add_req`` waiting on an unresponsive worker). """ def test_rpc_times_out_when_lock_held_directly(self): @@ -370,10 +363,10 @@ def _hold_lock(): with pytest.raises(TimeoutError): engine.collective_rpc("health", timeout=0.5) - def test_rpc_times_out_when_request_execution_stalled_on_worker(self): + def test_rpc_times_out_when_add_req_stalled_on_worker(self): """Real-world scenario the bot flagged: - The scheduler/execute path holds ``_rpc_lock`` while blocked on + ``add_req`` holds ``_rpc_lock`` while blocked on ``executor._result_mq.dequeue()`` because the worker never replies. A concurrent ``collective_rpc(timeout=...)`` must still time out instead of hanging forever waiting for the lock. 
@@ -439,353 +432,3 @@ def _hold_and_release(): t.join(5) assert result.error == "ok" - - -# ───────── error handling: EngineDeadError propagation through layers ───── - - -class TestMultiprocExecutorRaisesEngineDeadError: - """``collective_rpc`` raises ``EngineDeadError`` when the engine is failed.""" - - def test_collective_rpc_raises_when_is_failed(self): - executor = object.__new__(MultiprocDiffusionExecutor) - executor._closed = False - executor._broadcast_mq = MagicMock() - executor._result_mq = MagicMock() - executor._result_mq.dequeue = MagicMock(side_effect=TimeoutError) - executor.is_failed = True - - with pytest.raises(EngineDeadError): - executor.collective_rpc( - "generate", - args=(MagicMock(),), - unique_reply_rank=0, - exec_all_ranks=True, - ) - - def test_collective_rpc_raises_mid_dequeue_when_is_failed(self): - """Worker dies while we are polling the dequeue loop.""" - executor, _, res_q = _make_executor() - - call_count = 0 - orig_dequeue = executor._result_mq.dequeue - - def _dying_dequeue(timeout=None): - nonlocal call_count - call_count += 1 - if call_count == 1: - executor.is_failed = True - raise TimeoutError - return orig_dequeue(timeout=timeout) - - executor._result_mq.dequeue = _dying_dequeue - - with pytest.raises(EngineDeadError): - executor.collective_rpc( - "generate", - args=(MagicMock(),), - unique_reply_rank=0, - exec_all_ranks=True, - ) - - -class TestDiffusionEngineDeadErrorPassthrough: - """``DiffusionEngine.add_req_and_wait_for_response`` re-raises - ``EngineDeadError`` from executor and wraps other errors.""" - - def test_engine_dead_error_propagates(self): - engine, executor, _, _ = _make_engine() - engine.execute_fn = Mock(side_effect=EngineDeadError()) - - with pytest.raises(EngineDeadError): - engine.add_req_and_wait_for_response(_mock_request("dead")) - - def test_runtime_error_wrapped_in_output(self): - engine, executor, _, _ = _make_engine() - engine.execute_fn = Mock(side_effect=RuntimeError("gpu fault")) - - out = 
engine.add_req_and_wait_for_response(_mock_request("fault")) - assert isinstance(out, DiffusionOutput) - assert "gpu fault" in out.error - - -class TestStageDiffusionClientErrorPropagation: - """Error surface behaviour of ``StageDiffusionClient``. - - Uses ``object.__new__`` to construct a client without spawning a real - subprocess, then manually sets the fields needed for each test. - """ - - def _make_client(self, *, engine_dead=False, proc_alive=True): - from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient - - client = object.__new__(StageDiffusionClient) - client.stage_id = 0 - client.final_output = True - client.final_output_type = "image" - client.default_sampling_params = None - client.custom_process_input_func = None - client.engine_input_source = None - - client._output_queue = asyncio.Queue() - client._rpc_results = {} - client._pending_rpcs = set() - client._tasks = {} - client._shutting_down = False - client._engine_dead = engine_dead - client._owns_process = True - client._proc = MagicMock( - is_alive=MagicMock(return_value=proc_alive), - exitcode=1, - ) - client._request_socket = MagicMock() - client._response_socket = MagicMock() - client._encoder = MagicMock() - client._decoder = MagicMock() - - return client - - @pytest.mark.asyncio - async def test_add_request_raises_when_dead(self): - client = self._make_client(engine_dead=True) - - with pytest.raises(EngineDeadError): - await client.add_request_async("req-3", "test prompt", None) - - def test_check_health_raises_when_dead(self): - client = self._make_client(engine_dead=True) - - with pytest.raises(EngineDeadError): - client.check_health() - - def test_check_health_ok_when_alive(self): - client = self._make_client() - client.check_health() - - def test_get_output_raises_engine_dead_when_dead(self): - """When ``_engine_dead`` is True and the output queue is empty, - ``get_diffusion_output_nowait`` must raise ``EngineDeadError``.""" - client = 
self._make_client(engine_dead=True) - # Simulate _drain_responses as a no-op (no ZMQ socket) - client._response_socket.recv.side_effect = zmq.Again - - with pytest.raises(EngineDeadError): - client.get_diffusion_output_nowait() - - def test_get_output_returns_none_when_alive_and_empty(self): - """When the engine is alive and the queue is empty, return None.""" - client = self._make_client() - client._response_socket.recv.side_effect = zmq.Again - - assert client.get_diffusion_output_nowait() is None - - def test_check_health_raises_when_proc_dead(self): - """``check_health`` detects a dead subprocess via ``_proc.is_alive()`` - and raises ``EngineDeadError``, setting ``_engine_dead`` as a - side effect.""" - client = self._make_client(proc_alive=False) - - with pytest.raises(EngineDeadError, match="not alive"): - client.check_health() - - assert client._engine_dead is True - - def test_get_output_raises_when_proc_dead(self): - """When the subprocess has died (non-signal exit) and the output - queue is empty, ``get_diffusion_output_nowait`` must raise - ``EngineDeadError`` with the exit code.""" - client = self._make_client(proc_alive=False) - client._response_socket.recv.side_effect = zmq.Again - - with pytest.raises(EngineDeadError, match="exit code"): - client.get_diffusion_output_nowait() - - assert client._engine_dead is True - - def test_get_output_returns_none_on_signal_death(self): - """When the subprocess was killed by a signal (exit code > 128), - ``get_diffusion_output_nowait`` returns ``None`` and sets - ``_shutting_down`` instead of raising.""" - client = self._make_client(proc_alive=False) - client._proc.exitcode = 137 # SIGKILL (128 + 9) - client._response_socket.recv.side_effect = zmq.Again - - result = client.get_diffusion_output_nowait() - - assert result is None - assert client._shutting_down is True - assert client._engine_dead is True - - -# ───────── monitor thread & death sentinel integration tests ───────── - - -def _poll_flag(get_flag, *, 
timeout=5.0, interval=0.05) -> bool: - """Poll until ``get_flag()`` returns True or *timeout* elapses.""" - deadline = time.monotonic() + timeout - while time.monotonic() < deadline: - if get_flag(): - return True - time.sleep(interval) - return False - - -def _make_short_lived_process() -> mp.Process: - """Spawn a real subprocess that exits immediately. - - The process must be started with ``"fork"`` (or the platform default) - so that it can use a plain ``lambda`` as its target — ``"spawn"`` would - fail to pickle it. - """ - ctx = mp.get_context("fork") - p = ctx.Process(target=lambda: None, name="ShortLivedWorker-0") - p.start() - return p - - -class TestMultiprocExecutorWorkerMonitor: - """Integration tests for ``start_worker_monitor``. - - Uses real short-lived subprocesses so that OS-level sentinel fd - readiness is exercised end-to-end. - """ - - def test_worker_monitor_sets_is_failed_and_calls_callbacks_on_death(self): - """When a worker process dies, the monitor thread must: - 1. Set ``is_failed = True`` - 2. Call ``shutdown()`` (which sets ``_closed = True``) - 3. Invoke all registered failure callbacks - """ - executor = object.__new__(MultiprocDiffusionExecutor) - executor._closed = False - executor.is_failed = False - executor._failure_callbacks = [] - executor._broadcast_mq = None - executor._result_mq = None - executor.resources = None - # Use a no-op so shutdown() doesn't crash on None resources. - executor._finalizer = lambda: None - - proc = _make_short_lived_process() - executor._processes = [proc] - - callback_called = threading.Event() - executor.register_failure_callback(callback_called.set) - - executor.start_worker_monitor() - - # Wait for the process to exit and the monitor to react. 
- proc.join(5) - assert _poll_flag(lambda: executor.is_failed), "is_failed was not set" - assert executor._closed, "shutdown() was not called" - assert callback_called.wait(timeout=2), "failure callback was not invoked" - - def test_worker_monitor_noop_when_already_closed(self): - """If ``_closed`` is already True when the process dies (orderly - shutdown), the monitor must *not* set ``is_failed``.""" - executor = object.__new__(MultiprocDiffusionExecutor) - executor._closed = True # already shut down - executor.is_failed = False - executor._failure_callbacks = [] - executor._broadcast_mq = None - executor._result_mq = None - executor.resources = None - executor._finalizer = lambda: None - - proc = _make_short_lived_process() - executor._processes = [proc] - - executor.start_worker_monitor() - proc.join(5) - - # Give the monitor thread a chance to run (it should early-return). - time.sleep(0.3) - assert not executor.is_failed, "is_failed should remain False on orderly shutdown" - - -class TestStageDiffusionClientProcMonitor: - """Integration test for ``StageDiffusionClient._start_proc_monitor``. - - Uses a real short-lived subprocess to verify the sentinel-based - detection pipeline. - """ - - def test_proc_monitor_sets_engine_dead_on_process_death(self): - """When the subprocess dies, the monitor thread must set - ``_engine_dead = True``.""" - from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient - - client = object.__new__(StageDiffusionClient) - client.stage_id = 0 - client._shutting_down = False - client._engine_dead = False - - proc = _make_short_lived_process() - client._proc = proc - - client._start_proc_monitor() - proc.join(5) - - assert _poll_flag(lambda: client._engine_dead), "_engine_dead was not set" - - -class TestDrainResponsesDeathSentinel: - """Tests for death sentinel and error routing in - ``StageDiffusionClient._drain_responses()``. 
- """ - - def _make_client(self): - from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient - - client = object.__new__(StageDiffusionClient) - client.stage_id = 0 - client._engine_dead = False - client._shutting_down = False - client._output_queue = asyncio.Queue() - client._rpc_results = {} - client._pending_rpcs = set() - client._response_socket = MagicMock() - client._decoder = MagicMock() - return client - - def test_drain_responses_sets_engine_dead_on_death_sentinel(self): - """When ``_drain_responses`` receives the ``DIFFUSION_PROC_DEAD`` - sentinel, it must set ``_engine_dead = True`` and stop draining - (decoder is never called).""" - client = self._make_client() - - # First recv returns the death sentinel, second would be a normal - # message but should never be reached. - client._response_socket.recv.side_effect = [ - StageDiffusionProc.DIFFUSION_PROC_DEAD, - b"should-not-be-reached", - ] - - client._drain_responses() - - assert client._engine_dead is True - client._decoder.decode.assert_not_called() - - def test_drain_responses_routes_error_as_omni_request_output(self): - """When ``_drain_responses`` receives a ``{"type": "error"}`` message - with a ``request_id``, it must place an ``OmniRequestOutput`` with - the error on ``_output_queue``.""" - client = self._make_client() - - error_msg = { - "type": "error", - "request_id": "req-fail", - "error": "gpu fault", - } - # First recv returns the encoded error, second raises zmq.Again. 
- client._response_socket.recv.side_effect = [b"encoded-error", zmq.Again] - client._decoder.decode.return_value = error_msg - - client._drain_responses() - - assert not client._output_queue.empty() - output = client._output_queue.get_nowait() - assert isinstance(output, OmniRequestOutput) - assert output.request_id == "req-fail" - assert output.error == "gpu fault" - assert output.finished is True diff --git a/tests/diffusion/test_stage_diffusion_proc.py b/tests/diffusion/test_stage_diffusion_proc.py deleted file mode 100644 index f1cf4f9b7d1..00000000000 --- a/tests/diffusion/test_stage_diffusion_proc.py +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -from concurrent.futures import ThreadPoolExecutor -from dataclasses import asdict -from types import SimpleNamespace - -import pytest - -from vllm_omni.diffusion.stage_diffusion_proc import StageDiffusionProc -from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] - - -def test_process_batch_request_preserves_parent_request_id_and_kv_sender_info(): - async def run_test(): - captured = {} - - def step(request): - captured["request"] = request - return [ - SimpleNamespace( - images=["img-1"], - _multimodal_output={}, - _custom_output={}, - metrics={}, - stage_durations={}, - peak_memory_mb=0.0, - latents=None, - trajectory_latents=None, - trajectory_timesteps=None, - trajectory_log_probs=None, - trajectory_decoded=None, - final_output_type="image", - ), - SimpleNamespace( - images=["img-2"], - _multimodal_output={}, - _custom_output={}, - metrics={}, - stage_durations={}, - peak_memory_mb=0.0, - latents=None, - trajectory_latents=None, - trajectory_timesteps=None, - trajectory_log_probs=None, - trajectory_decoded=None, - final_output_type="image", - ), - ] - - proc = object.__new__(StageDiffusionProc) - proc._engine = 
SimpleNamespace(step=step) - proc._executor = ThreadPoolExecutor(max_workers=1) - - try: - result = await proc._process_batch_request( - request_id="req-parent", - prompts=["hello", "world"], - sampling_params_dict=asdict(OmniDiffusionSamplingParams()), - kv_sender_info={0: {"host": "10.0.0.2", "zmq_port": 50151}}, - ) - finally: - proc._executor.shutdown(wait=True) - - request = captured["request"] - assert request.request_id == "req-parent" - assert request.request_ids == ["req-parent-0", "req-parent-1"] - assert request.kv_sender_info == {0: {"host": "10.0.0.2", "zmq_port": 50151}} - assert result.request_id == "req-parent" - assert result.images == ["img-1", "img-2"] - - asyncio.run(run_test()) diff --git a/tests/distributed/omni_connectors/test_basic_connectors.py b/tests/distributed/omni_connectors/test_basic_connectors.py index 662d41fe01e..1b1965355e9 100644 --- a/tests/distributed/omni_connectors/test_basic_connectors.py +++ b/tests/distributed/omni_connectors/test_basic_connectors.py @@ -9,7 +9,7 @@ from vllm_omni.distributed.omni_connectors.utils.config import ConnectorSpec from vllm_omni.distributed.omni_connectors.utils.serialization import OmniSerializer -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] +# pytestmark = [pytest.mark.core_model, pytest.mark.cpu] def test_basic_serialization(): @@ -120,61 +120,3 @@ def test_get_invalid_metadata(shm_connector): result = shm_connector.get("stage_0", "stage_1", "req_3", {"unknown": "format"}) assert result is None - - -def test_mooncake_connector_defaults_missing_host_to_detected_ip(monkeypatch: pytest.MonkeyPatch): - import vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector as mooncake_module - - class _FakePool: - is_cuda = False - - def pin_memory(self): - return self - - def data_ptr(self): - return 1234 - - class _FakeTransferEngine: - def initialize(self, host, mode, protocol, device_name): - self.host = host - self.mode = mode - self.protocol = protocol - 
self.device_name = device_name - return 0 - - def get_rpc_port(self): - return 23456 - - def register_memory(self, base_ptr, pool_size): - del base_ptr, pool_size - return 0 - - def unregister_memory(self, base_ptr): - del base_ptr - return 0 - - monkeypatch.setattr(mooncake_module, "TransferEngine", _FakeTransferEngine) - monkeypatch.setattr(mooncake_module.torch, "empty", lambda *args, **kwargs: _FakePool()) - monkeypatch.setattr( - mooncake_module.MooncakeTransferEngineConnector, - "_get_local_ip", - lambda self: "10.20.30.40", - ) - monkeypatch.setattr( - mooncake_module.MooncakeTransferEngineConnector, - "_zmq_listener_loop", - lambda self: self._listener_ready.set(), - ) - - connector = mooncake_module.MooncakeTransferEngineConnector( - { - "zmq_port": 50051, - "memory_pool_size": 4096, - } - ) - try: - assert connector.host == "10.20.30.40" - assert connector.engine.host == "10.20.30.40" - assert connector.get_connection_info()["host"] == "10.20.30.40" - finally: - connector.close() diff --git a/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py b/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py index 22f7c268be2..dddf49a05de 100644 --- a/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py +++ b/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py @@ -4,15 +4,12 @@ import threading from collections import deque from types import SimpleNamespace -from unittest.mock import patch import pytest import torch from pytest_mock import MockerFixture -from vllm.v1.core.sched.scheduler import Scheduler as VLLMScheduler from vllm.v1.request import RequestStatus -from vllm_omni.data_entry_keys import OmniPayload from vllm_omni.distributed.omni_connectors.transfer_adapter.base import OmniTransferAdapterBase from vllm_omni.distributed.omni_connectors.transfer_adapter.chunk_transfer_adapter import ( OmniChunkTransferAdapter, @@ -112,11 +109,7 @@ def test_load_poll(build_adapter): request = _req("req-1", 
RequestStatus.WAITING, external_req_id="external-1") adapter.load_async(request) - payload: OmniPayload = { - "codes": {"audio": [[1]]}, - "hidden_states": {"output": torch.tensor([[2.0]])}, - "meta": {"finished": torch.tensor(True, dtype=torch.bool)}, - } + payload = {"code_predictor_codes": [[1]], "hidden_states": torch.tensor([[2.0]]), "finished": True} connector.get.return_value = (payload, 16) adapter._poll_single_request(request) @@ -140,68 +133,15 @@ def test_save_async(build_adapter): assert task["is_finished"] is False -def test_send_single_request_cleans_up_after_finished_payload(build_adapter, monkeypatch): - adapter, _ = build_adapter(stage_id=1) - request = _req("req-finished", RequestStatus.FINISHED_STOPPED, external_req_id="ext-finished") - - adapter.custom_process_next_stage_input_func = lambda **kwargs: {"x": [1], "finished": True} - cleanup_calls = [] - monkeypatch.setattr(adapter, "cleanup", lambda *a, **kw: cleanup_calls.append((a, kw))) - - adapter._send_single_request({"pooling_output": None, "request": request, "is_finished": True}) - - assert len(cleanup_calls) == 1 - args, _ = cleanup_calls[0] - assert args[0] == "req-finished" - assert args[1] == "ext-finished" - - def test_update_request_payload(build_adapter): adapter, _ = build_adapter() - first: OmniPayload = { - "hidden_states": {"output": torch.tensor([[1.0]])}, - "codes": {"audio": [1]}, - "meta": {"finished": torch.tensor(False, dtype=torch.bool)}, - } - adapter._update_request_payload("ext", first) - second: OmniPayload = { - "hidden_states": {"output": torch.tensor([[2.0]])}, - "codes": {"audio": [2]}, - "meta": {"finished": torch.tensor(True, dtype=torch.bool)}, - } - merged = adapter._update_request_payload("ext", second) - - assert torch.equal(merged["hidden_states"]["output"], torch.tensor([[1.0], [2.0]])) - assert merged["codes"]["audio"] == [1, 2] - assert merged["meta"]["finished"].item() is True - - -def 
test_load_poll_ar_request_additional_information_concats_tensors(build_adapter): - adapter, connector = build_adapter(stage_id=2, model_mode="ar") - request = _req("req-merged", RequestStatus.WAITING, external_req_id="ext-merged") - - adapter.request_ids_mapping["req-merged"] = "ext-merged" - adapter.request_payload["ext-merged"] = { - "hidden_states": {"output": torch.tensor([[1.0]])}, - "ids": {"prompt": [11, 12]}, - "meta": {"finished": torch.tensor(False, dtype=torch.bool)}, - } - payload: OmniPayload = { - "hidden_states": {"output": torch.tensor([[2.0]])}, - "meta": {"finished": torch.tensor(True, dtype=torch.bool)}, - } - connector.get.return_value = (payload, 8) - - adapter._poll_single_request(request) + adapter._update_request_payload("ext", {"h": torch.tensor([[1.0]]), "codes": [1], "finished": False}) + merged = adapter._update_request_payload("ext", {"h": torch.tensor([[2.0]]), "codes": [2], "finished": True}) - assert torch.equal( - request.additional_information["hidden_states"]["output"], - torch.tensor([[1.0], [2.0]]), - ) - # Keys absent from the new chunk are dropped (matches main's behavior). 
- assert "ids" not in request.additional_information - assert request.additional_information["meta"]["finished"].item() is True + assert torch.equal(merged["h"], torch.tensor([[1.0], [2.0]])) + assert merged["codes"] == [1, 2] + assert merged["finished"] is True def test_process_and_restore_queues(build_adapter): @@ -363,10 +303,7 @@ def test_cleanup_after_poll_flow(build_adapter): adapter.load_async(request) adapter.request_ids_mapping["req-flow"] = "ext-flow" - payload: OmniPayload = { - "hidden_states": {"output": torch.tensor([[1.0]])}, - "meta": {"finished": torch.tensor(True, dtype=torch.bool)}, - } + payload = {"hidden_states": torch.tensor([[1.0]]), "finished": True} connector.get.return_value = (payload, 8) adapter._poll_single_request(request) @@ -382,27 +319,6 @@ def test_cleanup_after_poll_flow(build_adapter): assert "ext-flow" not in adapter.request_payload -def test_finish_requests_restores_status(build_adapter): - """Abort path must pop ``requests_origin_status`` and restore pre-wait status. - - While ``process_pending_chunks`` holds a request off the scheduler queues, the - adapter records the prior status (WAITING or RUNNING). ``finish_requests`` must - put that status back on the live ``Request`` so base ``Scheduler.finish_requests`` - can finish bookkeeping without inconsistent state / crashes. 
- """ - adapter, _ = build_adapter(stage_id=1) - req_id = "req-abort-during-chunk" - prior = RequestStatus.RUNNING - request = _req(req_id, RequestStatus.WAITING_FOR_CHUNK) - adapter.requests_origin_status[req_id] = prior - requests_map = {req_id: request} - - adapter.finish_requests([req_id], RequestStatus.FINISHED_ABORTED, requests_map) - - assert request.status == prior - assert req_id not in adapter.requests_origin_status - - # --------------------------------------------------------------- # Scheduler trigger tests # --------------------------------------------------------------- @@ -493,114 +409,3 @@ def test_generation_scheduler_calls_cleanup_on_finished(monkeypatch, mocker: Moc args, _ = cleanup_calls[0] assert args[0] == "req-s1" assert args[1] == "ext-s1" - - -def test_ar_scheduler_defers_cleanup_and_queues_save_on_finished(mocker: MockerFixture): - """OmniARScheduler should enqueue save; adapter cleanup is handled in save thread.""" - cleanup_calls = [] - save_calls = [] - - adapter_mock = mocker.MagicMock() - adapter_mock.cleanup = lambda *a, **kw: cleanup_calls.append((a, kw)) - adapter_mock.save_async = lambda *a, **kw: save_calls.append((a, kw)) - - from vllm_omni.core.sched.omni_ar_scheduler import OmniARScheduler - - scheduler = mocker.MagicMock() - scheduler.chunk_transfer_adapter = adapter_mock - scheduler.connector = None - scheduler.perf_metrics = None - scheduler.log_stats = False - scheduler.recompute_kv_load_failures = False - scheduler.structured_output_manager = mocker.MagicMock() - scheduler.structured_output_manager.should_advance.return_value = False - scheduler.finished_req_ids_dict = {} - scheduler.kv_cache_manager = mocker.MagicMock() - scheduler.kv_cache_manager.take_events.return_value = None - scheduler.kv_event_publisher = mocker.MagicMock() - scheduler.waiting_for_transfer_free = set() - scheduler.transfer_triggered_requests = set() - scheduler.active_kv_transfers = set() - - request = _HashableRequest( - request_id="req-ar", - 
external_req_id="ext-ar", - status=RequestStatus.RUNNING, - is_finished=lambda: False, - num_computed_tokens=1, - num_prompt_tokens=1, - prompt_token_ids=[1], - num_output_placeholders=0, - sampling_params=None, - pooling_params=None, - stop_reason=None, - client_index=0, - take_events=lambda: [], - trace_headers=None, - num_cached_tokens=0, - num_external_computed_tokens=0, - num_nans_in_logits=0, - get_finished_reason=lambda: "stop", - ) - scheduler.requests = {"req-ar": request} - - scheduler._update_request_with_output = mocker.MagicMock(return_value=([], True)) - scheduler._process_kv_transfer_trigger = mocker.MagicMock(return_value=False) - scheduler._handle_stopped_request = mocker.MagicMock(return_value=True) - scheduler._free_request = mocker.MagicMock(return_value=None) - scheduler._get_routed_experts = mocker.MagicMock(return_value=None) - scheduler.running = [request] - scheduler.waiting = mocker.MagicMock() - scheduler.waiting.remove_requests = mocker.MagicMock() - scheduler.make_spec_decoding_stats = mocker.MagicMock(return_value=None) - scheduler.make_stats = mocker.MagicMock(return_value=None) - - scheduler_output = SimpleNamespace( - num_scheduled_tokens={"req-ar": 1}, - scheduled_spec_decode_tokens={}, - num_invalid_spec_tokens=0, - ) - model_runner_output = SimpleNamespace( - sampled_token_ids=[[123]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=None, - num_nans_in_logits=None, - kv_connector_output=None, - cudagraph_stats=None, - req_id_to_index={"req-ar": 0}, - kv_extracted_req_ids=None, - ) - - OmniARScheduler.update_from_output(scheduler, scheduler_output, model_runner_output) - - assert len(cleanup_calls) == 0 - assert len(save_calls) == 1 - - -def test_omni_ar_scheduler_finish_requests(mocker: MockerFixture): - """``OmniARScheduler.finish_requests`` must run chunk adapter hook before vLLM base.""" - from vllm_omni.core.sched.omni_ar_scheduler import OmniARScheduler - - order: list[str] = [] - - adapter = mocker.MagicMock() - 
- def _adapter_finish(request_ids, finished_status, requests): - order.append("adapter") - return [] - - adapter.finish_requests.side_effect = _adapter_finish - - def _super_finish(_self, request_ids, finished_status): - order.append("super") - return [] - - sched = OmniARScheduler.__new__(OmniARScheduler) - sched.chunk_transfer_adapter = adapter - sched.requests = {} - - with patch.object(VLLMScheduler, "finish_requests", _super_finish): - OmniARScheduler.finish_requests(sched, ["r1"], RequestStatus.FINISHED_ABORTED) - - assert order == ["adapter", "super"] diff --git a/tests/distributed/omni_connectors/test_kv_flow.py b/tests/distributed/omni_connectors/test_kv_flow.py index cea18601932..b12fc013b7f 100644 --- a/tests/distributed/omni_connectors/test_kv_flow.py +++ b/tests/distributed/omni_connectors/test_kv_flow.py @@ -1,14 +1,8 @@ -import json -import struct - -import numpy as np import pytest import torch -import vllm_omni.distributed.omni_connectors.kv_transfer_manager as kv_transfer_manager_module from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.distributed.omni_connectors.kv_transfer_manager import ( - KVCacheTransferData, OmniKVCacheConfig, OmniKVTransferManager, ) @@ -66,35 +60,6 @@ def common_constants(): } -def _decode_stored_payload(data): - if isinstance(data, torch.Tensor) and data.dtype == torch.uint8 and data.dim() == 1: - return KVCacheTransferData.from_bytes(data.cpu().numpy().tobytes()) - - if isinstance(data, (bytes, bytearray, memoryview)): - return KVCacheTransferData.from_bytes(data) - - return data - - -def _make_serialized_payload() -> tuple[bytes, torch.Tensor]: - key_tensor = torch.arange(12, dtype=torch.float32).reshape(3, 4) - payload = KVCacheTransferData( - request_id="req-payload", - layer_blocks={"key_cache": [key_tensor], "value_cache": [None]}, - block_ids=[1], - metadata={"seq_len": 3}, - ).to_bytes() - return payload, key_tensor - - -def _rewrite_serialized_header(payload: bytes, mutate_header) -> 
bytes: - header_len = struct.unpack(">I", payload[:4])[0] - header = json.loads(payload[4 : 4 + header_len]) - mutate_header(header) - new_header = json.dumps(header, separators=(",", ":")).encode("utf-8") - return struct.pack(">I", len(new_header)) + new_header + payload[4 + header_len :] - - def test_manager_extraction(kv_config, mock_connector, common_constants): """Test extraction and sending logic in OmniKVTransferManager.""" num_layers = common_constants["num_layers"] @@ -130,7 +95,7 @@ def test_manager_extraction(kv_config, mock_connector, common_constants): expected_key = f"stage1->stage2:{full_request_id}" assert expected_key in mock_connector.store - data = _decode_stored_payload(mock_connector.store[expected_key]) + data = mock_connector.store[expected_key] assert data["request_id"] == req_id assert "layer_blocks" in data assert len(data["layer_blocks"]["key_cache"]) == num_layers @@ -141,116 +106,6 @@ def test_manager_extraction(kv_config, mock_connector, common_constants): assert data["layer_blocks"]["key_cache"][0].shape == expected_shape -def test_from_bytes_rejects_out_of_bounds_header_len(): - payload, _ = _make_serialized_payload() - bad_payload = struct.pack(">I", len(payload)) + payload[4:] - - with pytest.raises(ValueError, match="header_len"): - KVCacheTransferData.from_bytes(bad_payload) - - with pytest.raises(ValueError, match="header_len"): - KVCacheTransferData.from_bytes_gpu(torch.tensor(list(bad_payload), dtype=torch.uint8)) - - -def test_from_bytes_rejects_out_of_bounds_tensor_span(): - payload, _ = _make_serialized_payload() - bad_payload = _rewrite_serialized_header(payload, lambda header: header["td"][0].update({"o": 4096})) - - with pytest.raises(ValueError, match="tensor span"): - KVCacheTransferData.from_bytes(bad_payload) - - with pytest.raises(ValueError, match="tensor span"): - KVCacheTransferData.from_bytes_gpu(torch.tensor(list(bad_payload), dtype=torch.uint8)) - - -def test_from_bytes_rejects_unsupported_dtype(): - payload, 
_ = _make_serialized_payload() - bad_payload = _rewrite_serialized_header(payload, lambda header: header["td"][0].update({"d": "cuda"})) - - with pytest.raises(ValueError, match="Unsupported dtype"): - KVCacheTransferData.from_bytes(bad_payload) - - with pytest.raises(ValueError, match="Unsupported dtype"): - KVCacheTransferData.from_bytes_gpu(torch.tensor(list(bad_payload), dtype=torch.uint8)) - - -def test_from_bytes_uses_explicit_layer_index_descriptor(): - payload, key_tensor = _make_serialized_payload() - payload_with_explicit_index = _rewrite_serialized_header( - payload, - lambda header: header["td"][0].update({"n": "key_cache_extra_suffix", "i": 0}), - ) - - data = KVCacheTransferData.from_bytes(payload_with_explicit_index) - - assert torch.equal(data["layer_blocks"]["key_cache"][0], key_tensor) - - -def test_update_sender_info_uses_configured_source_stage(): - config = OmniKVCacheConfig( - connector_config={"type": "mock"}, - stage_id=2, - engine_input_source=[1], - need_recv_cache=True, - ) - manager = OmniKVTransferManager(config) - - manager.update_sender_info( - { - 0: {"host": "10.0.0.1", "zmq_port": 50151}, - 1: {"host": "10.0.0.2", "zmq_port": 50152}, - } - ) - - assert manager.config.connector_config["sender_host"] == "10.0.0.2" - assert manager.config.connector_config["sender_zmq_port"] == 50152 - - -def test_clone_received_payload_tensors_breaks_buffer_alias(): - payload, key_tensor = _make_serialized_payload() - raw = np.frombuffer(bytearray(payload), dtype=np.uint8) - data = KVCacheTransferData.from_bytes(memoryview(raw)) - - OmniKVTransferManager._clone_received_payload_tensors(data) - raw[:] = 0 - - assert torch.equal(data["layer_blocks"]["key_cache"][0], key_tensor) - - -def test_receive_kv_cache_uses_exponential_backoff(monkeypatch): - config = OmniKVCacheConfig( - connector_config={"type": "mock"}, - from_stage="sender", - stage_id="receiver", - need_recv_cache=True, - recv_timeout=0.3, - ) - manager = OmniKVTransferManager(config) - - 
class _NeverReadyConnector: - def get(self, **kwargs): - del kwargs - return None - - manager._connector = _NeverReadyConnector() - - now = {"value": 0.0} - sleep_intervals = [] - - monkeypatch.setattr(kv_transfer_manager_module.time, "time", lambda: now["value"]) - - def _fake_sleep(interval: float) -> None: - sleep_intervals.append(interval) - now["value"] += interval - - monkeypatch.setattr(kv_transfer_manager_module.time, "sleep", _fake_sleep) - - data, size = manager.receive_kv_cache_for_request("req-backoff") - - assert (data, size) == (None, 0) - assert sleep_intervals == pytest.approx([0.01, 0.02, 0.04, 0.08, 0.16]) - - def test_manager_extraction_tuple_layout(kv_config, mock_connector, common_constants): """Test extraction with tuple layout.""" num_layers = common_constants["num_layers"] @@ -280,7 +135,7 @@ def test_manager_extraction_tuple_layout(kv_config, mock_connector, common_const expected_key = f"stage1->stage2:{full_request_id}" assert expected_key in mock_connector.store - data = _decode_stored_payload(mock_connector.store[expected_key]) + data = mock_connector.store[expected_key] expected_shape = (seq_len, num_heads, head_dim) for idx in range(len(kv_caches)): assert data["layer_blocks"]["key_cache"][idx].shape == expected_shape @@ -310,7 +165,7 @@ def test_manager_extraction_mismatched_kv_block_counts(kv_config, mock_connector expected_key = f"stage1->stage2:{full_request_id}" assert expected_key in mock_connector.store - data = _decode_stored_payload(mock_connector.store[expected_key]) + data = mock_connector.store[expected_key] expected_shape = (2 * block_size, num_heads, head_dim) assert data["layer_blocks"]["key_cache"][0].shape == expected_shape assert data["layer_blocks"]["value_cache"][0].shape == expected_shape @@ -399,82 +254,6 @@ def test_manager_reception(kv_config, mock_connector, common_constants): assert req.kv_metadata["seq_len"] == seq_len -def test_manager_reception_prefers_parent_request_id_for_batched_request(kv_config, 
mock_connector, common_constants): - """Batched diffusion requests must fetch KV using the parent/global request ID.""" - num_layers = common_constants["num_layers"] - num_heads = common_constants["num_heads"] - head_dim = common_constants["head_dim"] - seq_len = common_constants["seq_len"] - parent_req_id = common_constants["req_id"] - - expected_shape = (seq_len, num_heads, head_dim) - key_cache = [torch.randn(expected_shape) for _ in range(num_layers)] - value_cache = [torch.randn(expected_shape) for _ in range(num_layers)] - - data_to_receive = { - "request_id": parent_req_id, - "layer_blocks": {"key_cache": key_cache, "value_cache": value_cache}, - "metadata": {"seq_len": seq_len}, - "block_ids": [], - } - - manager = OmniKVTransferManager(kv_config) - manager._connector = mock_connector - - full_request_id = f"omni_stage1_to_stage2_kv_cache_{parent_req_id}" - store_key = f"stage1->stage2:{full_request_id}" - mock_connector.store[store_key] = data_to_receive - - req = OmniDiffusionRequest( - prompts=["prompt-a", "prompt-b"], - sampling_params=OmniDiffusionSamplingParams(), - request_ids=[f"{parent_req_id}-0", f"{parent_req_id}-1"], - request_id=parent_req_id, - ) - - success = manager.receive_kv_cache(req, target_device=torch.device("cpu")) - - assert success - assert req.kv_metadata["seq_len"] == seq_len - assert torch.allclose(req.past_key_values.key_cache[0], key_cache[0]) - - -def test_receive_multi_kv_cache_uses_parent_request_id_for_cfg_collection(kv_config): - manager = OmniKVTransferManager(kv_config) - - seen = {} - - def collect_cfg(request_id, cfg_request_ids, kv_transfer_manager, target_device): - seen["request_id"] = request_id - seen["cfg_request_ids"] = cfg_request_ids - seen["kv_transfer_manager"] = kv_transfer_manager - seen["target_device"] = target_device - return {"cfg_text_kv_metadata": {"ok": True}} - - req = OmniDiffusionRequest( - prompts=["prompt-a", "prompt-b"], - sampling_params=OmniDiffusionSamplingParams(), - 
request_ids=["req-parent-0", "req-parent-1"], - request_id="req-parent", - ) - req.sampling_params.cfg_kv_request_ids = {"cfg_text": "req-parent__cfg_text"} - - manager.receive_kv_cache = lambda request, target_device=None: request is req - - success = manager.receive_multi_kv_cache( - req, - cfg_kv_collect_func=collect_cfg, - target_device=torch.device("cpu"), - ) - - assert success - assert seen["request_id"] == "req-parent" - assert seen["cfg_request_ids"] == {"cfg_text": "req-parent__cfg_text"} - assert seen["kv_transfer_manager"] is manager - assert seen["target_device"] == torch.device("cpu") - assert req.sampling_params.cfg_text_kv_metadata == {"ok": True} - - def test_integration_flow(common_constants): """Simulate extraction -> connector -> reception.""" num_layers = common_constants["num_layers"] diff --git a/tests/distributed/omni_connectors/test_shm_connector.py b/tests/distributed/omni_connectors/test_shm_connector.py deleted file mode 100644 index e702318e3f3..00000000000 --- a/tests/distributed/omni_connectors/test_shm_connector.py +++ /dev/null @@ -1,184 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for SharedMemoryConnector focusing on TP / CFG / metadata fallback.""" - -import pytest - -from vllm_omni.distributed.omni_connectors.connectors.shm_connector import ( - SharedMemoryConnector, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -@pytest.fixture() -def connector(): - c = SharedMemoryConnector({"shm_threshold_bytes": 64}) - yield c - c.close() - - -# ── Key-based read (the fundamental SHM path) ──────────────────────── - - -class TestKeyBasedReadWrite: - def test_put_then_get_by_key(self, connector): - data = {"hello": "world", "n": 42} - ok, size, meta = connector.put("s0", "s1", "test_key_1", data) - assert ok - assert size > 0 - assert "shm" in meta - assert "test_key_1" in connector._pending_keys - - result = connector.get("s0", "s1", 
"test_key_1", metadata=None) - assert result is not None - obj, rsize = result - assert obj == data - assert rsize == size - assert "test_key_1" not in connector._pending_keys - - def test_get_nonexistent_key_returns_none(self, connector): - result = connector.get("s0", "s1", "no_such_key_xyz", metadata=None) - assert result is None - - def test_rank_aware_keys_independent(self, connector): - """Each TP rank writes/reads its own key — simulates homogeneous TP.""" - payloads = {} - for rank in range(4): - key = f"req1_s0_0_{rank}_{rank}" - data = {"rank": rank, "values": list(range(rank, rank + 3))} - ok, _, _ = connector.put("s0", "s1", key, data) - assert ok - payloads[rank] = data - - for rank in range(4): - key = f"req1_s0_0_{rank}_{rank}" - result = connector.get("s0", "s1", key, metadata=None) - assert result is not None - obj, _ = result - assert obj == payloads[rank] - - -# ── Metadata fallback behaviour ────────────────────────────────────── - - -class TestMetadataFallback: - def test_rdma_style_metadata_falls_back_to_key(self, connector): - """source_host/source_port metadata should be ignored; key read used.""" - data = {"payload": True} - connector.put("s0", "s1", "fb_key_1", data) - - rdma_meta = {"source_host": "10.0.0.1", "source_port": 12345} - result = connector.get("s0", "s1", "fb_key_1", metadata=rdma_meta) - assert result is not None - obj, _ = result - assert obj == data - - def test_non_dict_metadata_falls_back_to_key(self, connector): - data = {"val": 99} - connector.put("s0", "s1", "fb_key_2", data) - - result = connector.get("s0", "s1", "fb_key_2", metadata="not_a_dict") - assert result is not None - obj, _ = result - assert obj == data - - def test_empty_dict_metadata_falls_back_to_key(self, connector): - data = {"x": 1} - connector.put("s0", "s1", "fb_key_3", data) - - result = connector.get("s0", "s1", "fb_key_3", metadata={}) - assert result is not None - obj, _ = result - assert obj == data - - def 
test_shm_handle_metadata_still_works(self, connector): - """When metadata contains a proper 'shm' handle, use it directly.""" - data = {"direct": True} - ok, size, meta = connector.put("s0", "s1", "shm_direct_1", data) - assert ok - result = connector.get("s0", "s1", "shm_direct_1", metadata=meta) - assert result is not None - obj, _ = result - assert obj == data - - def test_metadata_keyed_by_request_id(self, connector): - """Metadata wrapped as {get_key: actual_meta} should be unwrapped.""" - data = {"wrapped": True} - ok, size, meta = connector.put("s0", "s1", "wrap_key", data) - assert ok - wrapped = {"wrap_key": meta} - result = connector.get("s0", "s1", "wrap_key", metadata=wrapped) - assert result is not None - obj, _ = result - assert obj == data - - -# ── Heterogeneous TP multi-key read ────────────────────────────────── - - -class TestHeteroTPMultiKey: - def test_receiver_reads_multiple_sender_keys(self, connector): - """Simulates from_tp=2 -> to_tp=1: receiver reads 2 keys and merges.""" - for sender_rank in range(2): - key = f"req1_s0_0_{sender_rank}_0" - data = {"sender": sender_rank, "shard": [sender_rank * 10]} - connector.put("s0", "s1", key, data) - - shards = [] - for sender_rank in range(2): - key = f"req1_s0_0_{sender_rank}_0" - result = connector.get("s0", "s1", key, metadata=None) - assert result is not None - obj, _ = result - shards.append(obj) - - assert len(shards) == 2 - assert shards[0]["sender"] == 0 - assert shards[1]["sender"] == 1 - - def test_sender_writes_multiple_receiver_keys(self, connector): - """Simulates from_tp=1 -> to_tp=2: sender writes 2 sliced keys.""" - for recv_rank in range(2): - key = f"req1_s0_0_0_{recv_rank}" - data = {"target": recv_rank, "slice": list(range(recv_rank, recv_rank + 2))} - connector.put("s0", "s1", key, data) - - for recv_rank in range(2): - key = f"req1_s0_0_0_{recv_rank}" - result = connector.get("s0", "s1", key, metadata=None) - assert result is not None - obj, _ = result - assert obj["target"] 
== recv_rank - - -# ── Cleanup ────────────────────────────────────────────────────────── - - -class TestCleanup: - def test_cleanup_removes_unconsumed_segment(self, connector): - data = {"leak": True} - connector.put("s0", "s1", "cleanup_req_42", data) - assert "cleanup_req_42" in connector._pending_keys - - connector.cleanup("req_42") - assert "cleanup_req_42" not in connector._pending_keys - - result = connector.get("s0", "s1", "cleanup_req_42", metadata=None) - assert result is None - - def test_cleanup_noop_for_consumed_segment(self, connector): - data = {"consumed": True} - connector.put("s0", "s1", "consumed_req_99", data) - connector.get("s0", "s1", "consumed_req_99", metadata=None) - - connector.cleanup("req_99") - assert "consumed_req_99" not in connector._pending_keys - - def test_close_cleans_all_pending(self, connector): - for i in range(3): - connector.put("s0", "s1", f"close_test_{i}", {"i": i}) - - assert len(connector._pending_keys) == 3 - connector.close() - assert len(connector._pending_keys) == 0 diff --git a/tests/distributed/omni_connectors/test_tp_rank_aware.py b/tests/distributed/omni_connectors/test_tp_rank_aware.py deleted file mode 100644 index d4793479aaf..00000000000 --- a/tests/distributed/omni_connectors/test_tp_rank_aware.py +++ /dev/null @@ -1,716 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for rank-aware KV transfer (TP > 1) and heterogeneous TP support. 
- -Covers: -- _build_rank_aware_send_keys / _build_rank_aware_recv_keys -- _get_kv_source_ranks / _get_kv_target_ranks / get_kv_connector_key -- update_sender_info storing base host/port -- receive path constructing per-rank metadata for connector.get() -- Mooncake connector _query_metadata_at and partial-metadata get() path -""" - -from types import SimpleNamespace -from unittest.mock import MagicMock, patch - -import pytest -import torch - -from vllm_omni.distributed.omni_connectors.kv_transfer_manager import ( - KVCacheTransferData, - OmniKVCacheConfig, - OmniKVTransferManager, -) -from vllm_omni.distributed.omni_connectors.utils.initialization import ( - KV_RANK_PORT_STRIDE, -) -from vllm_omni.distributed.omni_connectors.utils.kv_utils import ( - KVTPTopology, - build_rank_aware_recv_keys, - build_rank_aware_send_keys, - get_kv_connector_key, - get_kv_source_ranks, - get_kv_target_ranks, - merge_received_rank_shards, - slice_received_rank_shard, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -def _make_manager( - from_tp: int = 1, - to_tp: int = 1, - local_rank: int = 0, - from_stage: str = "stage0", - to_stage: str = "stage1", - stage_id: str = "stage1", - need_recv: bool = True, - need_send: bool = False, - recv_timeout: float = 0.3, -) -> OmniKVTransferManager: - """Build a manager with TP params injected, bypassing torch.distributed.""" - config = OmniKVCacheConfig( - connector_config={"type": "mock"}, - from_stage=from_stage, - to_stage=to_stage, - stage_id=stage_id, - need_recv_cache=need_recv, - need_send_cache=need_send, - recv_timeout=recv_timeout, - from_tp=from_tp, - to_tp=to_tp, - ) - with ( - patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=local_rank), - patch( - "vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", - return_value=max(from_tp, to_tp), - ), - ): - mgr = OmniKVTransferManager(config) - return mgr - - -def _make_payload(head_values: 
list[float], request_id: str = "req-1") -> dict: - head_tensor = torch.tensor(head_values, dtype=torch.float32).view(1, len(head_values), 1).repeat(2, 1, 1) - return { - "request_id": request_id, - "layer_blocks": { - "key_cache": [head_tensor.clone()], - "value_cache": [(head_tensor + 100).clone()], - }, - "block_ids": [0], - "metadata": {"seq_len": 2}, - } - - -def _make_transfer_data(head_values: list[float], request_id: str = "req-1") -> KVCacheTransferData: - payload = _make_payload(head_values, request_id=request_id) - return KVCacheTransferData( - request_id=request_id, - layer_blocks=payload["layer_blocks"], - block_ids=payload["block_ids"], - metadata=payload["metadata"], - ) - - -# ── Key format helper ──────────────────────────────────────────────── - - -class TestConnectorKeyFormat: - def test_key_format_matches_pr2677(self): - key = get_kv_connector_key("req-1", "stage0", 0, 1, 2) - assert key == "req-1_stage0_0_1_2" - - def test_key_fields_are_positional(self): - key = get_kv_connector_key("r", "s", 5, 3, 7) - parts = key.split("_") - assert parts == ["r", "s", "5", "3", "7"] - - -# ── Source / target rank mapping ───────────────────────────────────── - - -class TestRankMapping: - """Verify get_kv_target_ranks and get_kv_source_ranks for various TP configs.""" - - def test_homogeneous_tp2_rank0(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) - assert get_kv_target_ranks(topo) == [0] - assert get_kv_source_ranks(topo) == [0] - - def test_homogeneous_tp2_rank1(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=1) - assert get_kv_target_ranks(topo) == [1] - assert get_kv_source_ranks(topo) == [1] - - def test_homogeneous_tp4_rank3(self): - topo = KVTPTopology(source_tp_size=4, target_tp_size=4, local_rank=3) - assert get_kv_target_ranks(topo) == [3] - assert get_kv_source_ranks(topo) == [3] - - def test_sender_gt_receiver_tp4_to_tp2_rank0(self): - """Receiver rank 0 should receive from sender rank 0 
and 1.""" - topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=0) - assert get_kv_source_ranks(topo) == [0, 1] - - def test_sender_gt_receiver_tp4_to_tp2_rank1(self): - """Receiver rank 1 should receive from sender rank 2 and 3.""" - topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) - assert get_kv_source_ranks(topo) == [2, 3] - - def test_sender_lt_receiver_tp2_to_tp4_rank0(self): - """Sender rank 0 should send to receiver ranks 0 and 1.""" - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) - assert get_kv_target_ranks(topo) == [0, 1] - - def test_sender_lt_receiver_tp2_to_tp4_rank1(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) - assert get_kv_target_ranks(topo) == [2, 3] - - def test_receiver_lt_sender_source_ranks(self): - """Receiver rank 0 with tp2_to_tp4 should source from rank 0 only.""" - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) - assert get_kv_source_ranks(topo) == [0] - - def test_invalid_topology_raises(self): - topo = KVTPTopology(source_tp_size=3, target_tp_size=2, local_rank=0) - with pytest.raises(ValueError, match="divisible"): - get_kv_source_ranks(topo) - - -# ── _build_rank_aware_recv_keys ────────────────────────────────────── - - -class TestBuildRankAwareRecvKeys: - """Verify build_rank_aware_recv_keys returns (key, from_rank) tuples.""" - - def test_tp1_returns_legacy_key_with_none_rank(self): - topo = KVTPTopology(source_tp_size=1, target_tp_size=1, local_rank=0) - pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) - assert len(pairs) == 1 - key, rank = pairs[0] - assert key == "omni_stage0_to_stage1_kv_cache_req-1" - assert rank is None - - def test_homogeneous_tp2_rank0(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) - pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) - assert len(pairs) == 1 - key, rank = pairs[0] - assert key == "req-1_stage0_0_0_0" - 
assert rank == 0 - - def test_homogeneous_tp2_rank1(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=1) - pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) - assert len(pairs) == 1 - key, rank = pairs[0] - assert key == "req-1_stage0_0_1_1" - assert rank == 1 - - def test_heterogeneous_tp4_to_tp2_rank0_gets_two_keys(self): - """Receiver rank 0 with source_tp=4, target_tp=2 should get 2 keys.""" - topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=0) - pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) - assert len(pairs) == 2 - - keys = [k for k, _ in pairs] - ranks = [r for _, r in pairs] - assert keys == ["req-1_stage0_0_0_0", "req-1_stage0_0_1_0"] - assert ranks == [0, 1] - - def test_heterogeneous_tp4_to_tp2_rank1_gets_two_keys(self): - topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) - pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) - assert len(pairs) == 2 - - ranks = [r for _, r in pairs] - assert ranks == [2, 3] - - def test_heterogeneous_tp2_to_tp4_rank2_gets_one_key(self): - """Receiver rank 2 with source_tp=2, target_tp=4 should get 1 key from sender rank 1.""" - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=2) - pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) - assert len(pairs) == 1 - key, rank = pairs[0] - assert rank == 1 - assert key == "req-1_stage0_0_1_2" - - -# ── _build_rank_aware_send_keys ────────────────────────────────────── - - -class TestBuildRankAwareSendKeys: - def test_tp1_returns_legacy_key(self): - topo = KVTPTopology(source_tp_size=1, target_tp_size=1, local_rank=0) - keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", topo) - assert keys == ["omni_stage0_to_stage1_kv_cache_req-1"] - - def test_homogeneous_tp2_rank0(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) - keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", 
topo) - assert keys == ["req-1_stage0_0_0_0"] - - def test_sender_lt_receiver_tp2_to_tp4_rank0_sends_two_keys(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) - keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", topo) - assert len(keys) == 2 - assert keys == ["req-1_stage0_0_0_0", "req-1_stage0_0_0_1"] - - -# ── update_sender_info stores base host/port ───────────────────────── - - -class TestUpdateSenderInfoBase: - def test_stores_base_host_and_port(self): - mgr = _make_manager(from_tp=2, to_tp=2, local_rank=0) - mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) - - assert mgr._sender_base_host == "10.0.0.1" - assert mgr._sender_base_zmq_port == 50151 - - def test_rank1_adjusts_default_port_but_preserves_base(self): - mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1) - mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) - - assert mgr._sender_base_host == "10.0.0.1" - assert mgr._sender_base_zmq_port == 50151 - expected_adjusted = 50151 + 1 * KV_RANK_PORT_STRIDE - assert mgr.config.connector_config["sender_zmq_port"] == expected_adjusted - - def test_nested_sender_info_resolves_correctly(self): - """Nested sender_info keyed by integer stage id should resolve - using recv_stages (engine_input_source → recv_from).""" - config = OmniKVCacheConfig( - connector_config={"type": "mock"}, - stage_id=2, - engine_input_source=[1], - need_recv_cache=True, - from_tp=2, - to_tp=2, - ) - with ( - patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), - patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=2), - ): - mgr = OmniKVTransferManager(config) - mgr.update_sender_info( - { - 0: {"host": "10.0.0.1", "zmq_port": 50151}, - 1: {"host": "10.0.0.2", "zmq_port": 50152}, - } - ) - assert mgr._sender_base_host == "10.0.0.2" - assert mgr._sender_base_zmq_port == 50152 - - -# ── receive path constructs per-rank metadata 
──────────────────────── - - -class TestReceiveConstructsMetadata: - """Verify that receive_kv_cache_for_request passes metadata with - correct (host, port) to connector.get() for heterogeneous TP.""" - - def test_tp1_no_metadata_passed(self): - """TP=1: connector.get() should be called WITHOUT metadata.""" - mgr = _make_manager(from_tp=1, to_tp=1, local_rank=0, recv_timeout=0.05) - mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) - - calls = [] - - class _Connector: - def get(self, from_stage, to_stage, get_key, metadata=None): - calls.append({"key": get_key, "metadata": metadata}) - return None - - mgr._connector = _Connector() - mgr.receive_kv_cache_for_request("req-1") - - assert len(calls) > 0 - assert calls[0]["metadata"] is None - - def test_homogeneous_tp2_rank0_passes_metadata(self): - """TP=2 rank 0: metadata should point to sender rank 0's port.""" - mgr = _make_manager(from_tp=2, to_tp=2, local_rank=0, recv_timeout=0.05) - mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) - - calls = [] - - class _Connector: - def get(self, from_stage, to_stage, get_key, metadata=None): - calls.append({"key": get_key, "metadata": metadata}) - return None - - mgr._connector = _Connector() - mgr.receive_kv_cache_for_request("req-1") - - assert len(calls) > 0 - meta = calls[0]["metadata"] - assert meta is not None - assert meta["source_host"] == "10.0.0.1" - assert meta["source_port"] == 50151 + 0 * KV_RANK_PORT_STRIDE - - def test_homogeneous_tp2_rank1_passes_metadata_with_offset(self): - mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1, recv_timeout=0.05) - mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) - - calls = [] - - class _Connector: - def get(self, from_stage, to_stage, get_key, metadata=None): - calls.append({"key": get_key, "metadata": metadata}) - return None - - mgr._connector = _Connector() - mgr.receive_kv_cache_for_request("req-1") - - meta = calls[0]["metadata"] - assert meta["source_port"] == 50151 + 1 * 
KV_RANK_PORT_STRIDE - - def test_heterogeneous_tp4_to_tp2_rank0_multiple_metadata(self): - """Receiver rank 0 with source_tp=4, target_tp=2 should call get() with - two different metadata entries for sender ranks 0 and 1.""" - mgr = _make_manager(from_tp=4, to_tp=2, local_rank=0, recv_timeout=0.05) - mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) - - calls = [] - - class _Connector: - def get(self, from_stage, to_stage, get_key, metadata=None): - calls.append({"key": get_key, "metadata": metadata}) - return None - - mgr._connector = _Connector() - mgr.receive_kv_cache_for_request("req-1") - - seen_ports = set() - for c in calls: - if c["metadata"]: - seen_ports.add(c["metadata"]["source_port"]) - expected_ports = { - 50151 + 0 * KV_RANK_PORT_STRIDE, - 50151 + 1 * KV_RANK_PORT_STRIDE, - } - assert expected_ports.issubset(seen_ports) - - -# ── Mooncake connector _query_metadata_at ──────────────────────────── - - -class TestMooncakeQueryMetadataAt: - """Test the connector's _query_metadata_at method and partial-metadata - path in get() without requiring real RDMA/Mooncake.""" - - def test_query_metadata_at_returns_full_metadata(self): - """Mock the ZMQ interaction to verify _query_metadata_at returns - complete metadata including data_size.""" - - try: - from vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector import ( - MooncakeTransferEngineConnector, - QueryResponse, - ) - except ImportError: - pytest.skip("Mooncake not available") - - import msgspec - - connector = MagicMock(spec=MooncakeTransferEngineConnector) - connector._get_req_socket = MagicMock() - - mock_socket = MagicMock() - resp = QueryResponse(request_id="test_key@s0_s1", data_size=4096, is_fast_path=True) - mock_socket.recv.return_value = msgspec.msgpack.encode(resp) - connector._get_req_socket.return_value = mock_socket - - result = MooncakeTransferEngineConnector._query_metadata_at( - connector, - "test_key@s0_s1", - "10.0.0.1", - 50151, - ) - - 
assert result is not None - assert result["source_host"] == "10.0.0.1" - assert result["source_port"] == 50151 - assert result["data_size"] == 4096 - assert result["is_fast_path"] is True - - def test_query_metadata_at_returns_none_on_not_found(self): - try: - from vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector import ( - INFO_NOT_FOUND, - MooncakeTransferEngineConnector, - ) - except ImportError: - pytest.skip("Mooncake not available") - - connector = MagicMock(spec=MooncakeTransferEngineConnector) - mock_socket = MagicMock() - mock_socket.recv.return_value = INFO_NOT_FOUND - connector._get_req_socket.return_value = mock_socket - - result = MooncakeTransferEngineConnector._query_metadata_at( - connector, - "test_key@s0_s1", - "10.0.0.1", - 50151, - ) - assert result is None - - -# ── Merge / slice hooks ────────────────────────────────────────────── - - -class TestMergeSliceHooks: - def test_single_shard_passes_through(self): - payload = {"layer_blocks": {"key_cache": [1]}} - assert merge_received_rank_shards([payload]) == payload - - def test_default_merger_concats_head_dim(self): - p0 = _make_payload([0.0]) - p1 = _make_payload([1.0]) - result = merge_received_rank_shards([p0, p1]) - key_cache = result["layer_blocks"]["key_cache"][0] - value_cache = result["layer_blocks"]["value_cache"][0] - assert key_cache.shape == (2, 2, 1) - assert value_cache.shape == (2, 2, 1) - assert torch.equal(key_cache[:, :, 0], torch.tensor([[0.0, 1.0], [0.0, 1.0]])) - assert torch.equal(value_cache[:, :, 0], torch.tensor([[100.0, 101.0], [100.0, 101.0]])) - - def test_custom_merger_hook_called(self): - merged = {"merged": True} - assert merge_received_rank_shards([{}, {}], merger=lambda payloads: merged) == merged - - def test_slicer_hook_called(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) - sliced = {"sliced": True} - assert slice_received_rank_shard({"full": True}, topo, slicer=lambda payload: sliced) == sliced 
- - def test_default_slicer_extracts_rank_local_heads(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) - payload = _make_payload([0.0, 1.0]) - result = slice_received_rank_shard(payload, topo) - key_cache = result["layer_blocks"]["key_cache"][0] - value_cache = result["layer_blocks"]["value_cache"][0] - assert key_cache.shape == (2, 1, 1) - assert value_cache.shape == (2, 1, 1) - assert torch.equal(key_cache[:, :, 0], torch.tensor([[1.0], [1.0]])) - assert torch.equal(value_cache[:, :, 0], torch.tensor([[101.0], [101.0]])) - - def test_presliced_payload_is_not_sliced_twice(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) - payload = _make_payload([1.0]) - payload["metadata"]["tp_head_slice"] = {"applied": True, "target_rank": 1} - result = slice_received_rank_shard(payload, topo) - assert result is payload - - def test_round_trip_merge_from_tp4_to_tp2(self): - topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) - source_ranks = get_kv_source_ranks(topo) - payloads = [_make_payload([float(rank)]) for rank in source_ranks] - result = merge_received_rank_shards(payloads) - key_cache = result["layer_blocks"]["key_cache"][0] - assert torch.equal(key_cache[:, :, 0], torch.tensor([[2.0, 3.0], [2.0, 3.0]])) - - def test_round_trip_slice_from_tp2_to_tp4(self): - topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=3) - payload = _make_payload([2.0, 3.0]) - result = slice_received_rank_shard(payload, topo) - key_cache = result["layer_blocks"]["key_cache"][0] - assert torch.equal(key_cache[:, :, 0], torch.tensor([[3.0], [3.0]])) - - -class TestSenderSideSlicing: - def test_transfer_slices_before_sending_to_multiple_targets(self): - mgr = _make_manager( - from_tp=2, - to_tp=4, - local_rank=0, - need_send=True, - need_recv=False, - ) - sent_payloads = [] - - class _Connector: - supports_raw_data = False - - def put(self, from_stage, to_stage, put_key, data): - 
sent_payloads.append((put_key, KVCacheTransferData.from_bytes(data))) - return True, len(data), {} - - mgr._connector = _Connector() - mgr._transfer_kv_cache(_make_transfer_data([0.0, 1.0]), "req-1") - - assert [key for key, _ in sent_payloads] == ["req-1_stage0_0_0_0", "req-1_stage0_0_0_1"] - assert sent_payloads[0][1]["layer_blocks"]["key_cache"][0].shape == (2, 1, 1) - assert sent_payloads[1][1]["layer_blocks"]["key_cache"][0].shape == (2, 1, 1) - assert torch.equal( - sent_payloads[0][1]["layer_blocks"]["key_cache"][0][:, :, 0], - torch.tensor([[0.0], [0.0]]), - ) - assert torch.equal( - sent_payloads[1][1]["layer_blocks"]["key_cache"][0][:, :, 0], - torch.tensor([[1.0], [1.0]]), - ) - assert sent_payloads[0][1]["metadata"]["tp_head_slice"]["target_rank"] == 0 - assert sent_payloads[1][1]["metadata"]["tp_head_slice"]["target_rank"] == 1 - - -class _MockBroadcastGroup: - def __init__(self, world_size: int, rank_in_group: int, broadcast_value=None, recv_value=None): - self.world_size = world_size - self.rank_in_group = rank_in_group - self.broadcast_value = broadcast_value - self.recv_value = recv_value - self.broadcast_calls = [] - self.send_calls = [] - self.recv_calls = [] - self.shm_broadcaster = None - - def broadcast_object(self, obj=None, src: int = 0): - self.broadcast_calls.append((obj, src)) - return self.broadcast_value if self.broadcast_value is not None else obj - - def send_object(self, obj, dst: int): - self.send_calls.append((dst, obj)) - - def recv_object(self, src: int): - self.recv_calls.append(src) - return self.recv_value - - -class TestDistributedReceive: - def test_tp_cfg_leader_receives_then_sends_branch_local_payloads(self): - mgr = _make_manager(from_tp=2, to_tp=4, local_rank=0) - req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) - world_group = _MockBroadcastGroup(world_size=4, rank_in_group=2) - cfg_group = _MockBroadcastGroup(world_size=3, rank_in_group=0) - - def _receive(req_obj, cfg_func, target_device): 
- req_obj.past_key_values = SimpleNamespace(key_cache=[torch.tensor([1.0])]) - req_obj.kv_metadata = {"source": "leader"} - req_obj.sampling_params.past_key_values = req_obj.past_key_values - req_obj.sampling_params.kv_metadata = req_obj.kv_metadata - req_obj.sampling_params.cfg_text_past_key_values = SimpleNamespace(key_cache=[torch.tensor([2.0])]) - req_obj.sampling_params.cfg_text_kv_metadata = {"source": "cfg_text"} - req_obj.sampling_params.cfg_img_past_key_values = SimpleNamespace(key_cache=[torch.tensor([3.0])]) - req_obj.sampling_params.cfg_img_kv_metadata = {"source": "cfg_img"} - return True - - mgr.receive_multi_kv_cache = MagicMock(side_effect=_receive) - with ( - patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group), - patch( - "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_world_size", - return_value=3, - ), - patch( - "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_rank", - return_value=0, - ), - patch("vllm_omni.diffusion.distributed.parallel_state.get_cfg_group", return_value=cfg_group), - ): - assert mgr.receive_multi_kv_cache_distributed(req) is True - - mgr.receive_multi_kv_cache.assert_called_once() - assert mgr.receive_multi_kv_cache.call_args.args[2] == torch.device("cpu") - assert req.kv_metadata == {"source": "leader"} - assert cfg_group.broadcast_calls == [] - assert [dst for dst, _ in cfg_group.send_calls] == [1, 2] - rank1_payload = cfg_group.send_calls[0][1] - rank2_payload = cfg_group.send_calls[1][1] - assert torch.equal(rank1_payload["past_key_values"].key_cache[0], torch.tensor([1.0])) - assert torch.equal(rank2_payload["past_key_values"].key_cache[0], torch.tensor([1.0])) - assert rank1_payload["sp.cfg_active_branch"] == "cfg_text" - assert rank2_payload["sp.cfg_active_branch"] == "cfg_img" - assert rank1_payload["sp.cfg_branch_roles"] == ["cfg_text", "cfg_img"] - assert rank2_payload["sp.cfg_branch_roles"] == ["cfg_text", 
"cfg_img"] - assert "sp.cfg_branch_past_key_values" in rank1_payload - assert "sp.cfg_branch_past_key_values" in rank2_payload - assert list(rank1_payload["sp.cfg_branch_past_key_values"].keys()) == ["cfg_text"] - assert list(rank2_payload["sp.cfg_branch_past_key_values"].keys()) == ["cfg_img"] - assert "sp.cfg_text_past_key_values" in rank1_payload - assert "sp.cfg_img_past_key_values" not in rank1_payload - assert "sp.cfg_img_past_key_values" in rank2_payload - assert "sp.cfg_text_past_key_values" not in rank2_payload - - def test_tp_cfg_follower_receives_local_payload_without_receiving(self): - mgr = _make_manager(from_tp=2, to_tp=4, local_rank=1) - req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) - world_group = _MockBroadcastGroup(world_size=4, rank_in_group=3) - cfg_payload = { - "past_key_values": SimpleNamespace(key_cache=[torch.tensor([1.0])]), - "kv_metadata": {"source": "main"}, - "sp.past_key_values": SimpleNamespace(key_cache=[torch.tensor([1.0])]), - "sp.kv_metadata": {"source": "main"}, - "sp.cfg_active_branch": "cfg_text", - "sp.cfg_branch_roles": ["cfg_text", "cfg_img"], - "sp.cfg_branch_past_key_values": { - "cfg_text": SimpleNamespace(key_cache=[torch.tensor([2.0])]), - }, - "sp.cfg_branch_kv_metadata": {"cfg_text": {"source": "cfg-text"}}, - "sp.cfg_text_past_key_values": SimpleNamespace(key_cache=[torch.tensor([2.0])]), - } - cfg_group = _MockBroadcastGroup(world_size=2, rank_in_group=1, recv_value=cfg_payload) - - mgr.receive_multi_kv_cache = MagicMock(return_value=True) - with ( - patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group), - patch( - "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_world_size", - return_value=2, - ), - patch( - "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_rank", - return_value=1, - ), - patch("vllm_omni.diffusion.distributed.parallel_state.get_cfg_group", return_value=cfg_group), - ): 
- assert mgr.receive_multi_kv_cache_distributed(req) is True - - mgr.receive_multi_kv_cache.assert_not_called() - assert req.kv_metadata == {"source": "main"} - assert torch.equal(req.past_key_values.key_cache[0], torch.tensor([1.0])) - assert torch.equal(req.sampling_params.past_key_values.key_cache[0], torch.tensor([1.0])) - assert req.sampling_params.cfg_active_branch == "cfg_text" - assert req.sampling_params.cfg_branch_roles == ["cfg_text", "cfg_img"] - assert torch.equal( - req.sampling_params.cfg_branch_past_key_values["cfg_text"].key_cache[0], - torch.tensor([2.0]), - ) - assert req.sampling_params.cfg_branch_kv_metadata == {"cfg_text": {"source": "cfg-text"}} - assert torch.equal(req.sampling_params.cfg_text_past_key_values.key_cache[0], torch.tensor([2.0])) - assert cfg_group.broadcast_calls == [] - assert cfg_group.recv_calls == [0] - - def test_tp_without_cfg_keeps_independent_receive_path(self): - mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1) - req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) - world_group = _MockBroadcastGroup(world_size=2, rank_in_group=1) - mgr.receive_multi_kv_cache = MagicMock(return_value=True) - - with patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group): - assert mgr.receive_multi_kv_cache_distributed(req, target_device=torch.device("cpu")) is True - - mgr.receive_multi_kv_cache.assert_called_once_with(req, None, torch.device("cpu")) - - -# ── TP auto-detect ─────────────────────────────────────────────────── - - -class TestAutoDetectTP: - def test_auto_detect_when_config_defaults(self): - """When config from_tp/to_tp == 1 (default), manager should auto-detect.""" - config = OmniKVCacheConfig( - connector_config={"type": "mock"}, - from_stage="s0", - stage_id="s1", - need_recv_cache=True, - ) - with ( - patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), - 
patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=4), - ): - mgr = OmniKVTransferManager(config) - assert mgr._tp_topo.source_tp_size == 4 - assert mgr._tp_topo.target_tp_size == 4 - - def test_explicit_tp_overrides_auto_detect(self): - config = OmniKVCacheConfig( - connector_config={"type": "mock"}, - from_stage="s0", - stage_id="s1", - need_recv_cache=True, - from_tp=2, - to_tp=4, - ) - with ( - patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), - patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=8), - ): - mgr = OmniKVTransferManager(config) - assert mgr._tp_topo.source_tp_size == 2 - assert mgr._tp_topo.target_tp_size == 4 diff --git a/tests/distributed/omni_coordinator/test_load_balancer.py b/tests/distributed/omni_coordinator/test_load_balancer.py index 8350b33d396..c54d2489402 100644 --- a/tests/distributed/omni_coordinator/test_load_balancer.py +++ b/tests/distributed/omni_coordinator/test_load_balancer.py @@ -3,18 +3,12 @@ from time import time -import pytest - from vllm_omni.distributed.omni_coordinator import ( InstanceInfo, - LeastQueueLengthBalancer, RandomBalancer, - RoundRobinBalancer, StageStatus, ) -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - def test_load_balancer_select_returns_valid_index(): """Verify RandomBalancer.select() returns a valid index for instances.""" @@ -62,173 +56,3 @@ def test_load_balancer_select_returns_valid_index(): assert isinstance(index, int) assert 0 <= index < len(instances) - - -def test_round_robin_balancer_cycles_instances(): - now = time() - instances = [ - InstanceInfo( - input_addr="tcp://host:10001", - output_addr="tcp://host:10001-out", - stage_id=0, - status=StageStatus.UP, - queue_length=2, - last_heartbeat=now, - registered_at=now, - ), - InstanceInfo( - input_addr="tcp://host:10002", - output_addr="tcp://host:10002-out", - stage_id=0, - 
status=StageStatus.UP, - queue_length=1, - last_heartbeat=now, - registered_at=now, - ), - InstanceInfo( - input_addr="tcp://host:10003", - output_addr="tcp://host:10003-out", - stage_id=1, - status=StageStatus.UP, - queue_length=0, - last_heartbeat=now, - registered_at=now, - ), - ] - - balancer = RoundRobinBalancer() - results = [balancer.select({}, instances) for _ in range(5)] - - # Default start_index=0 => 0,1,2,0,1 - assert results == [0, 1, 2, 0, 1] - - -def test_round_robin_balancer_empty_instances_raises(): - with pytest.raises(ValueError, match="instances must not be empty"): - RoundRobinBalancer().select({}, []) - - -def test_round_robin_balancer_after_large_index_and_shorter_list(): - """Large start_index % len(instances) then counter wraps with shorter list.""" - now = time() - two = [ - InstanceInfo( - input_addr="tcp://host:10001", - output_addr="tcp://host:10001-out", - stage_id=0, - status=StageStatus.UP, - queue_length=0, - last_heartbeat=now, - registered_at=now, - ), - InstanceInfo( - input_addr="tcp://host:10002", - output_addr="tcp://host:10002-out", - stage_id=0, - status=StageStatus.UP, - queue_length=0, - last_heartbeat=now, - registered_at=now, - ), - ] - balancer = RoundRobinBalancer(start_index=7) - assert balancer.select({}, two) == 1 # 7 % 2 - assert balancer.select({}, two) == 0 # next index wrapped to 0 - - -def test_least_queue_length_balancer_picks_min_queue(): - now = time() - instances = [ - InstanceInfo( - input_addr="tcp://host:10001", - output_addr="tcp://host:10001-out", - stage_id=0, - status=StageStatus.UP, - queue_length=2, - last_heartbeat=now, - registered_at=now, - ), - InstanceInfo( - input_addr="tcp://host:10002", - output_addr="tcp://host:10002-out", - stage_id=0, - status=StageStatus.UP, - queue_length=0, - last_heartbeat=now, - registered_at=now, - ), - InstanceInfo( - input_addr="tcp://host:10003", - output_addr="tcp://host:10003-out", - stage_id=1, - status=StageStatus.UP, - queue_length=5, - last_heartbeat=now, 
- registered_at=now, - ), - ] - - balancer = LeastQueueLengthBalancer() - index = balancer.select({}, instances) - assert index == 1 - - -def test_least_queue_length_balancer_empty_instances_raises(): - with pytest.raises(ValueError, match="instances must not be empty"): - LeastQueueLengthBalancer().select({}, []) - - -def test_least_queue_length_balancer_equal_queues_uses_choice(mocker): - now = time() - instances = [ - InstanceInfo( - input_addr="tcp://host:10001", - output_addr="tcp://host:10001-out", - stage_id=0, - status=StageStatus.UP, - queue_length=3, - last_heartbeat=now, - registered_at=now, - ), - InstanceInfo( - input_addr="tcp://host:10002", - output_addr="tcp://host:10002-out", - stage_id=0, - status=StageStatus.UP, - queue_length=3, - last_heartbeat=now, - registered_at=now, - ), - InstanceInfo( - input_addr="tcp://host:10003", - output_addr="tcp://host:10003-out", - stage_id=1, - status=StageStatus.UP, - queue_length=3, - last_heartbeat=now, - registered_at=now, - ), - ] - balancer = LeastQueueLengthBalancer() - mocker.patch( - "vllm_omni.distributed.omni_coordinator.load_balancer.random.choice", - return_value=2, - ) - assert balancer.select({}, instances) == 2 - - -def test_least_queue_length_balancer_negative_queue_raises(): - now = time() - instances = [ - InstanceInfo( - input_addr="tcp://host:10001", - output_addr="tcp://host:10001-out", - stage_id=0, - status=StageStatus.UP, - queue_length=-1, - last_heartbeat=now, - registered_at=now, - ), - ] - with pytest.raises(ValueError, match="queue_length must be non-negative"): - LeastQueueLengthBalancer().select({}, instances) diff --git a/tests/distributed/omni_coordinator/test_omni_coord_client_for_hub.py b/tests/distributed/omni_coordinator/test_omni_coord_client_for_hub.py index 2fbd7c85bf8..24b3319232d 100644 --- a/tests/distributed/omni_coordinator/test_omni_coord_client_for_hub.py +++ b/tests/distributed/omni_coordinator/test_omni_coord_client_for_hub.py @@ -12,8 +12,6 @@ 
OmniCoordClientForHub, ) -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - def _bind_pub() -> tuple[zmq.Context, zmq.Socket, str]: ctx = zmq.Context.instance() diff --git a/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py b/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py index 0ba19c7fff7..b74a48f49cd 100644 --- a/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py +++ b/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py @@ -2,20 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json -import threading -import pytest import zmq from vllm_omni.distributed.omni_coordinator import ( OmniCoordClientForStage, StageStatus, ) -from vllm_omni.distributed.omni_coordinator import ( - omni_coord_client_for_stage as stage_client_module, -) - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] def _bind_router() -> tuple[zmq.Context, zmq.Socket, str]: @@ -26,8 +19,7 @@ def _bind_router() -> tuple[zmq.Context, zmq.Socket, str]: return ctx, router, endpoint -def _recv_event(router: zmq.Socket, timeout_ms: int = 2000) -> dict: - assert router.poll(timeout=timeout_ms) != 0, "Timed out waiting for coordinator event" +def _recv_event(router: zmq.Socket) -> dict: frames = router.recv_multipart() # ROUTER adds identity frame; the last frame is the payload. payload = frames[-1] @@ -116,197 +108,3 @@ def test_stage_client_close_sends_down_status(): router.close(0) ctx.term() - - -def test_stage_client_reconnects_after_send_failure(mocker): - """Verify send failure path invokes reconnect before retrying send.""" - ctx, router, endpoint = _bind_router() - - client = OmniCoordClientForStage( - endpoint, - "tcp://stage:reconnect-in", - "tcp://stage:reconnect-out", - 0, - ) - - # Discard initial registration event from the real socket. 
- _recv_event(router) - - class _FlakySocket: - def __init__(self): - self.send_calls = 0 - self.closed = False - - def send(self, *_args, **_kwargs): - self.send_calls += 1 - if self.send_calls == 1: - raise RuntimeError("simulated send failure") - - def close(self, *_args, **_kwargs): - self.closed = True - - flaky_socket = _FlakySocket() - client._socket = flaky_socket - client._reconnect = mocker.Mock(return_value=True) - - client.update_info(queue_length=1) - - client._reconnect.assert_called_once_with(max_retries=3) - assert flaky_socket.send_calls == 2 - - client.close() - router.close(0) - ctx.term() - - -def test_stage_client_raises_when_reconnect_fails(mocker): - """Verify send failure is propagated when reconnect cannot recover.""" - ctx, router, endpoint = _bind_router() - - client = OmniCoordClientForStage( - endpoint, - "tcp://stage:reconnect-fail-in", - "tcp://stage:reconnect-fail-out", - 0, - ) - - # Discard initial registration event from the real socket. - _recv_event(router) - - class _AlwaysFailSocket: - def send(self, *_args, **_kwargs): - raise RuntimeError("simulated send failure") - - def close(self, *_args, **_kwargs): - pass - - client._socket = _AlwaysFailSocket() - client._reconnect = mocker.Mock(return_value=False) - - with pytest.raises(RuntimeError, match="simulated send failure"): - client.update_info(queue_length=2) - - client._reconnect.assert_called_once_with(max_retries=3) - client.close() - router.close(0) - ctx.term() - - -def test_stage_client_close_handles_runtime_error_in_final_update(mocker): - """Verify close() still releases resources when final update raises RuntimeError.""" - ctx, router, endpoint = _bind_router() - - client = OmniCoordClientForStage( - endpoint, - "tcp://stage:close-runtime-in", - "tcp://stage:close-runtime-out", - 0, - ) - - # Discard initial registration event from the real socket. 
- _recv_event(router) - - client._send_event = mocker.Mock(side_effect=RuntimeError("simulated close-time failure")) - client.close() - - assert client._closed - assert client._socket.closed - - router.close(0) - ctx.term() - - -def test_reconnect_respects_retry_limit(monkeypatch): - """Verify _reconnect stops after max_retries on repeated failures.""" - attempts = {"connect": 0} - - class _FailSocket: - def close(self, *_args, **_kwargs): - pass - - def connect(self, *_args, **_kwargs): - attempts["connect"] += 1 - raise zmq.ZMQError("simulated reconnect failure") - - class _FailContext: - def socket(self, *_args, **_kwargs): - return _FailSocket() - - def term(self): - pass - - client = OmniCoordClientForStage.__new__(OmniCoordClientForStage) - client._closed = False - client._coord_zmq_addr = "tcp://127.0.0.1:9999" - client._stop_event = threading.Event() - client._send_lock = threading.RLock() - client._socket = _FailSocket() - client._ctx = _FailContext() - - monkeypatch.setattr(stage_client_module.zmq, "Context", lambda: _FailContext()) - monkeypatch.setattr(stage_client_module.time, "sleep", lambda *_args, **_kwargs: None) - - assert client._reconnect(max_retries=3, retry_interval=5.0) is False - assert attempts["connect"] == 3 - - -def test_heartbeat_loop_retries_after_transient_send_failure(): - """Verify heartbeat loop continues after one transient send failure.""" - - class _FakeStopEvent: - def __init__(self): - self.wait_calls = 0 - self._set = False - - def wait(self, timeout=None): - _ = timeout - self.wait_calls += 1 - # Run two loop iterations, then stop. 
- return self._set or self.wait_calls >= 3 - - def is_set(self): - return self._set - - def set(self): - self._set = True - - client = OmniCoordClientForStage.__new__(OmniCoordClientForStage) - client._closed = False - client._heartbeat_interval = 0.0 - client._stop_event = _FakeStopEvent() - - calls = {"count": 0} - - def _fake_send(event_type): - assert event_type == "heartbeat" - calls["count"] += 1 - if calls["count"] == 1: - raise RuntimeError("transient heartbeat failure") - - client._send_event = _fake_send - - client._heartbeat_loop() - - assert calls["count"] == 2 - - -def test_update_info_rejected_while_closing(): - """Verify update_info is rejected once client enters closing state.""" - ctx, router, endpoint = _bind_router() - - client = OmniCoordClientForStage( - endpoint, - "tcp://stage:closing-in", - "tcp://stage:closing-out", - 0, - ) - _recv_event(router) - - client._closing = True - with pytest.raises(RuntimeError, match="closing"): - client.update_info(queue_length=3) - - client._closing = False - client.close() - router.close(0) - ctx.term() diff --git a/tests/distributed/omni_coordinator/test_omni_coordinator.py b/tests/distributed/omni_coordinator/test_omni_coordinator.py index eff3d429e40..0c68e61bb11 100644 --- a/tests/distributed/omni_coordinator/test_omni_coordinator.py +++ b/tests/distributed/omni_coordinator/test_omni_coordinator.py @@ -4,7 +4,6 @@ import json import time -import pytest import zmq from vllm.v1.utils import get_engine_client_zmq_addr @@ -14,8 +13,6 @@ StageStatus, ) -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - def _recv_instance_list(sub: zmq.Socket, timeout_ms: int = 2000) -> dict | None: """Receive InstanceList JSON from SUB socket. 
Returns None on timeout.""" @@ -41,74 +38,6 @@ def _wait_for_instance_list( return None -def _drain_sub_messages(sub: zmq.Socket, max_seconds: float = 0.4) -> None: - """Drain queued SUB messages for a short window.""" - deadline = time.time() + max_seconds - while time.time() < deadline: - _recv_instance_list(sub, timeout_ms=50) - - -def test_omni_coordinator_pub_coalescing_on_rapid_queue_updates(): - """Rapid updates should be coalesced into fewer PUB messages.""" - router_addr = get_engine_client_zmq_addr( - local_only=False, - host="127.0.0.1", - port=0, - ) - pub_addr = get_engine_client_zmq_addr( - local_only=False, - host="127.0.0.1", - port=0, - ) - coordinator = OmniCoordinator( - router_zmq_addr=router_addr, - pub_zmq_addr=pub_addr, - heartbeat_timeout=1000.0, - ) - - sub_ctx = zmq.Context.instance() - sub = sub_ctx.socket(zmq.SUB) - sub.connect(pub_addr) - sub.setsockopt(zmq.SUBSCRIBE, b"") - - time.sleep(0.3) # PUB/SUB slow-joiner - - client = OmniCoordClientForStage( - router_addr, - "tcp://stage:coalesce", - "tcp://stage:coalesce-out", - 0, - ) - - # Wait for initial registration broadcast and clear any queued messages. - msg = _wait_for_instance_list(sub, expected_count=1) - assert msg is not None - _drain_sub_messages(sub) - - # Burst many queue updates in a short period. - update_count = 80 - for i in range(update_count): - client.update_info(queue_length=i) - - # With publish_min_interval=0.1s, received messages over ~1s should be - # much smaller than update_count (coalescing effect). 
- window_s = 1.1 - deadline = time.time() + window_s - recv_count = 0 - while time.time() < deadline: - if _recv_instance_list(sub, timeout_ms=100) is not None: - recv_count += 1 - - assert recv_count < update_count // 2, ( - f"expected coalesced PUB traffic, got {recv_count} for {update_count} updates" - ) - - client.close() - coordinator.close() - sub.close(0) - sub_ctx.term() - - def test_omni_coordinator_registration_broadcast(): """Verify that after multiple OmniCoordClientForStage instances register, OmniCoordinator publishes an InstanceList containing all registered instances. diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py index 709fdf345ec..0a81b02075b 100644 --- a/tests/e2e/accuracy/conftest.py +++ b/tests/e2e/accuracy/conftest.py @@ -1,18 +1,16 @@ from __future__ import annotations import os +import shutil import subprocess from contextlib import contextmanager from dataclasses import dataclass -from io import BytesIO from pathlib import Path import pytest -import requests import torch -from PIL import Image -from tests.helpers.runtime import OmniServer, OmniServerParams +from tests.conftest import OmniServer, OmniServerParams def pytest_addoption(parser): @@ -116,8 +114,8 @@ def generate_server(self): params = self.generate_params model = self.model_prefix + params.model server_args = params.server_args or [] - if params.use_omni and params.stage_init_timeout is not None: - server_args = ["--stage-init-timeout", str(params.stage_init_timeout), *server_args] + if params.use_omni: + server_args = ["--stage-init-timeout", "120", *server_args] with OmniServer( model, server_args, @@ -185,26 +183,16 @@ def accuracy_artifact_root() -> Path: return root -@pytest.fixture(scope="session") -def qwen_bear_image(accuracy_artifact_root: Path) -> Image.Image: - """Download the Qwen bear image from the URL and save it to the accuracy artifact root.""" - QWEN_BEAR_IMAGE_URL = 
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/omni-assets/qwen-bear.png" - response = requests.get(QWEN_BEAR_IMAGE_URL, timeout=60) - response.raise_for_status() - image = Image.open(BytesIO(response.content)).convert("RGB") - image.save(accuracy_artifact_root / "qwen_bear.png") - return image +def reset_artifact_dir(path: Path) -> Path: + if path.exists(): + shutil.rmtree(path) + path.mkdir(parents=True, exist_ok=True) + return path -@pytest.fixture(scope="session") -def rabbit_image(accuracy_artifact_root: Path) -> Image.Image: - """Download the rabbit image from the URL and save it to the accuracy artifact root.""" - RABBIT_IMAGE_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/omni-assets/rabbit.png" - response = requests.get(RABBIT_IMAGE_URL, timeout=60) - response.raise_for_status() - image = Image.open(BytesIO(response.content)).convert("RGB") - image.save(accuracy_artifact_root / "rabbit.png") - return image +def infer_model_label(model: str) -> str: + label = Path(model.rstrip("/\\")).name or "model" + return "".join(char if char.isalnum() or char in {"-", "_"} else "_" for char in label) def _build_accuracy_server_config( @@ -238,7 +226,6 @@ def _build_accuracy_server_config( server_args=generate_server_args, env_dict={"CUDA_VISIBLE_DEVICES": shared_gpu}, use_omni=True, - stage_init_timeout=300, ), judge_params=OmniServerParams( model=judge_model, diff --git a/tests/e2e/accuracy/helpers.py b/tests/e2e/accuracy/helpers.py deleted file mode 100644 index 382d3ea9b5f..00000000000 --- a/tests/e2e/accuracy/helpers.py +++ /dev/null @@ -1,115 +0,0 @@ -from pathlib import Path - -import numpy as np -import pytest -import torch -from PIL import Image -from torchmetrics.image import PeakSignalNoiseRatio, StructuralSimilarityIndexMeasure - - -def reset_artifact_dir(path: Path) -> Path: - import shutil - - if path.exists(): - shutil.rmtree(path) - path.mkdir(parents=True, exist_ok=True) - return path - - -def infer_model_label(model: str) -> str: 
- label = Path(model.rstrip("/\\")).name or "model" - return "".join(char if char.isalnum() or char in {"-", "_"} else "_" for char in label) - - -def model_output_dir(parent_dir: Path, model: str) -> Path: - safe_model_name = model.split("/")[-1].replace(".", "_") - path = parent_dir / safe_model_name - path.mkdir(parents=True, exist_ok=True) - return path - - -def assert_similarity( - *, - model_name: str, - vllm_image: Image.Image, - diffusers_image: Image.Image, - ssim_threshold: float, - psnr_threshold: float, - width: int | None = None, - height: int | None = None, - compare_mode: str = "RGB", -) -> None: - requested_size = (width, height) if width is not None and height is not None else None - if requested_size is not None and diffusers_image.size != requested_size: - pytest.skip( - "Skipping as diffusers baseline output is corrupt and not comparable: " - f"dimensions do not match requested size; requested={requested_size}, got={diffusers_image.size}." - ) - - assert vllm_image.size == diffusers_image.size, ( - f"Online and diffusers output sizes mismatch: online={vllm_image.size}, diffusers={diffusers_image.size}" - ) - - ssim_score, psnr_score = compute_image_ssim_psnr( - prediction=vllm_image, - reference=diffusers_image, - compare_mode=compare_mode, - ) - print(f"{model_name} similarity metrics:") - print(f" SSIM: value={ssim_score:.6f}, threshold>={ssim_threshold:.6f}, range=[-1, 1], higher_is_better=True") - print( - f" PSNR: value={psnr_score:.6f} dB, threshold>={psnr_threshold:.6f} dB, range=[0, +inf), higher_is_better=True" - ) - - assert ssim_score >= ssim_threshold, ( - f"SSIM below threshold for {model_name}: got {ssim_score:.6f}, expected >= {ssim_threshold:.6f}." - ) - assert psnr_score >= psnr_threshold, ( - f"PSNR below threshold for {model_name}: got {psnr_score:.6f}, expected >= {psnr_threshold:.6f}." 
- ) - - -def assert_image_sequence_similarity( - *, - model_name: str, - vllm_images: list[Image.Image], - diffusers_images: list[Image.Image], - ssim_threshold: float, - psnr_threshold: float, - compare_mode: str = "RGB", -) -> None: - assert len(vllm_images) == len(diffusers_images), ( - f"Output image count mismatch for {model_name}: online={len(vllm_images)}, diffusers={len(diffusers_images)}" - ) - for index, (vllm_image, diffusers_image) in enumerate(zip(vllm_images, diffusers_images, strict=True), start=1): - assert_similarity( - model_name=f"{model_name}[layer={index}]", - vllm_image=vllm_image, - diffusers_image=diffusers_image, - ssim_threshold=ssim_threshold, - psnr_threshold=psnr_threshold, - compare_mode=compare_mode, - ) - - -def compute_image_ssim_psnr( - *, - prediction: Image.Image, - reference: Image.Image, - compare_mode: str = "RGB", -) -> tuple[float, float]: - pred_tensor = _pil_to_batched_tensor(prediction, compare_mode=compare_mode) - ref_tensor = _pil_to_batched_tensor(reference, compare_mode=compare_mode) - - ssim_metric = StructuralSimilarityIndexMeasure(data_range=1.0) - psnr_metric = PeakSignalNoiseRatio(data_range=1.0) - - ssim_value = float(ssim_metric(pred_tensor, ref_tensor).item()) - psnr_value = float(psnr_metric(pred_tensor, ref_tensor).item()) - return ssim_value, psnr_value - - -def _pil_to_batched_tensor(image: Image.Image, *, compare_mode: str) -> torch.Tensor: - array = np.asarray(image.convert(compare_mode), dtype=np.float32) / 255.0 - tensor = torch.from_numpy(array).permute(2, 0, 1).unsqueeze(0) - return tensor diff --git a/tests/e2e/accuracy/qwen3_omni/__init__.py b/tests/e2e/accuracy/qwen3_omni/__init__.py deleted file mode 100644 index 79a31c4f100..00000000000 --- a/tests/e2e/accuracy/qwen3_omni/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -"""Qwen3-Omni accuracy benchmarks (Daily-Omni / Seed-TTS ``vllm bench serve --omni``).""" diff --git 
a/tests/e2e/accuracy/qwen3_omni/qwen3_omni_acc_bench_core.py b/tests/e2e/accuracy/qwen3_omni/qwen3_omni_acc_bench_core.py deleted file mode 100644 index 2ce86d504f0..00000000000 --- a/tests/e2e/accuracy/qwen3_omni/qwen3_omni_acc_bench_core.py +++ /dev/null @@ -1,201 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -"""Shared helpers for Qwen3-Omni Daily-Omni / Seed-TTS ``vllm bench serve --omni`` accuracy runs. - -Local dataset paths are **optional**. When ``VLLM_DAILY_OMNI_QA_JSON`` + ``VLLM_DAILY_OMNI_VIDEO_DIR`` -point to existing files, those are used with inline video. Otherwise the benchmark falls back to -the HuggingFace dataset id (``liarliar/Daily-Omni``); QA loads via ``datasets``, and the first -bench request that needs media downloads ``Videos.tar`` from the Hub when no video dir is set. - -Similarly for Seed-TTS: a local directory wins; otherwise ``--dataset-path`` uses the Hub id -and ``huggingface_hub.snapshot_download`` inside ``resolve_seed_tts_root`` pulls files on demand. - -Use :func:`build_acc_benchmark_cli_argv` to assemble ``argv`` for a live Omni server (host/port/model -and small bench defaults) before ``parse_args`` / ``run_acc_benchmark`` in the accuracy driver. -""" - -from __future__ import annotations - -import json -import os -import shutil -import subprocess -from pathlib import Path -from typing import Any, Protocol - -DEFAULT_DAILY_OMNI_HF_REPO = "liarliar/Daily-Omni" -DEFAULT_SEED_TTS_HF_REPO = "zhaochenyang20/seed-tts-eval" - - -class OmniBenchServerEndpoint(Protocol): - """Anything with ``host`` / ``port`` / ``model`` (e.g. :class:`tests.conftest.OmniServer`).""" - - host: str - port: int - model: str - - -def build_acc_benchmark_cli_argv( - server: OmniBenchServerEndpoint, - *, - skip_seed: bool, - skip_daily: bool, - num_prompts: int | None = None, - max_concurrency: int | None = None, -) -> list[str]: - """Prefix argv for :func:`run_qwen_omni_acc_benchmark.parse_acc_benchmark_args` + :func:`run_acc_benchmark`. 
- - Wires ``--host`` / ``--port`` / ``--model`` to a running Omni OpenAI server, sets small - ``--num-prompts`` / ``--max-concurrency`` defaults (overridable via ``ACC_BENCH_NUM_PROMPTS`` / - ``ACC_BENCH_MAX_CONCURRENCY``), and when Daily-Omni runs adds ``--daily-omni-repo`` so Hub QA - matches :func:`daily_omni_bench_argv` once ``run_acc_benchmark`` mirrors ``--daily-omni-repo`` into env. - """ - n_prompts = int(os.environ.get("ACC_BENCH_NUM_PROMPTS", "2000")) if num_prompts is None else int(num_prompts) - n_conc = int(os.environ.get("ACC_BENCH_MAX_CONCURRENCY", "10")) if max_concurrency is None else int(max_concurrency) - argv = [ - "--host", - server.host, - "--port", - str(server.port), - "--model", - server.model, - "--num-prompts", - str(n_prompts), - "--max-concurrency", - str(n_conc), - ] - if not skip_daily: - repo = os.environ.get("VLLM_DAILY_OMNI_REPO", DEFAULT_DAILY_OMNI_HF_REPO).strip() or DEFAULT_DAILY_OMNI_HF_REPO - argv.extend(["--daily-omni-repo", repo]) - if skip_seed: - argv.append("--skip-seed-tts") - if skip_daily: - argv.append("--skip-daily-omni") - return argv - - -def daily_omni_bench_argv() -> list[str]: - """CLI args for Daily-Omni (after ``vllm bench serve --omni``).""" - qa = os.environ.get("VLLM_DAILY_OMNI_QA_JSON", "").strip() - vd = os.environ.get("VLLM_DAILY_OMNI_VIDEO_DIR", "").strip() - if qa and vd: - qap = Path(qa).expanduser() - vdp = Path(vd).expanduser() - if qap.is_file() and vdp.is_dir(): - return [ - "--dataset-name", - "daily-omni", - "--daily-omni-qa-json", - str(qap.resolve()), - "--daily-omni-video-dir", - str(vdp.resolve()), - "--daily-omni-inline-local-video", - ] - repo = os.environ.get("VLLM_DAILY_OMNI_REPO", DEFAULT_DAILY_OMNI_HF_REPO).strip() or DEFAULT_DAILY_OMNI_HF_REPO - return [ - "--dataset-name", - "daily-omni", - "--dataset-path", - repo, - ] - - -def seed_tts_bench_argv(*, locale: str = "en") -> list[str]: - """CLI args for Seed-TTS (after ``vllm bench serve --omni``).""" - dp = 
os.environ.get("VLLM_SEED_TTS_DATASET_PATH", "").strip() - if dp: - p = Path(dp).expanduser() - # Preserve Hugging Face repo ids verbatim. Only canonicalize to an - # absolute path when the value actually exists as a local directory. - dataset_path = str(p.resolve()) if p.exists() and p.is_dir() else dp - else: - dataset_path = ( - os.environ.get("VLLM_SEED_TTS_REPO", DEFAULT_SEED_TTS_HF_REPO).strip() or DEFAULT_SEED_TTS_HF_REPO - ) - out = ["--dataset-name", "seed-tts", "--dataset-path", dataset_path] - root = os.environ.get("SEED_TTS_ROOT", "").strip() - if root: - out.extend(["--seed-tts-root", str(Path(root).expanduser().resolve())]) - out.extend(["--seed-tts-locale", locale]) - return out - - -def find_vllm_cli() -> str: - exe = shutil.which("vllm") - if not exe: - raise FileNotFoundError("Could not find `vllm` on PATH (install vLLM-Omni with CLI entrypoints).") - return exe - - -def run_vllm_bench_subprocess(vllm: str, argv: list[str], *, extra_env: dict[str, str] | None = None) -> None: - env = os.environ.copy() - if extra_env: - env.update(extra_env) - subprocess.run([vllm, *argv], env=env, check=True) - - -def load_benchmark_result(path: Path) -> dict[str, Any]: - with path.open(encoding="utf-8") as f: - return json.load(f) - - -def build_serve_common_argv( - *, - host: str, - port: int, - model: str, - num_prompts: int, - max_concurrency: int, - num_warmups: int, - percentile_metrics: str, - result_dir: Path, - result_filename: str, - ready_check_timeout_sec: int | None = None, -) -> list[str]: - out = [ - "bench", - "serve", - "--omni", - "--host", - host, - "--port", - str(port), - "--model", - model, - "--endpoint", - "/v1/chat/completions", - "--backend", - "openai-chat-omni", - "--request-rate", - "inf", - "--num-prompts", - str(num_prompts), - "--max-concurrency", - str(max_concurrency), - "--no-oversample", - "--num-warmups", - str(num_warmups), - "--percentile-metrics", - percentile_metrics, - "--save-result", - "--result-dir", - str(result_dir), 
- "--result-filename", - result_filename, - ] - if ready_check_timeout_sec is not None: - out.extend(["--ready-check-timeout-sec", str(int(ready_check_timeout_sec))]) - return out - - -def assert_daily_omni_scored(result: dict[str, Any]) -> None: - acc = result.get("daily_omni_accuracy") - assert acc is not None, "daily_omni_accuracy missing — wrong dataset or benchmark wiring" - assert int(result.get("daily_omni_evaluated_ok", 0) or 0) > 0, "no successful MCQ rows (daily_omni_evaluated_ok==0)" - - -def assert_seed_tts_scored(result: dict[str, Any]) -> None: - err = result.get("seed_tts_eval_setup_error") - assert not err, f"Seed-TTS eval deps/setup failed: {err}" - assert int(result.get("seed_tts_content_evaluated", 0) or 0) > 0, ( - "seed_tts_content_evaluated==0 — enable WER eval and check PCM capture / modalities" - ) diff --git a/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py b/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py deleted file mode 100644 index 7fb71b28d77..00000000000 --- a/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py +++ /dev/null @@ -1,428 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 -"""Accuracy (and light perf) checks for Qwen3-Omni via ``vllm bench serve --omni``. - -The standalone CLI uses small ``--num-prompts`` / ``--max-concurrency`` defaults suitable for -L4-style smoke runs against an already-running server. The pytest wrappers in -``tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py`` may still require larger GPUs (currently -H100 / MI325) because they launch the live Omni server inside the test. - -1. **Daily-Omni** — MCQ accuracy fields in the saved JSON (``daily_omni_accuracy``, …); by default the - run **fails** if accuracy is strictly below **0.69** (``--min-daily-omni-accuracy`` / ``ACC_BENCH_MIN_DAILY_OMNI_ACCURACY``). -2. 
**Seed-TTS** — ``seed-tts-eval``-style metrics when ``--seed-tts-wer-eval`` is used - (WER / SIM / UTMOS keys from :func:`compute_seed_tts_wer_metrics`). - -Prerequisites -------------- -* A running Omni OpenAI-compatible server (same machine or reachable host), e.g.:: - - vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8000 - - On L4 you may need a smaller checkpoint, quantization, or tighter engine flags; this script - only drives the **client** benchmark. - -* ``vllm`` CLI from **vLLM-Omni** (so ``bench serve`` registers ``daily-omni`` / ``seed-tts``). - -* **Daily-Omni** — if local ``qa.json`` + ``Videos/`` are not both provided (CLI or matching env), - the client passes ``--dataset-path`` with a Hub id (default ``liarliar/Daily-Omni``). The **child** - ``vllm bench serve`` process then loads QA via ``datasets.load_dataset`` (needs ``pip install datasets``, - network or HF cache). Without ``--daily-omni-video-dir``, the benchmark **lazily** downloads and - extracts ``Videos.tar`` from the Hub (``huggingface_hub``) on first multimodal request. Override - the dataset repo with ``--daily-omni-repo`` or ``VLLM_DAILY_OMNI_REPO``; override the tar repo - with ``VLLM_DAILY_OMNI_MEDIA_REPO`` if needed. 
- -* **Seed-TTS** optional extras for WER/SIM/UTMOS:: - - pip install 'vllm-omni[seed-tts-eval]' - -Examples --------- -Pytest (same checks; needs a running server):: - - pytest -sv tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py - -Smoke on localhost (server already up):: - - python tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py \\ - --model Qwen/Qwen3-Omni-30B-A3B-Instruct \\ - --daily-omni-qa-json ./qa.json \\ - --daily-omni-video-dir ./Videos \\ - --seed-tts-dataset-path ./seed-tts-eval - -Skip one suite, tighten gates:: - - python tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py \\ - --skip-daily-omni \\ - --max-seed-tts-mean-wer 0.35 \\ - --min-seed-tts-mean-sim 0.75 -""" - -from __future__ import annotations - -import argparse -import contextlib -import json -import os -import sys -from datetime import datetime -from pathlib import Path -from typing import Any - -from tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core import ( - build_serve_common_argv, - daily_omni_bench_argv, - find_vllm_cli, - load_benchmark_result, - run_vllm_bench_subprocess, - seed_tts_bench_argv, -) - -_REPO_ROOT = Path(__file__).resolve().parents[4] - - -def _repo_root() -> Path: - return _REPO_ROOT - - -def _default_result_dir() -> Path: - return Path(__file__).resolve().parent / "results" / "qwen_omni_acc" - - -def _validate_daily_omni(result: dict[str, Any], *, min_accuracy: float | None) -> list[str]: - errs: list[str] = [] - acc = result.get("daily_omni_accuracy") - if acc is None: - errs.append("Missing daily_omni_accuracy (wrong dataset or no gold-evaluated rows).") - return errs - ev = int(result.get("daily_omni_evaluated_ok", 0) or 0) - if ev <= 0: - errs.append("daily_omni_evaluated_ok is 0; no successful MCQ rows to score.") - if min_accuracy is not None and float(acc) + 1e-12 < float(min_accuracy): - errs.append(f"daily_omni_accuracy={acc:.6f} < --min-daily-omni-accuracy={min_accuracy}") - return errs - - -def _validate_seed_tts( - result: 
dict[str, Any], - *, - max_mean_wer: float | None, - min_mean_sim: float | None, - min_mean_utmos: float | None, -) -> list[str]: - errs: list[str] = [] - setup = result.get("seed_tts_eval_setup_error") - if setup: - errs.append(f"Seed-TTS eval setup failed: {setup}") - return errs - n = int(result.get("seed_tts_content_evaluated", 0) or 0) - if n <= 0: - errs.append("seed_tts_content_evaluated is 0 (enable --seed-tts-wer-eval and check PCM capture).") - mean_wer = result.get("seed_tts_content_error_mean") - if mean_wer is not None and max_mean_wer is not None and float(mean_wer) > float(max_mean_wer) + 1e-12: - errs.append(f"seed_tts_content_error_mean (WER)={mean_wer:.6f} > --max-seed-tts-mean-wer={max_mean_wer}") - sim_m = result.get("seed_tts_sim_mean") - if sim_m is not None and min_mean_sim is not None and float(sim_m) + 1e-12 < float(min_mean_sim): - errs.append(f"seed_tts_sim_mean={sim_m:.6f} < --min-seed-tts-mean-sim={min_mean_sim}") - ut_m = result.get("seed_tts_utmos_mean") - if ut_m is not None and min_mean_utmos is not None and float(ut_m) + 1e-12 < float(min_mean_utmos): - errs.append(f"seed_tts_utmos_mean={ut_m:.6f} < --min-seed-tts-mean-utmos={min_mean_utmos}") - return errs - - -def sync_dataset_env_from_ns(ns: argparse.Namespace) -> None: - """Mirror CLI path flags into env vars read by ``daily_omni_bench_argv`` / ``seed_tts_bench_argv``.""" - repo = getattr(ns, "daily_omni_repo", None) - if repo is not None and str(repo).strip(): - os.environ["VLLM_DAILY_OMNI_REPO"] = str(repo).strip() - if ns.daily_omni_qa_json is not None: - os.environ["VLLM_DAILY_OMNI_QA_JSON"] = str(Path(ns.daily_omni_qa_json).expanduser().resolve()) - if ns.daily_omni_video_dir is not None: - os.environ["VLLM_DAILY_OMNI_VIDEO_DIR"] = str(Path(ns.daily_omni_video_dir).expanduser().resolve()) - if ns.seed_tts_dataset_path is not None: - # ``--seed-tts-dataset-path`` accepts either a local directory or a - # Hugging Face repo id. 
Only resolve to an absolute filesystem path - # when the value actually exists locally; otherwise preserve the repo - # string verbatim so downstream code can pass it to snapshot_download. - raw = str(ns.seed_tts_dataset_path).strip() - p = Path(raw).expanduser() - os.environ["VLLM_SEED_TTS_DATASET_PATH"] = str(p.resolve()) if p.exists() and p.is_dir() else raw - if ns.seed_tts_root is not None: - os.environ["SEED_TTS_ROOT"] = str(Path(ns.seed_tts_root).expanduser().resolve()) - - -@contextlib.contextmanager -def _preserve_benchmark_dataset_env() -> Any: - """Save/restore dataset-related env vars so benchmark tests don't leak state.""" - keys = ( - "VLLM_DAILY_OMNI_REPO", - "VLLM_DAILY_OMNI_QA_JSON", - "VLLM_DAILY_OMNI_VIDEO_DIR", - "VLLM_SEED_TTS_DATASET_PATH", - "SEED_TTS_ROOT", - ) - original = {k: os.environ.get(k) for k in keys} - try: - yield - finally: - for key, value in original.items(): - if value is None: - os.environ.pop(key, None) - else: - os.environ[key] = value - - -def _build_common_args(ns: argparse.Namespace, *, result_filename: str) -> list[str]: - return build_serve_common_argv( - host=ns.host, - port=ns.port, - model=ns.model, - num_prompts=ns.num_prompts, - max_concurrency=ns.max_concurrency, - num_warmups=ns.num_warmups, - percentile_metrics=ns.percentile_metrics, - result_dir=ns.result_dir, - result_filename=result_filename, - ready_check_timeout_sec=ns.ready_check_timeout_sec, - ) - - -def run_daily_omni(ns: argparse.Namespace, vllm: str) -> Path: - ns.result_dir.mkdir(parents=True, exist_ok=True) - tag = datetime.now().strftime("%Y%m%d-%H%M%S") - result_filename = f"qwen_omni_acc_daily_omni_{tag}.json" - extra = json.loads(ns.daily_extra_body_json) - argv = ( - _build_common_args(ns, result_filename=result_filename) - + daily_omni_bench_argv() - + [ - "--daily-omni-input-mode", - ns.daily_omni_input_mode, - "--extra-body", - json.dumps(extra, ensure_ascii=False, separators=(",", ":")), - ] - ) - if ns.daily_omni_save_eval_items: - 
argv.append("--daily-omni-save-eval-items") - print("\n$", vllm, *argv, "\n", flush=True) - run_vllm_bench_subprocess(vllm, argv) - out = Path(ns.result_dir) / result_filename - if not out.is_file(): - raise FileNotFoundError(f"Expected result JSON at {out}") - return out - - -def run_seed_tts(ns: argparse.Namespace, vllm: str) -> Path: - ns.result_dir.mkdir(parents=True, exist_ok=True) - tag = datetime.now().strftime("%Y%m%d-%H%M%S") - result_filename = f"qwen_omni_acc_seed_tts_{tag}.json" - extra = json.loads(ns.seed_extra_body_json) - argv = ( - _build_common_args(ns, result_filename=result_filename) - + seed_tts_bench_argv(locale=ns.seed_tts_locale) - + [ - "--seed-tts-wer-eval", - "--extra-body", - json.dumps(extra, ensure_ascii=False, separators=(",", ":")), - ] - ) - if ns.seed_tts_wer_save_items: - argv.append("--seed-tts-wer-save-items") - if ns.seed_tts_file_ref_audio: - argv.append("--seed-tts-file-ref-audio") - extra_env: dict[str, str] = {"SEED_TTS_WER_EVAL": "1"} - if ns.seed_tts_eval_device: - extra_env["SEED_TTS_EVAL_DEVICE"] = ns.seed_tts_eval_device - print("\n$", vllm, *argv, "\n", flush=True) - run_vllm_bench_subprocess(vllm, argv, extra_env=extra_env) - out = Path(ns.result_dir) / result_filename - if not out.is_file(): - raise FileNotFoundError(f"Expected result JSON at {out}") - return out - - -def build_arg_parser() -> argparse.ArgumentParser: - p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) - p.add_argument("--host", default=os.environ.get("ACC_BENCH_HOST", "127.0.0.1")) - p.add_argument("--port", type=int, default=int(os.environ.get("ACC_BENCH_PORT", "8000"))) - p.add_argument( - "--model", - default=os.environ.get( - "ACC_BENCH_MODEL", - "Qwen/Qwen3-Omni-30B-A3B-Instruct", - ), - help="Model id passed to ``vllm bench serve`` (must match the running server).", - ) - p.add_argument("--num-prompts", type=int, default=int(os.environ.get("ACC_BENCH_NUM_PROMPTS", "2000"))) - 
p.add_argument("--max-concurrency", type=int, default=int(os.environ.get("ACC_BENCH_MAX_CONCURRENCY", "10"))) - p.add_argument("--num-warmups", type=int, default=int(os.environ.get("ACC_BENCH_NUM_WARMUPS", "0"))) - p.add_argument( - "--percentile-metrics", - default=os.environ.get("ACC_BENCH_PERCENTILE_METRICS", "ttft,tpot,itl,e2el,audio_ttfp,audio_rtf"), - ) - p.add_argument( - "--ready-check-timeout-sec", - type=int, - default=None, - help="If set, forwarded to ``vllm bench serve`` (probe first request until success). " - "Omit to use upstream default (typically skip).", - ) - p.add_argument( - "--result-dir", - type=Path, - default=Path(os.environ.get("ACC_BENCH_RESULT_DIR", str(_default_result_dir()))), - ) - - p.add_argument("--skip-daily-omni", action="store_true") - p.add_argument("--skip-seed-tts", action="store_true") - - p.add_argument( - "--daily-omni-repo", - type=str, - default=None, - help="Hugging Face dataset id for Daily-Omni Hub mode (sets VLLM_DAILY_OMNI_REPO). " - "Ignored when local qa.json + video dir are used.", - ) - p.add_argument( - "--daily-omni-qa-json", - type=Path, - default=None, - help="Optional local qa.json; if omitted with no env, uses Hub liarliar/Daily-Omni.", - ) - p.add_argument( - "--daily-omni-video-dir", - type=Path, - default=None, - help="Optional local Videos root; if omitted, media is fetched lazily from Hub Videos.tar.", - ) - p.add_argument("--daily-omni-input-mode", choices=("all", "visual", "audio"), default="all") - p.add_argument( - "--daily-extra-body-json", - default='{"modalities":["text"]}', - help="JSON merged into each chat request for Daily-Omni (default matches common L4 / text-output runs).", - ) - p.add_argument( - "--daily-omni-save-eval-items", - action="store_true", - help="Sets env via CLI flag so per-item rows are stored in the result JSON.", - ) - p.add_argument( - "--min-daily-omni-accuracy", - type=float, - default=float((os.environ.get("ACC_BENCH_MIN_DAILY_OMNI_ACCURACY") or "0.69").strip() or 
"0.69"), - help="Fail when daily_omni_accuracy is strictly below this threshold (0–1). " - "Default baseline 0.69; override with env ACC_BENCH_MIN_DAILY_OMNI_ACCURACY or pass 0 to disable the floor.", - ) - - p.add_argument( - "--seed-tts-dataset-path", - type=str, - default=None, - help="Optional local root or Hub id; if omitted, uses zhaochenyang20/seed-tts-eval.", - ) - p.add_argument("--seed-tts-root", type=Path, default=None, help="Optional override for Seed-TTS filesystem root.") - p.add_argument("--seed-tts-locale", choices=("en", "zh"), default="en") - p.add_argument( - "--seed-extra-body-json", - default='{"modalities":["text","audio"]}', - help="JSON for Seed-TTS chat requests (must include audio for synthesis + PCM capture).", - ) - p.add_argument("--seed-tts-wer-save-items", action="store_true") - p.add_argument( - "--seed-tts-file-ref-audio", - action="store_true", - help="Use file:// ref_audio; server must allow local media paths.", - ) - p.add_argument( - "--seed-tts-eval-device", - default=os.environ.get("SEED_TTS_EVAL_DEVICE"), - help="Sets SEED_TTS_EVAL_DEVICE for Whisper / WavLM / UTMOS (e.g. 
cuda:0).", - ) - p.add_argument( - "--max-seed-tts-mean-wer", - type=float, - default=0.5, - help="If set, fail when seed_tts_content_error_mean is strictly above this value.", - ) - p.add_argument( - "--min-seed-tts-mean-sim", - type=float, - default=None, - help="If set, fail when seed_tts_sim_mean is strictly below this value.", - ) - p.add_argument( - "--min-seed-tts-mean-utmos", - type=float, - default=None, - help="If set, fail when seed_tts_utmos_mean is strictly below this value.", - ) - return p - - -def parse_acc_benchmark_args(argv: list[str] | None = None) -> argparse.Namespace: - """Parse CLI args; when ``argv`` is ``None``, use ``sys.argv[1:]`` (standalone script).""" - if argv is None: - argv = sys.argv[1:] - return build_arg_parser().parse_args(argv) - - -def run_acc_benchmark(ns: argparse.Namespace) -> int: - """Run Daily-Omni and/or Seed-TTS client benches against a running server; return 0 on success.""" - failed: list[str] = [] - - with _preserve_benchmark_dataset_env(): - sync_dataset_env_from_ns(ns) - - vllm = find_vllm_cli() - print(f"Using vLLM CLI: {vllm}", flush=True) - print(f"Repo root (for cwd reference): {_repo_root()}", flush=True) - - if not ns.skip_daily_omni: - path = run_daily_omni(ns, vllm) - print(f"\n[Daily-Omni] result JSON: {path}", flush=True) - data = load_benchmark_result(path) - errs = _validate_daily_omni(data, min_accuracy=ns.min_daily_omni_accuracy) - if errs: - failed.extend([f"[Daily-Omni] {e}" for e in errs]) - else: - print( - f"[Daily-Omni] daily_omni_accuracy={data.get('daily_omni_accuracy')} " - f"evaluated_ok={data.get('daily_omni_evaluated_ok')}", - flush=True, - ) - - if not ns.skip_seed_tts: - path = run_seed_tts(ns, vllm) - print(f"\n[Seed-TTS] result JSON: {path}", flush=True) - data = load_benchmark_result(path) - errs = _validate_seed_tts( - data, - max_mean_wer=ns.max_seed_tts_mean_wer, - min_mean_sim=ns.min_seed_tts_mean_sim, - min_mean_utmos=ns.min_seed_tts_mean_utmos, - ) - if errs: - 
failed.extend([f"[Seed-TTS] {e}" for e in errs]) - else: - print( - f"[Seed-TTS] mean_wer={data.get('seed_tts_content_error_mean')} " - f"mean_sim={data.get('seed_tts_sim_mean')} mean_utmos={data.get('seed_tts_utmos_mean')} " - f"evaluated={data.get('seed_tts_content_evaluated')}", - flush=True, - ) - - if failed: - print("\nACCURACY CHECK FAILED:", file=sys.stderr) - for line in failed: - print(f" - {line}", file=sys.stderr) - return 1 - - print("\nAll configured accuracy checks passed.", flush=True) - return 0 - - -def main() -> int: - return run_acc_benchmark(parse_acc_benchmark_args()) - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py b/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py deleted file mode 100644 index 773f7c1108c..00000000000 --- a/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py +++ /dev/null @@ -1,137 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -"""Qwen3-Omni accuracy benchmarks (Daily-Omni MCQ + Seed-TTS WER) via ``vllm bench serve --omni``. - -Starts a **module-scoped** Omni OpenAI-compatible server (same pattern as ``tests/dfx/perf`` and -``tests/e2e/online_serving/test_qwen3_omni.py``), then runs the client benches against -``omni_server.host`` / ``omni_server.port`` / ``omni_server.model``. - -**Daily-Omni from Hugging Face:** unless ``VLLM_DAILY_OMNI_QA_JSON`` and ``VLLM_DAILY_OMNI_VIDEO_DIR`` -point at a full local tree, the bench uses ``--dataset-path`` (default ``liarliar/Daily-Omni`` via -``VLLM_DAILY_OMNI_REPO`` / ``--daily-omni-repo``). QA loads through ``datasets``; ``Videos.tar`` is -downloaded and extracted under ``HF_HOME`` on demand. The tests patch in -``--daily-omni-inline-local-video`` so multimodal payloads use data URLs (no -``--allowed-local-media-path`` on the server). 
Use small ``--num-prompts`` defaults suitable for CI -(override with ``ACC_BENCH_NUM_PROMPTS`` / ``ACC_BENCH_MAX_CONCURRENCY``; see -:func:`tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core.build_acc_benchmark_cli_argv`). - -This package lives under ``tests/e2e/accuracy/qwen3_omni/``, so pytest still loads -``tests/e2e/accuracy/conftest.py``, which imports ``tests.conftest`` (heavy deps: ``vllm``, ``torch``, …). -A broken or partial install can therefore **fail during collection** before these tests run. - -If ``vllm`` is not on ``PATH``, the tests **skip** instead of erroring. Without -``VLLM_SKIP_ACC_BENCH=1``, a failed bench still yields a **failed** run (non-zero subprocess exit). - -Run:: - - pytest -sv tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py - -Only the subprocess accuracy marker:: - - pytest -sv tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py -m qwen3_omni_acc - -Skip when you do not have GPUs, a server, or datasets (CI opt-out):: - - VLLM_SKIP_ACC_BENCH=1 pytest -sv tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py - -Standalone CLI (expects a server already up; uses ``ACC_BENCH_*`` env defaults):: - - python tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py --help -""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from tests.e2e.accuracy.qwen3_omni import run_qwen_omni_acc_benchmark as _acc_bench -from tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core import ( - build_acc_benchmark_cli_argv, - find_vllm_cli, -) -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniServerParams -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config -from vllm_omni.platforms import current_omni_platform - -_E2E_ROOT = Path(__file__).resolve().parent.parent.parent - -models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] - -pytestmark = [pytest.mark.full_model, pytest.mark.omni] - -_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml") - - -def 
get_chunk_config(config_path: str | None = None): - """Load the qwen3_omni CI deploy yaml with async_chunk modifications for streaming mode.""" - if config_path is None: - config_path = _CI_DEPLOY - # TODO: remove this workaround once legacy `stage_args` path is deleted. - # The pipeline (qwen3_omni/pipeline.py) already wires - # thinker2talker_async_chunk / talker2code2wav_async_chunk on stage 0/1, - # so only async_chunk needs flipping. Writing nested `engine_args:` into - # the new-schema overlay trips _parse_stage_deploy's legacy branch and - # drops flat fields (load_format, max_num_seqs, ...). - return modify_stage_config(config_path, updates={"async_chunk": True}) - - -if current_omni_platform.is_xpu(): - stage_configs = [_CI_DEPLOY] -else: # CUDA + ROCm MI325 share the same deploy config - stage_configs = [get_chunk_config()] - -test_params = [ - OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs -] - - -def _require_vllm_cli() -> None: - try: - find_vllm_cli() - except FileNotFoundError as exc: - pytest.skip(str(exc)) - - -@pytest.fixture(autouse=True) -def _daily_omni_hub_inline_media(monkeypatch: pytest.MonkeyPatch) -> None: - """Hub / lazy-cache mode uses local files → default ``file://`` needs server allowlist. - - ``run_qwen_omni_acc_benchmark`` binds ``daily_omni_bench_argv`` at import time; patch that copy - so we append ``--daily-omni-inline-local-video`` whenever the core helper did not already set it - (local qa.json + video-dir mode already passes the flag). 
- """ - orig = _acc_bench.daily_omni_bench_argv - - def _wrapped() -> list[str]: - out = list(orig()) - if "--daily-omni-inline-local-video" not in out: - out.append("--daily-omni-inline-local-video") - return out - - monkeypatch.setattr(_acc_bench, "daily_omni_bench_argv", _wrapped) - monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") - - -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_qwen3_omni_daily_omni_accuracy_bench(omni_server) -> None: - _require_vllm_cli() - pytest.importorskip("datasets") - pytest.importorskip("huggingface_hub") - ns = _acc_bench.parse_acc_benchmark_args( - build_acc_benchmark_cli_argv(omni_server, skip_seed=True, skip_daily=False) - ) - assert _acc_bench.run_acc_benchmark(ns) == 0 - - -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_qwen3_omni_seed_tts_wer_bench(omni_server) -> None: - _require_vllm_cli() - pytest.importorskip("huggingface_hub") - ns = _acc_bench.parse_acc_benchmark_args( - build_acc_benchmark_cli_argv(omni_server, skip_seed=False, skip_daily=True) - ) - assert _acc_bench.run_acc_benchmark(ns) == 0 diff --git a/tests/e2e/accuracy/test_gebench_h100_smoke.py b/tests/e2e/accuracy/test_gebench_h100_smoke.py index 2702710e4a2..b4b83187135 100644 --- a/tests/e2e/accuracy/test_gebench_h100_smoke.py +++ b/tests/e2e/accuracy/test_gebench_h100_smoke.py @@ -6,13 +6,13 @@ import pytest from benchmarks.accuracy.text_to_image.gbench import main as gbench_main -from tests.e2e.accuracy.helpers import infer_model_label, reset_artifact_dir -from tests.helpers.mark import hardware_test - -pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] +from tests.e2e.accuracy.conftest import infer_model_label, reset_artifact_dir +from tests.utils import hardware_test 
+@pytest.mark.advanced_model @pytest.mark.benchmark +@pytest.mark.diffusion @hardware_test(res={"cuda": "H100"}, num_cards=1) def test_gebench_h100_smoke( gebench_accuracy_servers, diff --git a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py index 789f7ec939b..ac5f2cb3cfd 100644 --- a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py +++ b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py @@ -7,13 +7,13 @@ from benchmarks.accuracy.image_to_image.gedit_bench import GROUPS from benchmarks.accuracy.image_to_image.gedit_bench import main as gedit_main -from tests.e2e.accuracy.helpers import infer_model_label, reset_artifact_dir -from tests.helpers.mark import hardware_test - -pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] +from tests.e2e.accuracy.conftest import infer_model_label, reset_artifact_dir +from tests.utils import hardware_test +@pytest.mark.advanced_model @pytest.mark.benchmark +@pytest.mark.diffusion @hardware_test(res={"cuda": "H100"}, num_cards=1) def test_gedit_bench_h100_smoke( gedit_accuracy_servers, @@ -106,9 +106,9 @@ def test_gedit_bench_h100_smoke( group_summary = language_summary["by_group"][group] assert set(group_summary) == {"count", "Q_SC", "Q_PQ", "Q_O"} - assert summary["languages"]["en"]["overall"]["Q_SC"] >= 6.95 + assert summary["languages"]["en"]["overall"]["Q_SC"] >= 7.0 assert summary["languages"]["en"]["overall"]["Q_PQ"] >= 5.8 - assert summary["languages"]["en"]["overall"]["Q_O"] >= 6.15 + assert summary["languages"]["en"]["overall"]["Q_O"] >= 6.2 assert summary["languages"]["cn"]["overall"]["Q_SC"] >= 6.9 assert summary["languages"]["cn"]["overall"]["Q_PQ"] >= 5.7 assert summary["languages"]["cn"]["overall"]["Q_O"] >= 6.1 diff --git a/tests/e2e/accuracy/test_ltx2_3_video_similarity.py b/tests/e2e/accuracy/test_ltx2_3_video_similarity.py deleted file mode 100644 index dec533d58ae..00000000000 --- a/tests/e2e/accuracy/test_ltx2_3_video_similarity.py +++ /dev/null @@ 
-1,410 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -SSIM/PSNR accuracy tests for LTX-2.3. - -1. **Transformer parity** (``test_ltx2_3_transformer_matches_diffusers``): - Swaps our custom transformer into diffusers' ``LTX2Pipeline`` to measure - numerical parity in isolation. Thresholds: SSIM >= 0.95, PSNR >= 28 dB. - Result: SSIM 0.999987 (bit-identical). - -2. **Full pipeline** (``test_ltx2_3_pipeline_matches_diffusers``): - Runs the full vLLM-Omni serving stack (``OmniServer`` -> HTTP API) and - compares per-frame against stock diffusers. Currently skipped because - the OmniServer subprocess creates a different RNG state than in-process - diffusers, producing different initial latents from the same seed. - This is a test infrastructure limitation, not a model accuracy issue. -""" - -from __future__ import annotations - -import gc -import os -import tempfile -from pathlib import Path - -import diffusers -import numpy as np -import pytest -import requests -import torch -from PIL import Image - -from tests.e2e.accuracy.helpers import compute_image_ssim_psnr, model_output_dir -from tests.helpers.env import run_post_test_cleanup, run_pre_test_cleanup -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniServer - -# Parse diffusers version for compatibility check -_DIFFUSERS_VERSION = tuple(int(x) for x in diffusers.__version__.split(".")[:2] if x.isdigit()) -_DIFFUSERS_038 = _DIFFUSERS_VERSION >= (0, 38) - -MODEL_ID = "dg845/LTX-2.3-Diffusers" -MODEL_ENV_VAR = "VLLM_TEST_LTX23_MODEL" -PROMPT = "A lighthouse on a rocky cliff at sunset, waves crashing below, golden hour lighting" -NEGATIVE_PROMPT = "blurry, low quality, distorted, watermark" -WIDTH = 512 -HEIGHT = 384 -NUM_FRAMES = 25 # ~1 second at 24fps -NUM_INFERENCE_STEPS = 20 -GUIDANCE_SCALE = 4.0 -SEED = 42 - -# Transformer-swap test: near-identical output expected -TRANSFORMER_SSIM_THRESHOLD = 0.95 
-TRANSFORMER_PSNR_THRESHOLD = 28.0 - -# Full-pipeline test: allows minor divergence from RNG / pipeline differences -PIPELINE_SSIM_THRESHOLD = 0.94 -PIPELINE_PSNR_THRESHOLD = 28.0 - - -def _model_name() -> str: - return os.environ.get(MODEL_ENV_VAR, MODEL_ID) - - -def _local_files_only(model: str) -> bool: - return Path(model).exists() - - -# --------------------------------------------------------------------------- -# Frame extraction helpers -# --------------------------------------------------------------------------- - - -def _video_to_frames(video_np: np.ndarray) -> list[Image.Image]: - """Convert numpy video to list of PIL Images.""" - while video_np.ndim > 4: - video_np = video_np[0] - if video_np.dtype in (np.float32, np.float64, np.float16): - video_np = np.clip(video_np * 255, 0, 255).astype(np.uint8) - return [Image.fromarray(video_np[t]) for t in range(video_np.shape[0])] - - -def _extract_diffusers_frames(result) -> list[Image.Image]: - """Extract frames from diffusers pipeline output.""" - video = result.frames - if isinstance(video, np.ndarray): - return _video_to_frames(video) - if isinstance(video, list): - if isinstance(video[0], list): - return [img.convert("RGB") for img in video[0]] - if isinstance(video[0], Image.Image): - return [img.convert("RGB") for img in video] - raise ValueError(f"Unexpected output type: {type(video)}") - - -def _extract_mp4_frames(mp4_bytes: bytes) -> list[Image.Image]: - """Extract frames from an MP4 video using ffmpeg.""" - import subprocess - - with tempfile.TemporaryDirectory() as tmpdir: - mp4_path = os.path.join(tmpdir, "video.mp4") - with open(mp4_path, "wb") as f: - f.write(mp4_bytes) - - # Extract video frames as PNG files using ffmpeg - frame_pattern = os.path.join(tmpdir, "frame_%04d.png") - subprocess.run( - ["ffmpeg", "-i", mp4_path, "-vsync", "0", frame_pattern], - capture_output=True, - check=True, - ) - - # Load frames in order - frames = [] - i = 1 - while True: - fpath = os.path.join(tmpdir, 
f"frame_{i:04d}.png") - if not os.path.exists(fpath): - break - frames.append(Image.open(fpath).convert("RGB").copy()) - i += 1 - return frames - - -# --------------------------------------------------------------------------- -# Comparison helper -# --------------------------------------------------------------------------- - - -def _assert_video_similarity( - *, - model_name: str, - vllm_frames: list[Image.Image], - diffusers_frames: list[Image.Image], - ssim_threshold: float, - psnr_threshold: float, -) -> tuple[float, float]: - """Compare video frames and assert SSIM/PSNR meet thresholds.""" - min_frames = min(len(vllm_frames), len(diffusers_frames)) - assert min_frames > 0, "No frames to compare" - - ssim_scores = [] - psnr_scores = [] - for i in range(min_frames): - ssim_val, psnr_val = compute_image_ssim_psnr( - prediction=vllm_frames[i], - reference=diffusers_frames[i], - ) - ssim_scores.append(ssim_val) - psnr_scores.append(psnr_val) - - avg_ssim = sum(ssim_scores) / len(ssim_scores) - avg_psnr = sum(psnr_scores) / len(psnr_scores) - - print(f"\n{model_name} video similarity ({min_frames} frames):") - print(f" SSIM: avg={avg_ssim:.6f}, min={min(ssim_scores):.6f}, threshold>={ssim_threshold:.6f}") - print(f" PSNR: avg={avg_psnr:.6f} dB, min={min(psnr_scores):.6f} dB, threshold>={psnr_threshold:.6f} dB") - - assert avg_ssim >= ssim_threshold, f"SSIM below threshold: got {avg_ssim:.6f}, expected >= {ssim_threshold:.6f}." - assert avg_psnr >= psnr_threshold, f"PSNR below threshold: got {avg_psnr:.6f}, expected >= {psnr_threshold:.6f}." 
- return avg_ssim, avg_psnr - - -# --------------------------------------------------------------------------- -# Diffusers baseline (shared by both tests) -# --------------------------------------------------------------------------- - - -def _run_diffusers_baseline(model: str, output_dir: Path) -> list[Image.Image]: - """Generate video using stock diffusers LTX2Pipeline.""" - from diffusers import LTX2Pipeline - - run_pre_test_cleanup(enable_force=True) - pipe = None - try: - pipe = LTX2Pipeline.from_pretrained( - model, torch_dtype=torch.bfloat16, local_files_only=_local_files_only(model) - ).to("cuda") - - generator = torch.Generator(device="cuda").manual_seed(SEED) - result = pipe( - prompt=PROMPT, - negative_prompt=NEGATIVE_PROMPT, - width=WIDTH, - height=HEIGHT, - num_frames=NUM_FRAMES, - num_inference_steps=NUM_INFERENCE_STEPS, - guidance_scale=GUIDANCE_SCALE, - generator=generator, - output_type="np", - ) - frames = _extract_diffusers_frames(result) - for i, f in enumerate(frames): - f.save(output_dir / f"diffusers_frame_{i:04d}.png") - return frames - finally: - del pipe - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - run_post_test_cleanup(enable_force=True) - - -# --------------------------------------------------------------------------- -# Test 1: Transformer-swap parity -# --------------------------------------------------------------------------- - - -def _run_with_custom_transformer(model: str, output_dir: Path) -> list[Image.Image]: - """Run diffusers pipeline with our custom transformer swapped in.""" - from contextlib import nullcontext - - from diffusers import LTX2Pipeline - from vllm.config import VllmConfig, set_current_vllm_config - from vllm.distributed.parallel_state import init_distributed_environment, initialize_model_parallel - - from vllm_omni.diffusion.models.ltx2.pipeline_ltx2 import create_transformer_from_config, load_transformer_config - - vllm_config = VllmConfig() - ctx = 
set_current_vllm_config(vllm_config) - ctx.__enter__() - - if not torch.distributed.is_initialized(): - os.environ.setdefault("MASTER_ADDR", "localhost") - os.environ.setdefault("MASTER_PORT", "29503") - os.environ.setdefault("RANK", "0") - os.environ.setdefault("WORLD_SIZE", "1") - init_distributed_environment(world_size=1, rank=0, local_rank=0) - initialize_model_parallel(tensor_model_parallel_size=1) - - local = _local_files_only(model) - pipe = LTX2Pipeline.from_pretrained(model, torch_dtype=torch.bfloat16, local_files_only=local) - - transformer_config = load_transformer_config(model, "transformer", local) - our_transformer = create_transformer_from_config(transformer_config) - - diffusers_state = dict(pipe.transformer.named_parameters()) - - def _weight_iter(): - for name, param in diffusers_state.items(): - yield name, param.data - - our_transformer.load_weights(_weight_iter()) - our_transformer = our_transformer.to(dtype=torch.bfloat16, device="cuda").eval() - - # Compatibility shims for diffusers pipeline - our_transformer.dtype = torch.bfloat16 - if not hasattr(our_transformer, "cache_context"): - our_transformer.cache_context = lambda name: nullcontext() - - del pipe.transformer - pipe.transformer = our_transformer - for name, component in pipe.components.items(): - if name != "transformer" and hasattr(component, "to"): - try: - component.to("cuda") - except Exception: - pass - - generator = torch.Generator(device="cuda").manual_seed(SEED) - result = pipe( - prompt=PROMPT, - negative_prompt=NEGATIVE_PROMPT, - width=WIDTH, - height=HEIGHT, - num_frames=NUM_FRAMES, - num_inference_steps=NUM_INFERENCE_STEPS, - guidance_scale=GUIDANCE_SCALE, - generator=generator, - output_type="np", - ) - frames = _extract_diffusers_frames(result) - for i, f in enumerate(frames): - f.save(output_dir / f"vllm_transformer_frame_{i:04d}.png") - - del pipe, result, our_transformer - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return frames - - 
-@pytest.mark.advanced_model -@pytest.mark.benchmark -@pytest.mark.diffusion -@pytest.mark.skipif( - not _DIFFUSERS_038, reason="LTX-2.3 requires diffusers >= 0.38.0 for cross_attn_mod and BWE vocoder" -) -@hardware_test(res={"cuda": "H100"}, num_cards=1) -def test_ltx2_3_transformer_matches_diffusers(accuracy_artifact_root: Path) -> None: - """Transformer-level parity: swap our transformer into diffusers pipeline. - - Isolates transformer numerical accuracy from pipeline-level differences. - Both runs use diffusers' denoising loop, CFG, scheduler, and RNG. - """ - model = _model_name() - output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID) - - diffusers_frames = _run_diffusers_baseline(model=model, output_dir=output_dir) - vllm_frames = _run_with_custom_transformer(model=model, output_dir=output_dir) - - _assert_video_similarity( - model_name=f"{MODEL_ID} (transformer-swap)", - vllm_frames=vllm_frames, - diffusers_frames=diffusers_frames, - ssim_threshold=TRANSFORMER_SSIM_THRESHOLD, - psnr_threshold=TRANSFORMER_PSNR_THRESHOLD, - ) - - -# --------------------------------------------------------------------------- -# Test 2: Full pipeline (OmniServer → HTTP API vs diffusers) -# --------------------------------------------------------------------------- - - -def _run_vllm_omni_serving(model: str, output_dir: Path) -> list[Image.Image]: - """Generate video via the full vLLM-Omni serving stack.""" - server_args = [ - "--model-class-name", - "LTX23Pipeline", - "--stage-init-timeout", - "600", - ] - with OmniServer(model, server_args, use_omni=True) as server: - # Submit generation request - response = requests.post( - f"http://{server.host}:{server.port}/v1/videos", - files={ - "prompt": (None, PROMPT), - "negative_prompt": (None, NEGATIVE_PROMPT), - "model": (None, server.model), - "num_frames": (None, str(NUM_FRAMES)), - "fps": (None, "24"), - "size": (None, f"{WIDTH}x{HEIGHT}"), - "num_inference_steps": (None, str(NUM_INFERENCE_STEPS)), - "guidance_scale": 
(None, str(GUIDANCE_SCALE)), - "seed": (None, str(SEED)), - }, - timeout=120, - ) - response.raise_for_status() - video_id = response.json()["id"] - - # Poll for completion - import time - - for _ in range(120): - status_resp = requests.get( - f"http://{server.host}:{server.port}/v1/videos/{video_id}", - timeout=30, - ) - status_resp.raise_for_status() - status = status_resp.json()["status"] - if status == "completed": - break - if status in ("error", "failed"): - raise RuntimeError(f"Video generation failed: {status_resp.json()}") - time.sleep(5) - else: - raise TimeoutError(f"Video generation timed out after 600s (id={video_id})") - - # Download video content - content_resp = requests.get( - f"http://{server.host}:{server.port}/v1/videos/{video_id}/content", - timeout=120, - ) - content_resp.raise_for_status() - mp4_bytes = content_resp.content - - # Save MP4 - mp4_path = output_dir / "vllm_omni_pipeline.mp4" - with open(mp4_path, "wb") as f: - f.write(mp4_bytes) - - # Extract frames - frames = _extract_mp4_frames(mp4_bytes) - for i, frame in enumerate(frames): - frame.save(output_dir / f"vllm_pipeline_frame_{i:04d}.png") - return frames - - -@pytest.mark.advanced_model -@pytest.mark.benchmark -@pytest.mark.diffusion -@pytest.mark.skipif( - not _DIFFUSERS_038, reason="LTX-2.3 requires diffusers >= 0.38.0 for cross_attn_mod and BWE vocoder" -) -@hardware_test(res={"cuda": "H100"}, num_cards=1) -def test_ltx2_3_pipeline_matches_diffusers(accuracy_artifact_root: Path) -> None: - """Full-pipeline parity: vLLM-Omni serving stack vs diffusers. - - Runs the complete vLLM-Omni OmniServer (subprocess, HTTP API, video - encoding) and compares per-frame against stock diffusers output. - Follows the Wan2.2 / Qwen Image pattern with seed-based determinism. 
- """ - model = _model_name() - output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID) - - diffusers_frames = _run_diffusers_baseline(model=model, output_dir=output_dir) - vllm_frames = _run_vllm_omni_serving(model=model, output_dir=output_dir) - - _assert_video_similarity( - model_name=f"{MODEL_ID} (full-pipeline)", - vllm_frames=vllm_frames, - diffusers_frames=diffusers_frames, - ssim_threshold=PIPELINE_SSIM_THRESHOLD, - psnr_threshold=PIPELINE_PSNR_THRESHOLD, - ) diff --git a/tests/e2e/accuracy/test_qwen_image.py b/tests/e2e/accuracy/test_qwen_image.py deleted file mode 100644 index 4b8215d54b5..00000000000 --- a/tests/e2e/accuracy/test_qwen_image.py +++ /dev/null @@ -1,122 +0,0 @@ -from __future__ import annotations - -import base64 -import gc -import io -import os -from pathlib import Path - -import pytest -import requests -import torch -from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from PIL import Image - -from tests.e2e.accuracy.helpers import assert_similarity, model_output_dir -from tests.helpers.env import run_post_test_cleanup, run_pre_test_cleanup -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniServer - -pytestmark = [pytest.mark.full_model, pytest.mark.diffusion] - - -MODEL_ID = "Qwen/Qwen-Image" -MODEL_ENV_VAR = "QWEN_IMAGE_MODEL" -PROMPT = "A photo of a cat sitting on a laptop keyboard, digital art style." 
-NEGATIVE_PROMPT = "blurry, low quality" -WIDTH = 512 -HEIGHT = 512 -NUM_INFERENCE_STEPS = 20 -TRUE_CFG_SCALE = 4.0 -SEED = 42 -SSIM_THRESHOLD = 0.97 -PSNR_THRESHOLD = 30.0 - - -def _model_name() -> str: - return os.environ.get(MODEL_ENV_VAR, MODEL_ID) - - -def _local_files_only(model: str) -> bool: - return Path(model).exists() - - -def _run_vllm_omni_qwen_image(*, model: str, output_path: Path) -> Image.Image: - server_args = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] - with OmniServer(model, server_args, use_omni=True) as omni_server: - response = requests.post( - f"http://{omni_server.host}:{omni_server.port}/v1/images/generations", - json={ - "model": omni_server.model, - "prompt": PROMPT, - "size": f"{WIDTH}x{HEIGHT}", - "n": 1, - "response_format": "b64_json", - "negative_prompt": NEGATIVE_PROMPT, - "num_inference_steps": NUM_INFERENCE_STEPS, - "true_cfg_scale": TRUE_CFG_SCALE, - "seed": SEED, - }, - timeout=600, - ) - response.raise_for_status() - payload = response.json() - assert len(payload["data"]) == 1 - image_bytes = base64.b64decode(payload["data"][0]["b64_json"]) - image = Image.open(io.BytesIO(image_bytes)).convert("RGB") - image.load() - image.save(output_path) - return image - - -def _run_diffusers_qwen_image(*, model: str, output_path: Path) -> Image.Image: - run_pre_test_cleanup(enable_force=True) - pipe: DiffusionPipeline | None = None - try: - pipe = DiffusionPipeline.from_pretrained( - model, - torch_dtype=torch.bfloat16, - trust_remote_code=True, - local_files_only=_local_files_only(model), - ).to("cuda") - generator = torch.Generator(device="cuda").manual_seed(SEED) - result = pipe( # pyright: ignore[reportCallIssue] - prompt=PROMPT, - negative_prompt=NEGATIVE_PROMPT, - width=WIDTH, - height=HEIGHT, - num_inference_steps=NUM_INFERENCE_STEPS, - true_cfg_scale=TRUE_CFG_SCALE, - generator=generator, - ) - output_image = result.images[0].convert("RGB") - output_image.save(output_path) - return output_image - 
finally: - if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): - pipe.maybe_free_model_hooks() - del pipe - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - run_post_test_cleanup(enable_force=True) - - -@pytest.mark.benchmark -@hardware_test(res={"cuda": "H100"}, num_cards=1) -def test_qwen_image_matches_diffusers(accuracy_artifact_root: Path) -> None: - model = _model_name() - output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID) - - vllm_output = _run_vllm_omni_qwen_image(model=model, output_path=output_dir / "vllm_omni.png") - diffusers_output = _run_diffusers_qwen_image(model=model, output_path=output_dir / "diffusers.png") - - assert_similarity( - model_name=MODEL_ID, - vllm_image=vllm_output, - diffusers_image=diffusers_output, - width=WIDTH, - height=HEIGHT, - ssim_threshold=SSIM_THRESHOLD, - psnr_threshold=PSNR_THRESHOLD, - ) diff --git a/tests/e2e/accuracy/test_qwen_image_edit.py b/tests/e2e/accuracy/test_qwen_image_edit.py deleted file mode 100644 index 07deecca976..00000000000 --- a/tests/e2e/accuracy/test_qwen_image_edit.py +++ /dev/null @@ -1,228 +0,0 @@ -from __future__ import annotations - -import gc -from pathlib import Path - -import pytest -import requests -import torch -from diffusers import QwenImageEditPipeline, QwenImageEditPlusPipeline -from PIL import Image - -from benchmarks.accuracy.common import decode_base64_image, pil_to_png_bytes -from tests.e2e.accuracy.helpers import assert_similarity, model_output_dir -from tests.helpers.env import run_post_test_cleanup, run_pre_test_cleanup -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniServer - -pytestmark = [pytest.mark.full_model, pytest.mark.diffusion] - - -SINGLE_MODEL = "Qwen/Qwen-Image-Edit" -MULTIPLE_MODEL = "Qwen/Qwen-Image-Edit-2509" -WIDTH = 512 -HEIGHT = 512 -NUM_INFERENCE_STEPS = 20 -TRUE_CFG_SCALE = 4.0 -SEED = 42 -SSIM_THRESHOLD = 0.94 -PSNR_THRESHOLD = 28.0 - -PROMPT_SINGLE_IMAGE = "The input is a 
2D cartoon bear mascot. Restyle it into a painterly oil artwork with warm colors while preserving the main structure." -PROMPT_MULTIPLE_IMAGE = "Put the cartoon bear mascot and the furry rabbit into one coherent scene with a painterly oil artwork style and consistent lighting." -NEGATIVE_PROMPT = "low quality, blurry, artifacts, distortion" -SERVER_ARGS = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] - - -def _run_vllm_omni_image_edit( - *, - omni_server: OmniServer, - prompt: str, - input_images: list[Image.Image], - output_path: Path, -) -> Image.Image: - response = requests.post( - f"http://{omni_server.host}:{omni_server.port}/v1/images/edits", - data={ - "model": omni_server.model, - "prompt": prompt, - "size": f"{WIDTH}x{HEIGHT}", - "n": 1, - "response_format": "b64_json", - "negative_prompt": NEGATIVE_PROMPT, - "num_inference_steps": NUM_INFERENCE_STEPS, - "true_cfg_scale": TRUE_CFG_SCALE, - "seed": SEED, - }, - files=[ - ("image", (f"image_{index}.png", pil_to_png_bytes(image), "image/png")) - for index, image in enumerate(input_images) - ], - timeout=600, - ) - response.raise_for_status() - payload = response.json() - assert len(payload["data"]) == 1 - image = decode_base64_image(payload["data"][0]["b64_json"]) - image.load() - image.save(output_path) - return image - - -def _run_diffusers_image_edit( - *, - model: str, - pipeline_class: type[QwenImageEditPipeline] | type[QwenImageEditPlusPipeline], - prompt: str, - input_images: list[Image.Image], - output_path: Path, -) -> Image.Image: - run_pre_test_cleanup(enable_force=True) - pipe: QwenImageEditPipeline | QwenImageEditPlusPipeline | None = None - device = torch.device("cuda:0") - torch.cuda.set_device(device) - try: - images = input_images[0] if len(input_images) == 1 else input_images - pipe = pipeline_class.from_pretrained( - model, - torch_dtype=torch.bfloat16, - trust_remote_code=True, - ).to(device) - pipe.set_progress_bar_config(disable=False) - generator = 
torch.Generator(device=device).manual_seed(SEED) - result = pipe( # pyright: ignore[reportCallIssue] - prompt=prompt, - image=images, - negative_prompt=NEGATIVE_PROMPT, - num_inference_steps=NUM_INFERENCE_STEPS, - true_cfg_scale=TRUE_CFG_SCALE, - width=WIDTH, - height=HEIGHT, - generator=generator, - ) - output_image = result.images[0].convert("RGB") # pyright: ignore[reportAttributeAccessIssue] - output_image.save(output_path) - return output_image - finally: - if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): - pipe.maybe_free_model_hooks() - del pipe - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - run_post_test_cleanup(enable_force=True) - - -def _vllm_omni_output_single_image( - accuracy_artifact_root: Path, - qwen_bear_image: Image.Image, -) -> Image.Image: - output_dir = model_output_dir(accuracy_artifact_root, SINGLE_MODEL) - output_path = output_dir / "vllm_omni_single.png" - with OmniServer(model=SINGLE_MODEL, serve_args=SERVER_ARGS) as server: - output = _run_vllm_omni_image_edit( - omni_server=server, - prompt=PROMPT_SINGLE_IMAGE, - input_images=[qwen_bear_image], - output_path=output_path, - ) - return output - - -def _diffusers_output_single_image(accuracy_artifact_root: Path, qwen_bear_image: Image.Image) -> Image.Image: - output_dir = model_output_dir(accuracy_artifact_root, SINGLE_MODEL) - output_path = output_dir / "diffusers_single.png" - return _run_diffusers_image_edit( - model=SINGLE_MODEL, - pipeline_class=QwenImageEditPipeline, - prompt=PROMPT_SINGLE_IMAGE, - input_images=[qwen_bear_image], - output_path=output_path, - ) - - -def _vllm_omni_output_multiple_image( - accuracy_artifact_root: Path, - qwen_bear_image: Image.Image, - rabbit_image: Image.Image, -) -> Image.Image: - output_dir = model_output_dir(accuracy_artifact_root, MULTIPLE_MODEL) - output_path = output_dir / "vllm_omni_multiple.png" - with OmniServer(model=MULTIPLE_MODEL, serve_args=SERVER_ARGS) as server: - output = 
_run_vllm_omni_image_edit( - omni_server=server, - prompt=PROMPT_MULTIPLE_IMAGE, - input_images=[qwen_bear_image, rabbit_image], - output_path=output_path, - ) - return output - - -def _diffusers_output_multiple_image( - accuracy_artifact_root: Path, qwen_bear_image: Image.Image, rabbit_image: Image.Image -) -> Image.Image: - output_dir = model_output_dir(accuracy_artifact_root, MULTIPLE_MODEL) - output_path = output_dir / "diffusers_multiple.png" - return _run_diffusers_image_edit( - model=MULTIPLE_MODEL, - pipeline_class=QwenImageEditPlusPipeline, - prompt=PROMPT_MULTIPLE_IMAGE, - input_images=[qwen_bear_image, rabbit_image], - output_path=output_path, - ) - - -@pytest.mark.benchmark -@hardware_test(res={"cuda": "H100"}, num_cards=1) -def test_qwen_image_edit_single_matches_diffusers( - accuracy_artifact_root: Path, - qwen_bear_image: Image.Image, -) -> None: - vllm_image = _vllm_omni_output_single_image( - accuracy_artifact_root=accuracy_artifact_root, - qwen_bear_image=qwen_bear_image, - ) - diffusers_image = _diffusers_output_single_image( - accuracy_artifact_root=accuracy_artifact_root, - qwen_bear_image=qwen_bear_image, - ) - assert_similarity( - model_name=SINGLE_MODEL, - vllm_image=vllm_image, - diffusers_image=diffusers_image, - width=WIDTH, - height=HEIGHT, - ssim_threshold=SSIM_THRESHOLD, - psnr_threshold=PSNR_THRESHOLD, - ) - - -@pytest.mark.benchmark -@hardware_test(res={"cuda": "H100"}, num_cards=1) -@pytest.mark.skip( - reason="Skipping as the second image seems to be ignored by the API. Will come back to this later after #2772 is merged." 
-) -def test_qwen_image_edit_multiple_matches_diffusers( - accuracy_artifact_root: Path, - qwen_bear_image: Image.Image, - rabbit_image: Image.Image, -) -> None: - vllm_image = _vllm_omni_output_multiple_image( - accuracy_artifact_root=accuracy_artifact_root, - qwen_bear_image=qwen_bear_image, - rabbit_image=rabbit_image, - ) - diffusers_image = _diffusers_output_multiple_image( - accuracy_artifact_root=accuracy_artifact_root, - qwen_bear_image=qwen_bear_image, - rabbit_image=rabbit_image, - ) - assert_similarity( - model_name=MULTIPLE_MODEL, - vllm_image=vllm_image, - diffusers_image=diffusers_image, - width=WIDTH, - height=HEIGHT, - ssim_threshold=SSIM_THRESHOLD, - psnr_threshold=PSNR_THRESHOLD, - ) diff --git a/tests/e2e/accuracy/test_qwen_image_layered.py b/tests/e2e/accuracy/test_qwen_image_layered.py deleted file mode 100644 index 30ad2966ff6..00000000000 --- a/tests/e2e/accuracy/test_qwen_image_layered.py +++ /dev/null @@ -1,149 +0,0 @@ -from __future__ import annotations - -import base64 -import gc -import io -import os -from pathlib import Path - -import pytest -import requests -import torch -from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from PIL import Image - -from tests.e2e.accuracy.helpers import assert_image_sequence_similarity, model_output_dir -from tests.helpers.env import run_post_test_cleanup, run_pre_test_cleanup -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniServer - -pytestmark = [pytest.mark.full_model, pytest.mark.diffusion] - - -MODEL_ID = "Qwen/Qwen-Image-Layered" -MODEL_ENV_VAR = "QWEN_IMAGE_LAYERED_MODEL" -PROMPT = "decompose into layers" -NEGATIVE_PROMPT = " " -NUM_INFERENCE_STEPS = 20 -TRUE_CFG_SCALE = 4.0 -SEED = 777 -LAYERS = 3 -RESOLUTION = 640 -SSIM_THRESHOLD = 0.97 -PSNR_THRESHOLD = 30.0 - - -def _model_name() -> str: - return os.environ.get(MODEL_ENV_VAR, MODEL_ID) - - -def _local_files_only(model: str) -> bool: - return Path(model).exists() - - -def 
_normalize_layered_images(images: object) -> list[Image.Image]: - if not isinstance(images, list) or not images: - raise AssertionError(f"Unexpected layered output container: {type(images).__name__}") - - first_item = images[0] - if isinstance(first_item, Image.Image): - return [image.convert("RGBA") for image in images if isinstance(image, Image.Image)] - if isinstance(first_item, (list, tuple)): - return [image.convert("RGBA") for image in first_item if isinstance(image, Image.Image)] - raise AssertionError(f"Unexpected layered image element type: {type(first_item).__name__}") - - -def _run_vllm_omni_qwen_image_layered(*, model: str, input_image: Image.Image, output_dir: Path) -> list[Image.Image]: - input_image.save(output_dir / "input.png") - server_args = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] - with OmniServer(model, server_args, use_omni=True) as omni_server: - buffer = io.BytesIO() - input_image.save(buffer, format="PNG") - buffer.seek(0) - response = requests.post( - f"http://{omni_server.host}:{omni_server.port}/v1/images/edits", - data={ - "model": omni_server.model, - "prompt": PROMPT, - "size": "auto", - "n": 1, - "response_format": "b64_json", - "negative_prompt": NEGATIVE_PROMPT, - "num_inference_steps": NUM_INFERENCE_STEPS, - "true_cfg_scale": TRUE_CFG_SCALE, - "seed": SEED, - "layers": LAYERS, - "resolution": RESOLUTION, - }, - files=[("image", ("input.png", buffer, "image/png"))], - timeout=600, - ) - response.raise_for_status() - payload = response.json() - assert len(payload["data"]) == LAYERS - output_images = [] - for item in payload["data"]: - image_bytes = base64.b64decode(item["b64_json"]) - image = Image.open(io.BytesIO(image_bytes)).convert("RGBA") - image.load() - output_images.append(image) - for index, image in enumerate(output_images, start=1): - image.save(output_dir / f"vllm_omni_layer_{index}.png") - return output_images - - -def _run_diffusers_qwen_image_layered(*, model: str, input_image: 
Image.Image, output_dir: Path) -> list[Image.Image]: - run_pre_test_cleanup(enable_force=True) - pipe: DiffusionPipeline | None = None - try: - pipe = DiffusionPipeline.from_pretrained( - model, - torch_dtype=torch.bfloat16, - trust_remote_code=True, - local_files_only=_local_files_only(model), - ).to("cuda") - generator = torch.Generator(device="cuda").manual_seed(SEED) - result = pipe( # pyright: ignore[reportCallIssue] - image=input_image, - prompt=PROMPT, - negative_prompt=NEGATIVE_PROMPT, - num_inference_steps=NUM_INFERENCE_STEPS, - true_cfg_scale=TRUE_CFG_SCALE, - generator=generator, - num_images_per_prompt=1, - layers=LAYERS, - resolution=RESOLUTION, - ) - output_images = _normalize_layered_images(result.images) - assert len(output_images) == LAYERS, f"Expected {LAYERS} diffusers layers, got {len(output_images)}" - for index, image in enumerate(output_images, start=1): - image.save(output_dir / f"diffusers_layer_{index}.png") - return output_images - finally: - if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): - pipe.maybe_free_model_hooks() - del pipe - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - run_post_test_cleanup(enable_force=True) - - -@pytest.mark.benchmark -@hardware_test(res={"cuda": "H100"}, num_cards=1) -def test_qwen_image_layered_matches_diffusers(accuracy_artifact_root: Path, qwen_bear_image: Image.Image) -> None: - model = _model_name() - output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID) - input_image = qwen_bear_image.convert("RGBA") - - vllm_outputs = _run_vllm_omni_qwen_image_layered(model=model, input_image=input_image, output_dir=output_dir) - diffusers_outputs = _run_diffusers_qwen_image_layered(model=model, input_image=input_image, output_dir=output_dir) - - assert_image_sequence_similarity( - model_name=MODEL_ID, - vllm_images=vllm_outputs, - diffusers_images=diffusers_outputs, - ssim_threshold=SSIM_THRESHOLD, - psnr_threshold=PSNR_THRESHOLD, - compare_mode="RGBA", - ) diff 
--git a/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py b/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py index 1caef3bff54..3cdda1f9ffa 100644 --- a/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py +++ b/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py @@ -22,6 +22,7 @@ from diffusers import UniPCMultistepScheduler from PIL import Image +from tests.conftest import OmniServerParams from tests.e2e.accuracy.wan22_i2v.run_wan22_i2v_diffusers_cp import ( _configure_scheduler, _ensure_wan_ftfy_fallback, @@ -47,10 +48,7 @@ SSIM_THRESHOLD, WIDTH, ) -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniServerParams - -pytestmark = [pytest.mark.diffusion, pytest.mark.full_model] +from tests.utils import hardware_test def test_parse_video_metadata_extracts_dimensions_and_fps() -> None: @@ -539,7 +537,9 @@ def _generate_offline_video(*, image_source: str) -> tuple[Path, Path]: return offline_path, offline_metadata_path +@pytest.mark.advanced_model @pytest.mark.benchmark +@pytest.mark.diffusion @hardware_test(res={"cuda": "H100"}, num_cards=1) def test_wan22_i2v_diffusers_offline_generates_video( wan22_i2v_image_source: str | None, @@ -563,7 +563,9 @@ def test_wan22_i2v_diffusers_offline_generates_video( assert offline_metadata["frame_count"] == NUM_FRAMES +@pytest.mark.advanced_model @pytest.mark.benchmark +@pytest.mark.diffusion @hardware_test(res={"cuda": "H100"}, num_cards=2) @pytest.mark.parametrize("omni_server", SERVER_CASES, indirect=True) def test_wan22_i2v_online_serving_generates_video( @@ -592,7 +594,9 @@ def test_wan22_i2v_online_serving_generates_video( assert online_metadata["frame_count"] == NUM_FRAMES +@pytest.mark.advanced_model @pytest.mark.benchmark +@pytest.mark.diffusion @hardware_test(res={"cuda": "H100"}, num_cards=2) def test_wan22_i2v_serving_matches_diffusers_video_similarity( wan22_i2v_image_source: str | None, diff --git 
a/tests/e2e/offline_inference/custom_pipeline/qwen_image_pipeline_with_logprob.py b/tests/e2e/offline_inference/custom_pipeline/qwen_image_pipeline_with_logprob.py index 709c6655565..ed5b219f80f 100644 --- a/tests/e2e/offline_inference/custom_pipeline/qwen_image_pipeline_with_logprob.py +++ b/tests/e2e/offline_inference/custom_pipeline/qwen_image_pipeline_with_logprob.py @@ -6,8 +6,7 @@ This pipeline follows the structure of the user's reference implementation: - supports pre-tokenized prompt IDs via OmniCustomPrompt-style dict input - uses an SDE scheduler that can return step logprobs -- returns structured trajectory_* fields (latents, timesteps, log_probs) - consistent with the BAGEL trajectory recording design +- returns rich custom_output fields for testing """ from __future__ import annotations @@ -394,10 +393,10 @@ def forward( return DiffusionOutput( output=_maybe_to_cpu(image), - trajectory_latents=_maybe_to_cpu(all_latents), - trajectory_log_probs=_maybe_to_cpu(all_log_probs), - trajectory_timesteps=_maybe_to_cpu(all_timesteps), custom_output={ + "all_latents": _maybe_to_cpu(all_latents), + "all_log_probs": _maybe_to_cpu(all_log_probs), + "all_timesteps": _maybe_to_cpu(all_timesteps), "prompt_embeds": _maybe_to_cpu(prompt_embeds), "prompt_embeds_mask": _maybe_to_cpu(prompt_embeds_mask), "negative_prompt_embeds": _maybe_to_cpu(negative_prompt_embeds), diff --git a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_collective_rpc.py b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_collective_rpc.py index bd3f2e09975..57743d62bf6 100644 --- a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_collective_rpc.py +++ b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_collective_rpc.py @@ -26,7 +26,7 @@ import pytest -from tests.helpers.mark import hardware_test +from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from 
vllm_omni.outputs import OmniRequestOutput diff --git a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py index 0681687fe73..f1b4595c9df 100644 --- a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py +++ b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py @@ -1,12 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""E2E tests for AsyncOmni Qwen-Image generation with trajectory_* fields. - -Validates that the custom Qwen-Image pipeline returns structured trajectory -outputs (latents, timesteps, log_probs) via OmniRequestOutput's trajectory_* -fields instead of the legacy custom_output dict. -""" +"""E2E tests for AsyncOmni Qwen-Image generation flow (no Ray, no HTTP server).""" from __future__ import annotations @@ -19,7 +14,7 @@ import pytest from transformers import AutoTokenizer -from tests.helpers.mark import hardware_test +from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput @@ -196,17 +191,10 @@ async def test_async_omni_generate_with_logprobs(): _assert_valid_image_output(output) - assert output.trajectory_latents is not None, "trajectory_latents should be present" - assert hasattr(output.trajectory_latents, "shape") - assert output.trajectory_latents.numel() > 0 - - assert output.trajectory_timesteps is not None, "trajectory_timesteps should be present" - assert hasattr(output.trajectory_timesteps, "shape") - assert output.trajectory_timesteps.numel() > 0 - - assert output.trajectory_log_probs is not None, "trajectory_log_probs should be present when logprobs=True" - assert hasattr(output.trajectory_log_probs, "shape") - assert output.trajectory_log_probs.numel() > 
0 + all_log_probs = output.custom_output.get("all_log_probs") + assert all_log_probs is not None, "all_log_probs should be present when logprobs=True" + assert hasattr(all_log_probs, "shape") + assert all_log_probs.numel() > 0 @pytest.mark.core_model diff --git a/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py b/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py index 653b35d7e2f..ffbe703ca78 100644 --- a/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py +++ b/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py @@ -10,7 +10,7 @@ from tests.e2e.offline_inference.custom_pipeline.worker_extension import ( vLLMOmniColocateWorkerExtensionForTest, ) -from tests.helpers.mark import hardware_test +from tests.utils import hardware_test from vllm_omni.diffusion.worker.diffusion_worker import CustomPipelineWorkerExtension from vllm_omni.entrypoints.async_omni import AsyncOmni diff --git a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml new file mode 100644 index 00000000000..590244acd26 --- /dev/null +++ b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml @@ -0,0 +1,89 @@ +# stage config for running BAGEL with Mooncake connector for CI e2e tests. +# This config is optimized for single GPU tests with Mooncake inter-stage communication. 
+ +stage_args: + - stage_id: 0 + stage_type: llm + prompt_expand_func: vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts + runtime: + devices: "0" + engine_args: + model_stage: thinker + max_num_seqs: 1 + model_arch: BagelForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.45 + enforce_eager: true + trust_remote_code: true + engine_output_type: text + distributed_executor_backend: mp + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 1 + load_format: dummy + omni_kv_config: + need_send_cache: true + kv_transfer_criteria: + type: prefill_finished + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 52 + detokenize: true + repetition_penalty: 1.05 + output_connectors: + to_stage_1: mooncake_connector + - stage_id: 1 + stage_type: diffusion + cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches + runtime: + devices: "0" + engine_args: + model_stage: dit + max_num_seqs: 1 + gpu_memory_utilization: 0.45 + enforce_eager: true + trust_remote_code: true + engine_output_type: image + distributed_executor_backend: mp + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 1 + load_format: dummy + omni_kv_config: + need_recv_cache: true + engine_input_source: [0] + final_output: true + final_output_type: image + is_comprehension: false + default_sampling_params: + seed: 52 + input_connectors: + from_stage_0: mooncake_connector + +# Top-level runtime config with Mooncake connector +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + connectors: + mooncake_connector: + name: MooncakeConnector + extra: + host: "${MOONCAKE_HOST}" + metadata_server: "http://${MOONCAKE_HOST}:${MOONCAKE_HTTP_PORT}/metadata" + master: 
"${MOONCAKE_HOST}:${MOONCAKE_RPC_PORT}" + segment: 64000000 + localbuf: 64000000 + proto: tcp + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml new file mode 100644 index 00000000000..b7999652e23 --- /dev/null +++ b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml @@ -0,0 +1,87 @@ +# stage config for running BAGEL with SharedMemory connector for CI e2e tests. +# This config is optimized for single GPU tests with SharedMemory inter-stage communication. + +stage_args: + - stage_id: 0 + stage_type: llm + prompt_expand_func: vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts + runtime: + devices: "0" + engine_args: + model_stage: thinker + max_num_seqs: 1 + model_arch: OmniBagelForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.45 + enforce_eager: true + trust_remote_code: true + engine_output_type: text + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 1 + load_format: dummy + omni_kv_config: + need_send_cache: true + kv_transfer_criteria: + type: prefill_finished #or special token generated + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 52 + detokenize: True + repetition_penalty: 1.05 + + - stage_id: 1 + stage_type: diffusion + cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches + runtime: + devices: "0" + engine_args: + model_stage: dit + max_num_seqs: 1 + gpu_memory_utilization: 0.45 + enforce_eager: true + trust_remote_code: true + engine_output_type: image + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 
32768 + tensor_parallel_size: 1 + load_format: dummy + omni_kv_config: + need_recv_cache: true + engine_input_source: [0] + + final_output: true + final_output_type: image + is_comprehension: false + default_sampling_params: + seed: 52 + +# Runtime edges +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + # Distributed connectors configuration (optional) + # More connectors will be supported in the future. + connectors: + shared_memory_connector: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 # 64KB threshold + + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml new file mode 100644 index 00000000000..f93a6c71473 --- /dev/null +++ b/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml @@ -0,0 +1,103 @@ +# stage config for running qwen2.5-omni for multi-stage omni runtime. + +# This config is optimized for CI e2e tests. 
+stage_args: + - stage_id: 0 + runtime: + process: true # Run this stage in a separate process + devices: "0" + engine_args: + model_stage: thinker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + max_model_len: 896 + max_num_batched_tokens: 896 + max_num_seqs: 1 + gpu_memory_utilization: 0.8 + skip_mm_profiling: true + enforce_eager: true # Now we only support eager mode + trust_remote_code: true + engine_output_type: latent + enable_prefix_caching: false + mm_processor_cache_gb: 0 + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + - stage_id: 1 + runtime: + process: true + devices: "1" + engine_args: + model_stage: talker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + max_model_len: 896 + max_num_batched_tokens: 896 + max_num_seqs: 1 + gpu_memory_utilization: 0.8 + skip_mm_profiling: true + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: latent + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker + default_sampling_params: + temperature: 0.9 + top_p: 0.8 + top_k: 40 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + stop_token_ids: [8294] + - stage_id: 2 + runtime: + process: true + devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU + engine_args: + model_stage: code2wav + max_num_seqs: 1 + model_arch: Qwen2_5OmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + gpu_memory_utilization: 0.15 + enforce_eager: true + trust_remote_code: 
true + enable_prefix_caching: false + engine_output_type: audio + engine_input_source: [1] + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + +# Top-level runtime config (concise): default windows and stage edges +runtime: + enabled: true + defaults: + window_size: -1 # Simplified: trigger downstream only after full upstream completion + max_inflight: 1 # Simplified: process serially within each stage + edges: + - from: 0 # thinker → talker: trigger only after receiving full input (-1) + to: 1 + window_size: -1 + - from: 1 # talker → code2wav: trigger only after receiving full input (-1) + to: 2 + window_size: -1 diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index b4de059f2d0..a0c3f6cc9fc 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -15,49 +15,47 @@ """ import socket +from pathlib import Path from typing import Any import pytest from PIL import Image from vllm.assets.image import ImageAsset -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from tests.conftest import modify_stage_config +from tests.utils import hardware_test from vllm_omni.entrypoints.omni import Omni from vllm_omni.platforms import current_omni_platform -BAGEL_CI_DEPLOY = get_deploy_config_path("ci/bagel.yaml") - # Reference pixel data extracted from the known-good output image # Generated with seed=52, num_inference_steps=15, # prompt='Change the grass color to red', # input image: 2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (156, 172, 217)}, - {"position": (400, 50), "rgb": (105, 144, 217)}, - {"position": (700, 100), 
"rgb": (118, 159, 232)}, - {"position": (150, 400), "rgb": (180, 22, 52)}, - {"position": (512, 336), "rgb": (221, 211, 194)}, - {"position": (700, 400), "rgb": (192, 10, 46)}, - {"position": (100, 600), "rgb": (102, 12, 22)}, - {"position": (400, 600), "rgb": (161, 28, 47)}, - {"position": (700, 600), "rgb": (100, 87, 94)}, - {"position": (256, 256), "rgb": (181, 201, 221)}, + {"position": (100, 100), "rgb": (157, 172, 217)}, + {"position": (400, 50), "rgb": (105, 144, 218)}, + {"position": (700, 100), "rgb": (118, 159, 233)}, + {"position": (150, 400), "rgb": (195, 34, 60)}, + {"position": (512, 336), "rgb": (222, 214, 193)}, + {"position": (700, 400), "rgb": (197, 15, 43)}, + {"position": (100, 600), "rgb": (105, 13, 18)}, + {"position": (400, 600), "rgb": (169, 33, 44)}, + {"position": (700, 600), "rgb": (101, 86, 93)}, + {"position": (256, 256), "rgb": (181, 202, 222)}, ] if current_omni_platform.is_rocm(): REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (156, 172, 217)}, - {"position": (400, 50), "rgb": (105, 144, 217)}, - {"position": (700, 100), "rgb": (118, 159, 232)}, - {"position": (150, 400), "rgb": (180, 22, 52)}, - {"position": (512, 336), "rgb": (221, 211, 194)}, - {"position": (700, 400), "rgb": (192, 10, 46)}, - {"position": (100, 600), "rgb": (102, 12, 22)}, - {"position": (400, 600), "rgb": (161, 28, 47)}, - {"position": (700, 600), "rgb": (100, 87, 94)}, - {"position": (256, 256), "rgb": (181, 201, 221)}, + {"position": (100, 100), "rgb": (156, 172, 215)}, + {"position": (400, 50), "rgb": (106, 144, 216)}, + {"position": (700, 100), "rgb": (118, 158, 231)}, + {"position": (150, 400), "rgb": (183, 23, 48)}, + {"position": (512, 336), "rgb": (218, 215, 191)}, + {"position": (700, 400), "rgb": (194, 14, 42)}, + {"position": (100, 600), "rgb": (105, 10, 16)}, + {"position": (400, 600), "rgb": (167, 33, 46)}, + {"position": (700, 600), "rgb": (102, 86, 92)}, + {"position": (256, 256), "rgb": (181, 201, 220)}, ] PIXEL_TOLERANCE = 10 @@ -184,8 
+182,8 @@ def _generate_bagel_img2img( return generated_image -def _resolve_deploy_config(config_path: str, run_level: str) -> str: - """Resolve deploy config based on run level. +def _resolve_stage_config(config_path: str, run_level: str) -> str: + """Resolve stage config based on run level. For advanced_model (real weights), strip load_format: dummy so the model falls back to loading real weights from HuggingFace. @@ -194,9 +192,9 @@ def _resolve_deploy_config(config_path: str, run_level: str) -> str: return modify_stage_config( config_path, deletes={ - "stages": { - 0: ["load_format"], - 1: ["load_format"], + "stage_args": { + 0: ["engine_args.load_format"], + 1: ["engine_args.load_format"], } }, ) @@ -210,11 +208,13 @@ def _resolve_deploy_config(config_path: str, run_level: str) -> str: def test_bagel_img2img_shared_memory_connector(run_level): """Test Bagel img2img with shared memory connector.""" input_image = _load_input_image() - config_path = _resolve_deploy_config(BAGEL_CI_DEPLOY, run_level) - with OmniRunner( - "ByteDance-Seed/BAGEL-7B-MoT", - stage_configs_path=config_path, - ) as runner: - generated_image = _generate_bagel_img2img(runner.omni, input_image) + config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") + config_path = _resolve_stage_config(config_path, run_level) + omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300) + + try: + generated_image = _generate_bagel_img2img(omni, input_image) if run_level == "advanced_model": _validate_pixels(generated_image) + finally: + omni.close() diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py deleted file mode 100644 index 785d0c7fb8f..00000000000 --- a/tests/e2e/offline_inference/test_bagel_lora.py +++ /dev/null @@ -1,195 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -End-to-end test 
for BAGEL LoRA support (Stage 1 / DiT). - -Validates that LoRA adapters are correctly loaded, applied with controllable -scale, and cleanly deactivated. Uses a synthetic rank-1 adapter targeting the -first decoder layer's QKV projection. - -Assertions: - (a) LoRA at scale=1.0 visibly changes the output (diff > 0.5) - (b) scale=2.0 produces a larger delta than scale=1.0 (linearity) - (c) The delta is bounded (diff < 80, not corrupted) - (d) Deactivating LoRA exactly restores the baseline (diff == 0) -""" - -import json -import os -from pathlib import Path - -from vllm_omni.inputs.data import OmniSamplingParams -from vllm_omni.outputs import OmniRequestOutput - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - -import numpy as np -import pytest -import torch -from PIL import Image -from safetensors.torch import save_file - -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config -from vllm_omni.entrypoints.omni import Omni -from vllm_omni.lora.request import LoRARequest -from vllm_omni.lora.utils import stable_lora_int_id - -MODEL = "ByteDance-Seed/BAGEL-7B-MoT" -BAGEL_STAGE_CONFIG = get_deploy_config_path("ci/bagel.yaml") -DEFAULT_PROMPT = "<|im_start|>A cute cat<|im_end|>" - - -# --------------------------------------------------------------------------- -# Helpers (reused from test_bagel_text2img.py patterns) -# --------------------------------------------------------------------------- - - -def _resolve_deploy_config(config_path: str, run_level: str) -> str: - if run_level == "advanced_model": - return modify_stage_config( - config_path, - deletes={ - "stages": { - 0: ["load_format"], - 1: ["load_format"], - } - }, - ) - return config_path - - -def _configure_sampling_params(omni: Omni, num_inference_steps: int = 10) -> list[OmniSamplingParams]: - params_list = omni.default_sampling_params_list - if len(params_list) > 1: - 
params_list[1].num_inference_steps = num_inference_steps - params_list[1].extra_args = { - "cfg_text_scale": 4.0, - "cfg_img_scale": 1.5, - } - return params_list - - -def _extract_generated_image(omni_outputs: list[OmniRequestOutput]) -> Image.Image | None: - for req_output in omni_outputs: - if req_output.images: - return req_output.images[0] - return None - - -def _generate_bagel_image(omni: Omni) -> Image.Image: - params_list = _configure_sampling_params(omni) - params_list[1].lora_request = None - outputs = list( - omni.generate( - prompts=[{"prompt": DEFAULT_PROMPT, "modalities": ["image"]}], - sampling_params_list=params_list, - ) - ) - img = _extract_generated_image(outputs) - assert img is not None, "No image generated" - return img - - -def _generate_bagel_image_with_lora( - omni: Omni, - lora_request: LoRARequest, - lora_scale: float = 1.0, -) -> Image.Image: - params_list = _configure_sampling_params(omni) - params_list[1].lora_request = lora_request - params_list[1].lora_scale = lora_scale - outputs = list( - omni.generate( - prompts=[{"prompt": DEFAULT_PROMPT, "modalities": ["image"]}], - sampling_params_list=params_list, - ) - ) - img = _extract_generated_image(outputs) - assert img is not None, "No image generated with LoRA" - return img - - -# BAGEL uses GQA: hidden_size=3584, 28 Q heads, 4 KV heads, head_dim=128 -# QKV packed dim = 28*128 + 4*128 + 4*128 = 3584 + 512 + 512 = 4608 -_LORA_DIM = 3584 -_LORA_QKV_DIM = 4608 -_LORA_MODULE = "bagel.language_model.model.layers.0.self_attn.qkv_proj" -_LORA_RANK = 4 - - -def _make_file_lora_request(adapter_dir: Path) -> LoRARequest: - """Write synthetic adapter to disk and return a file-backed LoRARequest.""" - adapter_dir.mkdir(parents=True, exist_ok=True) - gen = torch.Generator().manual_seed(42) - lora_a = torch.randn((_LORA_RANK, _LORA_DIM), dtype=torch.float32, generator=gen) * 0.1 - lora_b = torch.randn((_LORA_QKV_DIM, _LORA_RANK), dtype=torch.float32, generator=gen) * 0.5 - save_file( - { - 
f"base_model.model.{_LORA_MODULE}.lora_A.weight": lora_a, - f"base_model.model.{_LORA_MODULE}.lora_B.weight": lora_b, - }, - str(adapter_dir / "adapter_model.safetensors"), - ) - (adapter_dir / "adapter_config.json").write_text( - json.dumps({"r": _LORA_RANK, "lora_alpha": _LORA_RANK, "target_modules": [_LORA_MODULE]}), - encoding="utf-8", - ) - lora_dir = str(adapter_dir) - return LoRARequest(lora_name="test_file", lora_int_id=stable_lora_int_id(lora_dir), lora_path=lora_dir) - - -# --------------------------------------------------------------------------- -# Test -# --------------------------------------------------------------------------- - - -@pytest.mark.core_model -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) -def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): - """Validate LoRA effect, bounded perturbation, and clean deactivation.""" - config_path = _resolve_deploy_config(BAGEL_STAGE_CONFIG, run_level) - with OmniRunner(MODEL, stage_configs_path=config_path) as runner: - omni = runner.omni - lora_request = _make_file_lora_request(tmp_path / "bagel_lora") - - # 1) Baseline (no LoRA) - baseline = _generate_bagel_image(omni) - - # 2) LoRA with scale=1.0 - img_1x = _generate_bagel_image_with_lora(omni, lora_request, lora_scale=1.0) - - # 3) LoRA with scale=2.0 - img_2x = _generate_bagel_image_with_lora(omni, lora_request, lora_scale=2.0) - - # 4) No LoRA again (deactivation) - restored = _generate_bagel_image(omni) - - baseline_arr = np.array(baseline, dtype=np.int16) - img_1x_arr = np.array(img_1x, dtype=np.int16) - img_2x_arr = np.array(img_2x, dtype=np.int16) - restored_arr = np.array(restored, dtype=np.int16) - - diff_1x = np.abs(baseline_arr - img_1x_arr).mean() - diff_2x = np.abs(baseline_arr - img_2x_arr).mean() - diff_restored = np.abs(baseline_arr - restored_arr).mean() - - # (a) Adapter has visible effect at both scales - assert diff_1x > 0.5, f"LoRA scale=1.0 had no visible 
effect: diff={diff_1x}" - assert diff_2x > 0.5, f"LoRA scale=2.0 had no visible effect: diff={diff_2x}" - - # (b) Different scales produce different outputs - assert not np.isclose(diff_1x, diff_2x, atol=1.0), ( - f"LoRA scale has no effect: diff_1x={diff_1x:.2f}, diff_2x={diff_2x:.2f}" - ) - - # (c) Output is not corrupted (scale=2.0 can produce ~2x the diff of scale=1.0) - assert diff_1x < 80, f"LoRA output looks corrupted: diff_1x={diff_1x}" - assert diff_2x < 120, f"LoRA output looks corrupted: diff_2x={diff_2x}" - - # (d) Deactivation fully restores base model - assert diff_restored == 0.0, f"Base model not restored after LoRA deactivation: diff={diff_restored}" diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index 65cd8425cd0..7cce8da3a73 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -16,54 +16,52 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" import signal import socket import subprocess import tempfile import time +from pathlib import Path from typing import Any import pytest from PIL import Image -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from tests.conftest import modify_stage_config +from tests.utils import hardware_test from vllm_omni.entrypoints.omni import Omni from vllm_omni.platforms import current_omni_platform -BAGEL_CI_DEPLOY = get_deploy_config_path("ci/bagel.yaml") -BAGEL_MOONCAKE_CI_DEPLOY = get_deploy_config_path("ci/bagel_mooncake.yaml") - # Reference pixel data extracted from the known-good output image # Each entry contains (x, y) position and expected (R, G, B) values # "Generated with seed=52, num_inference_steps=15, # prompt='A futuristic city skyline at twilight, cyberpunk style'" 
REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (115, 113, 94)}, - {"position": (400, 50), "rgb": (159, 160, 144)}, - {"position": (700, 100), "rgb": (164, 151, 123)}, - {"position": (150, 400), "rgb": (120, 121, 107)}, - {"position": (512, 512), "rgb": (165, 133, 127)}, - {"position": (700, 400), "rgb": (217, 130, 66)}, - {"position": (100, 700), "rgb": (191, 168, 152)}, - {"position": (400, 700), "rgb": (130, 96, 77)}, - {"position": (700, 700), "rgb": (247, 203, 140)}, - {"position": (256, 256), "rgb": (167, 156, 150)}, + {"position": (100, 100), "rgb": (121, 118, 100)}, + {"position": (400, 50), "rgb": (163, 162, 143)}, + {"position": (700, 100), "rgb": (170, 156, 127)}, + {"position": (150, 400), "rgb": (129, 127, 112)}, + {"position": (512, 512), "rgb": (135, 61, 59)}, + {"position": (700, 400), "rgb": (205, 107, 43)}, + {"position": (100, 700), "rgb": (197, 177, 157)}, + {"position": (400, 700), "rgb": (139, 107, 86)}, + {"position": (700, 700), "rgb": (247, 205, 146)}, + {"position": (256, 256), "rgb": (171, 160, 153)}, ] if current_omni_platform.is_rocm(): REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (115, 113, 94)}, - {"position": (400, 50), "rgb": (159, 160, 144)}, - {"position": (700, 100), "rgb": (164, 151, 123)}, - {"position": (150, 400), "rgb": (120, 121, 107)}, - {"position": (512, 512), "rgb": (165, 133, 127)}, - {"position": (700, 400), "rgb": (217, 130, 66)}, - {"position": (100, 700), "rgb": (191, 168, 152)}, - {"position": (400, 700), "rgb": (130, 96, 77)}, - {"position": (700, 700), "rgb": (247, 203, 140)}, - {"position": (256, 256), "rgb": (167, 156, 150)}, + {"position": (100, 100), "rgb": (123, 119, 100)}, + {"position": (400, 50), "rgb": (162, 161, 142)}, + {"position": (700, 100), "rgb": (171, 156, 127)}, + {"position": (150, 400), "rgb": (131, 128, 112)}, + {"position": (512, 512), "rgb": (134, 61, 59)}, + {"position": (700, 400), "rgb": (204, 107, 43)}, + {"position": (100, 700), "rgb": (201, 180, 165)}, + {"position": 
(400, 700), "rgb": (140, 108, 87)}, + {"position": (700, 700), "rgb": (247, 205, 145)}, + {"position": (256, 256), "rgb": (171, 160, 153)}, ] # Maximum allowed difference per color channel @@ -174,8 +172,8 @@ def _generate_bagel_image(omni: Omni, prompt: str = DEFAULT_PROMPT) -> Image.Ima return generated_image -def _resolve_deploy_config(config_path: str, run_level: str) -> str: - """Resolve deploy config based on run level. +def _resolve_stage_config(config_path: str, run_level: str) -> str: + """Resolve stage config based on run level. For advanced_model (real weights), strip load_format: dummy so the model falls back to loading real weights from HuggingFace. @@ -184,9 +182,9 @@ def _resolve_deploy_config(config_path: str, run_level: str) -> str: return modify_stage_config( config_path, deletes={ - "stages": { - 0: ["load_format"], - 1: ["load_format"], + "stage_args": { + 0: ["engine_args.load_format"], + 1: ["engine_args.load_format"], } }, ) @@ -199,14 +197,16 @@ def _resolve_deploy_config(config_path: str, run_level: str) -> str: @hardware_test(res={"cuda": "H100", "rocm": "MI325"}) def test_bagel_text2img_shared_memory_connector(run_level): """Test Bagel text2img with shared memory connector.""" - config_path = _resolve_deploy_config(BAGEL_CI_DEPLOY, run_level) - with OmniRunner( - "ByteDance-Seed/BAGEL-7B-MoT", - stage_configs_path=config_path, - ) as runner: - generated_image = _generate_bagel_image(runner.omni) + config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") + config_path = _resolve_stage_config(config_path, run_level) + omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300) + + try: + generated_image = _generate_bagel_image(omni) if run_level == "advanced_model": _validate_pixels(generated_image) + finally: + omni.close() def _wait_for_port(host: str, port: int, timeout: int = 30) -> bool: @@ -278,7 +278,7 @@ def _cleanup_mooncake_processes(timeout_secs: int = 
5) -> None: def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str: - """Load Mooncake config from CI overlay and substitute placeholders. + """Load Mooncake config from YAML and substitute placeholders. Args: host: Mooncake host address. @@ -288,13 +288,16 @@ def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str: Returns: Path to the temporary config file with substituted values. """ - with open(BAGEL_MOONCAKE_CI_DEPLOY) as f: + config_path = str(Path(__file__).parent / "stage_configs" / "bagel_mooncake_ci.yaml") + with open(config_path) as f: config_content = f.read() + # Substitute placeholders config_content = config_content.replace("${MOONCAKE_HOST}", host) config_content = config_content.replace("${MOONCAKE_RPC_PORT}", str(rpc_port)) config_content = config_content.replace("${MOONCAKE_HTTP_PORT}", str(http_port)) + # Write to temp file temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) temp_file.write(config_content) temp_file.close() @@ -316,6 +319,7 @@ def test_bagel_text2img_mooncake_connector(run_level): mooncake_master_proc = None temp_config_file = None + omni = None try: _cleanup_mooncake_processes() @@ -344,17 +348,16 @@ def test_bagel_text2img_mooncake_connector(run_level): http_port=MOONCAKE_HTTP_PORT, ) - temp_config_file = _resolve_deploy_config(temp_config_file, run_level) - with OmniRunner( - "ByteDance-Seed/BAGEL-7B-MoT", - stage_configs_path=temp_config_file, - stage_init_timeout=300, - ) as runner: - generated_image = _generate_bagel_image(runner.omni) - if run_level == "advanced_model": - _validate_pixels(generated_image) + temp_config_file = _resolve_stage_config(temp_config_file, run_level) + omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300) + + generated_image = _generate_bagel_image(omni) + if run_level == "advanced_model": + _validate_pixels(generated_image) finally: + if omni: + omni.close() if 
temp_config_file: try: os.unlink(temp_config_file) diff --git a/tests/e2e/offline_inference/test_bagel_understanding.py b/tests/e2e/offline_inference/test_bagel_understanding.py deleted file mode 100644 index e342152fc02..00000000000 --- a/tests/e2e/offline_inference/test_bagel_understanding.py +++ /dev/null @@ -1,135 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -End-to-end tests for Bagel text2text and img2text (understanding) tasks. - -These tests validate that the Bagel multistage pipeline correctly generates -text output for understanding tasks, matching reference results. - -Equivalent to running: - python3 examples/offline_inference/bagel/end2end.py \ - --modality text2text \ - --prompts "Where is the capital of France?" - - python3 examples/offline_inference/bagel/end2end.py \ - --modality img2text \ - --prompts "Please describe this image" \ - --image-path 2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg -""" - -import os - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - -import pytest -from vllm.assets.image import ImageAsset - -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config - -MODEL_NAME = "ByteDance-Seed/BAGEL-7B-MoT" -STAGE_CONFIG = get_deploy_config_path("ci/bagel.yaml") - -REFERENCE_TEXT_TEXT2TEXT = "The capital of France is Paris." - -REFERENCE_TEXT_IMG2TEXT = ( - "This is a photo of a wooden boardwalk or pathway that leads through " - "tall green grass. The path appears to be in a natural setting, possibly " - "a wetland or marsh area. The sky above is blue with some scattered " - "clouds, suggesting it might be a sunny day. The overall scene looks " - "peaceful and serene." 
-) - - -def _resolve_deploy_config(config_path: str, run_level: str) -> str: - """Strip load_format: dummy for advanced_model (real weights).""" - if run_level == "advanced_model": - return modify_stage_config( - config_path, - deletes={ - "stages": { - 0: ["load_format"], - 1: ["load_format"], - } - }, - ) - return config_path - - -def _extract_text(omni_outputs: list) -> str: - """Extract generated text from OmniRequestOutput list.""" - for req_output in omni_outputs: - ro = getattr(req_output, "request_output", None) - if ro and getattr(ro, "outputs", None): - return "".join(getattr(o, "text", "") or "" for o in ro.outputs) - return "" - - -@pytest.mark.core_model -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) -def test_bagel_text2text(run_level): - """Test Bagel text2text produces correct text output.""" - config_path = _resolve_deploy_config(STAGE_CONFIG, run_level) - with OmniRunner( - MODEL_NAME, - stage_configs_path=config_path, - ) as runner: - omni = runner.omni - prompt = "<|im_start|>user\nWhere is the capital of France?<|im_end|>\n<|im_start|>assistant\n" - params_list = omni.default_sampling_params_list - omni_outputs = list( - omni.generate( - prompts=[{"prompt": prompt, "modalities": ["text"]}], - sampling_params_list=params_list, - ) - ) - - assert len(omni_outputs) > 0, "No outputs returned" - text = _extract_text(omni_outputs) - assert len(text) > 0, "Generated text is empty" - - if run_level == "advanced_model": - assert text == REFERENCE_TEXT_TEXT2TEXT, ( - f"Text mismatch: expected {REFERENCE_TEXT_TEXT2TEXT!r}, got {text!r}" - ) - - -@pytest.mark.core_model -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) -def test_bagel_img2text(run_level): - """Test Bagel img2text produces correct text output.""" - input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") - config_path = 
_resolve_deploy_config(STAGE_CONFIG, run_level) - with OmniRunner( - MODEL_NAME, - stage_configs_path=config_path, - stage_init_timeout=300, - ) as runner: - omni = runner.omni - prompt = "<|im_start|>user\n<|image_pad|>\nPlease describe this image<|im_end|>\n<|im_start|>assistant\n" - params_list = omni.default_sampling_params_list - omni_outputs = list( - omni.generate( - prompts=[ - { - "prompt": prompt, - "multi_modal_data": {"image": input_image}, - "modalities": ["text"], - } - ], - sampling_params_list=params_list, - ) - ) - - assert len(omni_outputs) > 0, "No outputs returned" - text = _extract_text(omni_outputs) - assert len(text) > 0, "Generated text is empty" - - if run_level == "advanced_model": - assert text == REFERENCE_TEXT_IMG2TEXT, f"Text mismatch: expected {REFERENCE_TEXT_IMG2TEXT!r}, got {text!r}" diff --git a/tests/e2e/offline_inference/test_cache_dit.py b/tests/e2e/offline_inference/test_cache_dit.py index 1577dd9f6db..0e31413dc07 100644 --- a/tests/e2e/offline_inference/test_cache_dit.py +++ b/tests/e2e/offline_inference/test_cache_dit.py @@ -8,15 +8,27 @@ It uses minimal settings to keep test time short for CI. 
""" +import os +import sys +from pathlib import Path + import pytest import torch -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner +from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +# ruff: noqa: E402 +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" + # Use random weights model for testing models = ["riverclouds/qwen_image_random"] @@ -36,17 +48,20 @@ def test_cache_dit(model_name: str): "residual_diff_threshold": 0.24, "max_continuous_cached_steps": 3, } - with OmniRunner( - model_name, - cache_backend="cache_dit", - cache_config=cache_config, - ) as runner: + m = None + try: + m = Omni( + model=model_name, + cache_backend="cache_dit", + cache_config=cache_config, + ) + # Use minimal settings for fast testing height = 256 width = 256 num_inference_steps = 4 # Minimal steps for fast test - outputs = runner.omni.generate( + outputs = m.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -75,3 +90,9 @@ def test_cache_dit(model_name: str): # Check image size assert images[0].width == width assert images[0].height == height + except Exception as e: + print(f"Test failed with error: {e}") + raise + finally: + if m is not None and hasattr(m, "close"): + m.close() diff --git a/tests/e2e/offline_inference/test_cosyvoice3.py b/tests/e2e/offline_inference/test_cosyvoice3.py deleted file mode 100644 index 7206f1e7b0c..00000000000 --- a/tests/e2e/offline_inference/test_cosyvoice3.py +++ /dev/null @@ -1,200 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Offline E2E smoke test for CosyVoice3 zero-shot 
reference inference. - -This test uses the official upstream zero-shot prompt text/audio pair and -verifies a stable reference recipe: -- config-derived top_p/top_k and token-length ratios -- model EOS token as the stop token -- a conservative repetition penalty to avoid degenerate loops -""" - -from __future__ import annotations - -import functools -import io -import os -from pathlib import Path -from urllib.request import urlopen - -import numpy as np -import pytest -import soundfile as sf -from huggingface_hub import snapshot_download -from vllm.sampling_params import SamplingParams - -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import get_deploy_config_path -from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config -from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer - -MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512" -MODEL_DIR_ENV = "VLLM_OMNI_COSYVOICE3_MODEL_DIR" - -REFERENCE_PROMPT_WAV_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav" -REFERENCE_PROMPT_TEXT = "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。" -REFERENCE_SYNTH_TEXT = ( - "CosyVoice is undergoing a comprehensive upgrade, providing more accurate, " - "stable, faster, and better voice generation capabilities." 
-) -REFERENCE_STAGE0_TEMPERATURE = 1.0 -REFERENCE_STAGE0_REPETITION_PENALTY = 2.0 - - -ASYNC_CHUNK_MODES = [ - pytest.param(False, id="sync"), - pytest.param(True, id="async_chunk"), -] - - -@functools.lru_cache(maxsize=1) -def _load_reference_prompt_wav() -> tuple[np.ndarray, int]: - with urlopen(REFERENCE_PROMPT_WAV_URL, timeout=30) as resp: - data = resp.read() - audio, sr = sf.read(io.BytesIO(data), dtype="float32", always_2d=False) - if isinstance(audio, np.ndarray) and audio.ndim > 1: - audio = np.mean(audio, axis=-1) - return np.asarray(audio, dtype=np.float32), int(sr) - - -@functools.lru_cache(maxsize=1) -def _resolve_model_dir() -> Path: - override = os.environ.get(MODEL_DIR_ENV) - if override: - return Path(override).expanduser().resolve() - return Path(snapshot_download(MODEL, allow_patterns=["*"])) - - -def _reference_zero_shot_stage0_sampling(*, text: str) -> SamplingParams: - config = CosyVoice3Config() - sampling_cfg = config.llm.get("sampling", {}) - eos_token_id = int(config.llm["eos_token_id"]) - model_dir = _resolve_model_dir() - tokenizer = get_qwen_tokenizer( - token_path=str(model_dir / config.qwen_pretrain_path), - skip_special_tokens=config.skip_special_tokens, - version=config.version, - ) - text_len = max(1, len(tokenizer.encode(text, allowed_special=config.allowed_special))) - return SamplingParams( - temperature=REFERENCE_STAGE0_TEMPERATURE, - top_p=float(sampling_cfg.get("top_p", 0.8)), - top_k=int(sampling_cfg.get("top_k", 25)), - repetition_penalty=REFERENCE_STAGE0_REPETITION_PENALTY, - stop_token_ids=[eos_token_id], - min_tokens=int(text_len * config.min_token_text_ratio), - max_tokens=int(text_len * config.max_token_text_ratio), - ) - - -def _concat_audio(audio_val) -> np.ndarray: - import torch - - if isinstance(audio_val, list): - tensors = [] - for t in audio_val: - if t is None: - continue - if hasattr(t, "detach"): - t = t.detach() - if hasattr(t, "cpu"): - t = t.cpu() - if hasattr(t, "float"): - t = t.float() - if 
isinstance(t, torch.Tensor): - tensors.append(t.reshape(-1)) - if not tensors: - return np.zeros((0,), dtype=np.float32) - return torch.cat(tensors, dim=-1).numpy().astype(np.float32, copy=False) - - if hasattr(audio_val, "detach"): - audio_val = audio_val.detach() - if hasattr(audio_val, "cpu"): - audio_val = audio_val.cpu() - if hasattr(audio_val, "float"): - audio_val = audio_val.float() - if hasattr(audio_val, "numpy"): - audio_val = audio_val.numpy() - audio_np = np.asarray(audio_val, dtype=np.float32) - return audio_np.reshape(-1) - - -def _get_stage_engine_outputs(omni_runner: OmniRunner, stage_id: int): - stage_list = getattr(omni_runner.omni, "stage_list", None) - if stage_list is not None: - return getattr(stage_list[stage_id], "engine_outputs", None) or [] - - stage_clients = getattr(getattr(omni_runner.omni, "engine", None), "stage_clients", None) - if stage_clients is not None: - return getattr(stage_clients[stage_id], "engine_outputs", None) or [] - - raise AttributeError("Unable to locate stage outputs on Omni runner") - - -def _build_reference_inputs(prompt_audio: tuple[np.ndarray, int]) -> list[dict[str, object]]: - return [ - { - "prompt": REFERENCE_SYNTH_TEXT, - "multi_modal_data": {"audio": prompt_audio}, - "modalities": ["audio"], - "mm_processor_kwargs": {"prompt_text": REFERENCE_PROMPT_TEXT}, - } - ] - - -@pytest.mark.core_model -@pytest.mark.omni -@hardware_test(res={"cuda": "L4"}, num_cards=1) -@pytest.mark.parametrize("async_chunk", ASYNC_CHUNK_MODES) -def test_cosyvoice3_offline_reference_zero_shot(async_chunk: bool) -> None: - """CosyVoice3 zero-shot reference inference should stop cleanly and produce sane audio.""" - prompt_audio, prompt_sr = _load_reference_prompt_wav() - model_dir = _resolve_model_dir() - expected_stop_token = int(CosyVoice3Config().llm["eos_token_id"]) - - with OmniRunner( - str(model_dir), - seed=42, - stage_configs_path=get_deploy_config_path("cosyvoice3.yaml"), - async_chunk=async_chunk, - stage_init_timeout=300, 
- ) as omni_runner: - sampling_params_list = omni_runner.get_default_sampling_params_list() - sampling_params_list[0] = _reference_zero_shot_stage0_sampling(text=REFERENCE_SYNTH_TEXT) - - outputs = omni_runner.omni.generate(_build_reference_inputs((prompt_audio, prompt_sr)), sampling_params_list) - - assert outputs, "No outputs returned" - audio_mm = outputs[0].multimodal_output - assert "audio" in audio_mm, "No audio output found" - - audio = _concat_audio(audio_mm["audio"]) - assert audio.size > 0, "Generated audio is empty" - - sr_val = audio_mm.get("sr", 24000) - if isinstance(sr_val, list) and sr_val: - sr_val = sr_val[-1] - if hasattr(sr_val, "item"): - sr_val = sr_val.item() - sr = int(sr_val) - assert sr == 24000, f"Unexpected sample_rate={sr}" - - duration_s = audio.size / sr - assert 2.8 <= duration_s <= 8.8, f"Unexpected duration={duration_s:.3f}s (samples={audio.size}, sr={sr})" - - stage0_outputs = _get_stage_engine_outputs(omni_runner, 0) - if stage0_outputs: - completion = stage0_outputs[0].outputs[0] - finish_reason = getattr(completion, "finish_reason", None) - stop_reason = getattr(completion, "stop_reason", None) - num_tokens = len(getattr(completion, "token_ids", []) or []) - - assert finish_reason == "stop", f"Stage-0 finish_reason={finish_reason}, expected 'stop'" - assert int(stop_reason) == expected_stop_token, ( - f"Stage-0 stop_reason={stop_reason}, expected {expected_stop_token}" - ) - assert 80 <= num_tokens <= 220, f"Stage-0 num_tokens={num_tokens}, expected sane stop-bound range" - else: - assert async_chunk, "Stage-0 produced no engine outputs" diff --git a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py index d7fd6f72f5b..f3830f02e97 100644 --- a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py @@ -1,15 +1,22 @@ import gc +import sys +from pathlib import Path import pytest import torch from 
vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from tests.helpers.env import DeviceMemoryMonitor -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner +from tests.utils import DeviceMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform +# ruff: noqa: E402 +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from vllm_omni import Omni + models = ["riverclouds/qwen_image_random"] @@ -20,29 +27,30 @@ def inference(model_name: str, offload: bool = True): current_omni_platform.reset_peak_memory_stats() monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - with OmniRunner( - model_name, + m = Omni( + model=model_name, # TODO: we might want to add overlapped feature e2e tests # cache_backend="cache_dit", enable_cpu_offload=offload, - ) as runner: - current_omni_platform.reset_peak_memory_stats() - height = 256 - width = 256 + ) + current_omni_platform.reset_peak_memory_stats() + height = 256 + width = 256 - runner.omni.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=9, - guidance_scale=0.0, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) + m.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=9, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) peak = monitor.peak_used_mb monitor.stop() + del m gc.collect() current_omni_platform.empty_cache() diff --git a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py index 4f19c100476..6132f1bd0eb 
100644 --- a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py @@ -1,12 +1,21 @@ +import sys +from pathlib import Path + import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from tests.helpers.env import DeviceMemoryMonitor -from tests.helpers.runtime import OmniRunner +from tests.utils import DeviceMemoryMonitor from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform +# ruff: noqa: E402 +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from vllm_omni import Omni + # Models to test and expected saved memory in MB, correspondingly MODELS_SAVED_MEMORY_MB = { "riverclouds/qwen_image_random": 4500, @@ -24,33 +33,34 @@ def run_inference( monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - with OmniRunner( - model_name, + m = Omni( + model=model_name, enable_layerwise_offload=layerwise_offload, # TODO: we might want to add overlapped feature e2e tests # cache_backend="cache_dit", boundary_ratio=0.875, flow_shift=5.0, - ) as runner: - current_omni_platform.reset_peak_memory_stats() - - # Refer to tests/e2e/offline_inference/test_t2v_model.py - # Use minimal settings for testing - height = 480 - width = 640 - num_frames = 5 - - runner.omni.generate( - "A cat sitting on a table", - OmniDiffusionSamplingParams( - height=height, - width=width, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - guidance_scale=1.0, - num_inference_steps=num_inference_steps, - num_frames=num_frames, - ), - ) + ) + + current_omni_platform.reset_peak_memory_stats() + + # Refer to tests/e2e/offline_inference/test_t2v_model.py + # Use minimal settings for testing + height = 480 + width = 640 + num_frames = 5 + + m.generate( + "A cat sitting on a table", + 
OmniDiffusionSamplingParams( + height=height, + width=width, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + guidance_scale=1.0, + num_inference_steps=num_inference_steps, + num_frames=num_frames, + ), + ) peak = monitor.peak_used_mb monitor.stop() diff --git a/tests/e2e/offline_inference/test_diffusion_lora.py b/tests/e2e/offline_inference/test_diffusion_lora.py index 027dadb3f4e..b414fe30eeb 100644 --- a/tests/e2e/offline_inference/test_diffusion_lora.py +++ b/tests/e2e/offline_inference/test_diffusion_lora.py @@ -7,7 +7,6 @@ import torch from safetensors.torch import save_file -from tests.helpers.runtime import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform @@ -17,12 +16,15 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) +from vllm_omni import Omni + os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" # This test is specific to Z-Image LoRA behavior. Keep it focused on a single # model to reduce runtime and avoid extra downloads. 
models = ["Tongyi-MAI/Z-Image-Turbo"] +DIFFUSION_INIT_TIMEOUT_S = 600 @pytest.mark.parametrize("model_name", models) @@ -75,8 +77,12 @@ def _write_zimage_lora(adapter_dir: Path) -> str: ) return str(adapter_dir) - with OmniRunner(model_name) as runner: - m = runner.omni + m = Omni( + model=model_name, + stage_init_timeout=DIFFUSION_INIT_TIMEOUT_S, + init_timeout=DIFFUSION_INIT_TIMEOUT_S, + ) + try: # high resolution may cause OOM on L4 height = 256 width = 256 @@ -134,3 +140,5 @@ def _write_zimage_lora(adapter_dir: Path) -> str: diff = np.abs(np.array(images[0], dtype=np.int16) - np.array(images_lora[0], dtype=np.int16)).mean() assert diff > 0.0 + finally: + m.close() diff --git a/tests/e2e/offline_inference/test_dynin_omni.py b/tests/e2e/offline_inference/test_dynin_omni.py deleted file mode 100644 index f891fc4f12e..00000000000 --- a/tests/e2e/offline_inference/test_dynin_omni.py +++ /dev/null @@ -1,374 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -E2E offline smoke tests for Dynin-Omni. 
- -- model: "snu-aidas/Dynin-Omni" -- stage config: tests/e2e/stage_configs/dynin_omni_ci.yaml -""" - -from __future__ import annotations - -import os -from pathlib import Path -from typing import Any - -import numpy as np -import pytest -import torch -from transformers import AutoTokenizer - -from tests.helpers.mark import hardware_test - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" - -_REPO_ROOT = Path(__file__).resolve().parents[3] -_DEFAULT_DYNIN_CONFIG_PATH: Path | None = None -_DEFAULT_STAGE_CONFIG_PATH = _REPO_ROOT / "tests" / "e2e" / "stage_configs" / "dynin_omni_ci.yaml" - -models = ["snu-aidas/Dynin-Omni"] -stage_configs = [str(_DEFAULT_STAGE_CONFIG_PATH)] -test_params = [(model, stage_config) for model in models for stage_config in stage_configs] - -DYNIN_CONFIG_PATH = str(_DEFAULT_DYNIN_CONFIG_PATH) if _DEFAULT_DYNIN_CONFIG_PATH is not None else None - -pytestmark = [ - pytest.mark.core_model, - pytest.mark.omni, - pytest.mark.parametrize("omni_runner", test_params, indirect=True), -] - - -# prompting util -def _build_mmu_prompt(tokenizer: Any, question: str, dynin_config_path: str | None) -> dict[str, Any]: - encoded = tokenizer(question, return_tensors="pt", add_special_tokens=True) - token_ids = [int(v) for v in encoded["input_ids"][0].tolist()] - attention_mask = [int(v) for v in encoded["attention_mask"][0].tolist()] - additional_information: dict[str, Any] = { - "task": ["mmu"], - "detok_id": [0], - "prompt_length": [len(token_ids)], - "attention_mask": [attention_mask], - "max_new_tokens": [64], - "steps": [64], - "block_length": [16], - "temperature": [0.0], - } - if dynin_config_path: - additional_information["dynin_config_path"] = [str(dynin_config_path)] - return { - "prompt_token_ids": token_ids, - "additional_information": additional_information, - "modalities": ["text"], - } - - -def _build_mmu_multimodal_prompt( - tokenizer: Any, - question: str, - dynin_config_path: str | None, - 
*, - image: Any | None = None, - audio: tuple[np.ndarray, int] | None = None, -) -> dict[str, Any]: - if image is None and audio is None: - raise ValueError("At least one multimodal input (image or audio) must be provided.") - - prefix_chunks: list[str] = [] - mm_data: dict[str, Any] = {} - if image is not None: - prefix_chunks.append("<|soi|><|image|><|eoi|>") - mm_data["image"] = image - if audio is not None: - prefix_chunks.append("<|soa|><|audio|><|eoa|>") - mm_data["audio"] = audio - - prefixed_question = " ".join(prefix_chunks + [question]).strip() - prompt = _build_mmu_prompt( - tokenizer=tokenizer, - question=prefixed_question, - dynin_config_path=dynin_config_path, - ) - prompt["multi_modal_data"] = mm_data - prompt["modalities"] = ["text"] - return prompt - - -def _generate_synthetic_image(width: int = 224, height: int = 224) -> np.ndarray: - x = np.linspace(0, 255, width, dtype=np.uint8) - y = np.linspace(0, 255, height, dtype=np.uint8)[:, None] - red = np.tile(x, (height, 1)) - green = np.tile(y, (1, width)) - blue = ((red.astype(np.uint16) + green.astype(np.uint16)) // 2).astype(np.uint8) - return np.stack([red, green, blue], axis=-1) - - -def _generate_synthetic_audio(duration_s: int = 5, sample_rate: int = 48_000) -> tuple[np.ndarray, int]: - t = np.linspace(0, duration_s, int(sample_rate * duration_s), endpoint=False, dtype=np.float32) - waveform = 0.1 * np.sin(2.0 * np.pi * 440.0 * t) - return waveform.astype(np.float32), sample_rate - - -# prompting util -def _build_t2s_decode_prompt(dynin_config_path: str | None) -> dict[str, Any]: - # Bypass stage-0 generation and directly validate token->audio decode path. 
- generated_audio_token_ids = [int(v) for v in ([10, 11, 12, 13, 14] * 32)] - additional_information: dict[str, Any] = { - "task": ["t2s"], - "detok_id": [1], - "generated_token_ids": [generated_audio_token_ids], - "audio_codebook_size": [4096], - } - if dynin_config_path: - additional_information["dynin_config_path"] = [str(dynin_config_path)] - return { - "prompt_token_ids": [0], - "additional_information": additional_information, - "modalities": ["audio"], - } - - -# prompting util -def _build_t2i_decode_prompt(dynin_config_path: str | None) -> dict[str, Any]: - # Bypass stage-0 generation and directly validate token->image decode path. - # MAGVIT decode path expects a square token grid; 1024 tokens -> 32x32. - generated_image_token_ids = [int(v) for v in ([10, 11, 12, 13, 14, 15, 16, 17] * 128)] - additional_information: dict[str, Any] = { - "task": ["t2i"], - "detok_id": [2], - "generated_token_ids": [generated_image_token_ids], - "codebook_size": [8192], - } - if dynin_config_path: - additional_information["dynin_config_path"] = [str(dynin_config_path)] - return { - "prompt_token_ids": [0], - "additional_information": additional_information, - "modalities": ["image"], - } - - -def _configure_dynin_config_env() -> None: - if DYNIN_CONFIG_PATH: - os.environ["DYNIN_CONFIG_PATH"] = str(DYNIN_CONFIG_PATH) - else: - os.environ.pop("DYNIN_CONFIG_PATH", None) - - -def _is_finished_request_output(request_output: Any) -> bool: - if request_output is None: - return False - req_list = request_output if isinstance(request_output, list) else [request_output] - for req in req_list: - if req is not None and bool(getattr(req, "finished", False)): - return True - return False - - -def _find_stage_output(outputs: list[Any], output_type: str) -> Any | None: - matched = [ - stage_output for stage_output in outputs if getattr(stage_output, "final_output_type", None) == output_type - ] - if not matched: - return None - - # Prefer the latest finished chunk to avoid picking an 
intermediate stream output. - for stage_output in reversed(matched): - if _is_finished_request_output(getattr(stage_output, "request_output", None)): - return stage_output - return matched[-1] - - -def _to_token_list(value: Any) -> list[int]: - if value is None: - return [] - if hasattr(value, "detach"): - value = value.detach() - if hasattr(value, "cpu"): - value = value.cpu() - if hasattr(value, "flatten"): - value = value.flatten().tolist() - if isinstance(value, tuple): - value = list(value) - if not isinstance(value, list): - return [] - out: list[int] = [] - for token in value: - if isinstance(token, bool): - continue - try: - out.append(int(token)) - except Exception: - continue - return out - - -def _extract_text(stage_output: Any, tokenizer: Any | None = None) -> str: - request_output = getattr(stage_output, "request_output", None) - if request_output is None: - return "" - req_list = request_output if isinstance(request_output, list) else [request_output] - for req in req_list: - completions = getattr(req, "outputs", None) or [] - if not completions: - continue - completion = completions[0] - mm_out = ( - getattr(completion, "multimodal_output", None) - or getattr(req, "multimodal_output", None) - or getattr(stage_output, "multimodal_output", None) - or {} - ) - text = mm_out.get("text") - if isinstance(text, list) and text: - text = text[-1] - if isinstance(text, str) and text.strip(): - return text.strip() - if tokenizer is not None: - for key in ("text_tokens", "token_ids"): - token_ids = _to_token_list(mm_out.get(key)) - if not token_ids: - continue - decoded = tokenizer.decode(token_ids, skip_special_tokens=True) - if isinstance(decoded, str) and decoded.strip(): - return decoded.strip() - fallback = getattr(completion, "text", None) - if isinstance(fallback, str) and fallback.strip(): - return fallback.strip() - return "" - - -def _extract_audio(stage_output: Any) -> Any | None: - request_output = getattr(stage_output, "request_output", None) - if 
request_output is None: - return None - req_list = request_output if isinstance(request_output, list) else [request_output] - for req in req_list: - completions = getattr(req, "outputs", None) or [] - if not completions: - continue - completion = completions[0] - mm_out = getattr(completion, "multimodal_output", None) or {} - if "audio" in mm_out: - return mm_out["audio"] - return None - - -def _extract_image(stage_output: Any) -> Any | None: - request_output = getattr(stage_output, "request_output", None) - if request_output is None: - return None - req_list = request_output if isinstance(request_output, list) else [request_output] - for req in req_list: - completions = getattr(req, "outputs", None) or [] - if not completions: - continue - completion = completions[0] - mm_out = getattr(completion, "multimodal_output", None) or {} - if "image" in mm_out: - return mm_out["image"] - return None - - -def _numel(value: Any) -> int: - if value is None: - return 0 - if isinstance(value, torch.Tensor): - return int(value.numel()) - shape = getattr(value, "shape", None) - if shape is not None: - try: - total = 1 - for dim in shape: - total *= int(dim) - return int(total) - except Exception: - pass - if isinstance(value, (list, tuple)): - return len(value) - return 0 - - -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -def test_dynin_t2i_decode_to_image(omni_runner) -> None: - _configure_dynin_config_env() - prompt = _build_t2i_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) - - outputs = omni_runner.generate([prompt]) - - image_output = _find_stage_output(outputs, "image") - assert image_output is not None - image_value = _extract_image(image_output) - assert image_value is not None - assert _numel(image_value) > 0 - - -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -def test_dynin_mmu_to_text(omni_runner) -> None: - _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) - prompt = 
_build_mmu_prompt( - tokenizer=tokenizer, - question="What is 2 + 2? Answer in one short sentence.", - dynin_config_path=DYNIN_CONFIG_PATH, - ) - - outputs = omni_runner.generate([prompt]) - - text_output = _find_stage_output(outputs, "text") - assert text_output is not None - text_content = _extract_text(text_output, tokenizer=tokenizer) - assert text_content - - -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -def test_dynin_image_to_text(omni_runner) -> None: - _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) - prompt = _build_mmu_multimodal_prompt( - tokenizer=tokenizer, - question="Describe the image briefly in one sentence.", - dynin_config_path=DYNIN_CONFIG_PATH, - image=_generate_synthetic_image(), - ) - - outputs = omni_runner.generate([prompt]) - - text_output = _find_stage_output(outputs, "text") - assert text_output is not None - text_content = _extract_text(text_output, tokenizer=tokenizer) - assert text_content - - -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -def test_dynin_speech_to_text(omni_runner) -> None: - _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) - prompt = _build_mmu_multimodal_prompt( - tokenizer=tokenizer, - question="Transcribe the audio briefly in one sentence.", - dynin_config_path=DYNIN_CONFIG_PATH, - audio=_generate_synthetic_audio(), - ) - - outputs = omni_runner.generate([prompt]) - - text_output = _find_stage_output(outputs, "text") - assert text_output is not None - text_content = _extract_text(text_output, tokenizer=tokenizer) - assert text_content - - -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -def test_dynin_t2s_decode_to_audio(omni_runner) -> None: - _configure_dynin_config_env() - prompt = _build_t2s_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) - - outputs = omni_runner.generate([prompt]) - - audio_output = _find_stage_output(outputs, "audio") - 
assert audio_output is not None - audio_value = _extract_audio(audio_output) - assert audio_value is not None - assert _numel(audio_value) > 0 diff --git a/tests/e2e/offline_inference/test_expert_parallel.py b/tests/e2e/offline_inference/test_expert_parallel.py index f11646b300d..ba126986ec7 100644 --- a/tests/e2e/offline_inference/test_expert_parallel.py +++ b/tests/e2e/offline_inference/test_expert_parallel.py @@ -18,8 +18,8 @@ import torch.distributed as dist from PIL import Image -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner +from tests.utils import hardware_test +from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -96,26 +96,12 @@ def _run_inference( tensor_parallel_size=tensor_parallel_size, enable_expert_parallel=enable_expert_parallel, ) + omni = Omni(model=model_name, parallel_config=parallel_config) + try: - with OmniRunner(model_name, parallel_config=parallel_config) as runner: - omni = runner.omni - # Warmup run (not timed) - if warmup: - _ = omni.generate( - PROMPT, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=DEFAULT_STEPS, - guidance_scale=guidance_scale, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), - num_outputs_per_prompt=1, - ), - ) - - # Timed run - start = time.time() - outputs = omni.generate( + # Warmup run (not timed) + if warmup: + _ = omni.generate( PROMPT, OmniDiffusionSamplingParams( height=height, @@ -126,13 +112,28 @@ def _run_inference( num_outputs_per_prompt=1, ), ) - elapsed_ms = (time.time() - start) * 1000 - return InferenceResult( - images=outputs[0].images, - elapsed_ms=elapsed_ms, - ) + # Timed run + start = time.time() + outputs = omni.generate( + PROMPT, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=DEFAULT_STEPS, + 
guidance_scale=guidance_scale, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + ), + ) + elapsed_ms = (time.time() - start) * 1000 + + return InferenceResult( + images=outputs[0].images, + elapsed_ms=elapsed_ms, + ) finally: + omni.close() _cleanup_distributed() diff --git a/tests/e2e/offline_inference/test_flux.py b/tests/e2e/offline_inference/test_flux.py deleted file mode 100644 index 02c6787be2b..00000000000 --- a/tests/e2e/offline_inference/test_flux.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Tests for Flux1 Schnell.""" - -import pytest -from PIL import Image - -from vllm_omni.entrypoints.omni import Omni -from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -MODEL = "black-forest-labs/FLUX.1-schnell" - - -@pytest.mark.core_model -@pytest.mark.diffusion -def test_flux_schnell_text_to_image(): - """Test FLUX.1-schnell text-to-image generation.""" - omni = Omni(model=MODEL) - - omni_outputs = list( - omni.generate( - prompts=["A photo of a cat sitting on a laptop"], - sampling_params_list=OmniDiffusionSamplingParams( - height=512, - width=512, - num_inference_steps=2, - seed=42, - ), - ) - ) - - assert len(omni_outputs) > 0 - images = omni_outputs[0].images - assert len(images) == 1 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) diff --git a/tests/e2e/offline_inference/test_flux2_klein.py b/tests/e2e/offline_inference/test_flux2_klein.py deleted file mode 100644 index a1376753467..00000000000 --- a/tests/e2e/offline_inference/test_flux2_klein.py +++ /dev/null @@ -1,227 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -End-to-end test for Flux2 Klein inpainting. 
- -""" - -# ruff: noqa: E402 - -import os -import sys -from pathlib import Path - -import pytest -import torch -from PIL import Image, ImageDraw - -from vllm_omni.entrypoints.omni import Omni -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - -MODEL = "black-forest-labs/FLUX.2-klein-4B" - -_HEIGHT = 512 -_WIDTH = 512 -_NUM_INFERENCE_STEPS = 4 - - -def _create_test_image(width: int = _WIDTH, height: int = _HEIGHT, color: tuple = (128, 128, 128)) -> Image.Image: - return Image.new("RGB", (width, height), color) - - -def _create_test_mask(width: int = _WIDTH, height: int = _HEIGHT) -> Image.Image: - mask = Image.new("L", (width, height), 0) - draw = ImageDraw.Draw(mask) - draw.rectangle([width // 4, height // 4, width * 3 // 4, height * 3 // 4], fill=255) - return mask - - -def _create_test_inputs(color: tuple = (100, 150, 200)): - return _create_test_image(_WIDTH, _HEIGHT, color), _create_test_mask(_WIDTH, _HEIGHT) - - -def _extract_images_from_output(outputs: list) -> list[Image.Image]: - images = [] - for req_output in outputs: - if hasattr(req_output, "images") and req_output.images: - images.extend(req_output.images) - elif hasattr(req_output, "request_output") and req_output.request_output: - stage_out = req_output.request_output - if isinstance(stage_out, OmniRequestOutput) and hasattr(stage_out, "images"): - images.extend(stage_out.images) - elif isinstance(stage_out, list): - for s in stage_out: - if hasattr(s, "images") and s.images: - images.extend(s.images) - return images - - -# Regression test for https://github.com/vllm-project/vllm-omni/issues/3097 -@pytest.mark.core_model -@pytest.mark.diffusion -def test_flux2_klein_can_accept_text_inputs(): - model = 
Omni(model=MODEL) - outputs = model.generate( - "a cup of coffee on the table", - OmniDiffusionSamplingParams(num_inference_steps=2, seed=42), - ) - assert len(outputs[0].images) == 1 - - -@pytest.mark.core_model -@pytest.mark.diffusion -def test_flux2_klein_inpaint_basic(): - m = None - try: - m = Omni(model=MODEL) - input_image, mask_image = _create_test_inputs() - - outputs = m.generate( - prompts=[ - { - "prompt": "Fill in the masked area with a beautiful garden", - "multi_modal_data": {"image": input_image, "mask_image": mask_image}, - } - ], - sampling_params_list=OmniDiffusionSamplingParams( - height=_HEIGHT, - width=_WIDTH, - num_inference_steps=_NUM_INFERENCE_STEPS, - guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - num_outputs_per_prompt=1, - ), - ) - - images = _extract_images_from_output(list(outputs)) - assert len(images) == 1 - assert images[0].size == (_WIDTH, _HEIGHT) - finally: - if m is not None and hasattr(m, "close"): - m.close() - - -@pytest.mark.diffusion -def test_flux2_klein_inpaint_deterministic(): - m = None - try: - m = Omni(model=MODEL) - input_image, mask_image = _create_test_inputs() - seed = 12345 - - gen1 = torch.Generator(current_omni_platform.device_type).manual_seed(seed) - gen2 = torch.Generator(current_omni_platform.device_type).manual_seed(seed) - - outputs1 = m.generate( - prompts=[ - { - "prompt": "A red flower in a field", - "multi_modal_data": {"image": input_image, "mask_image": mask_image}, - } - ], - sampling_params_list=OmniDiffusionSamplingParams( - height=_HEIGHT, - width=_WIDTH, - num_inference_steps=_NUM_INFERENCE_STEPS, - guidance_scale=0.0, - generator=gen1, - num_outputs_per_prompt=1, - ), - ) - - outputs2 = m.generate( - prompts=[ - { - "prompt": "A red flower in a field", - "multi_modal_data": {"image": input_image, "mask_image": mask_image}, - } - ], - sampling_params_list=OmniDiffusionSamplingParams( - height=_HEIGHT, - width=_WIDTH, - 
num_inference_steps=_NUM_INFERENCE_STEPS, - guidance_scale=0.0, - generator=gen2, - num_outputs_per_prompt=1, - ), - ) - - images1 = _extract_images_from_output(list(outputs1)) - images2 = _extract_images_from_output(list(outputs2)) - - assert len(images1) == 1 - assert len(images2) == 1 - - assert list(images1[0].getdata()) == list(images2[0].getdata()), ( - "Same input with same seed should produce identical output. " - "This is critical for offline/online consistency." - ) - finally: - if m is not None and hasattr(m, "close"): - m.close() - - -@pytest.mark.diffusion -def test_flux2_klein_inpaint_different_seeds_different_output(): - m = None - try: - m = Omni(model=MODEL) - input_image, mask_image = _create_test_inputs() - - gen1 = torch.Generator(current_omni_platform.device_type).manual_seed(42) - gen2 = torch.Generator(current_omni_platform.device_type).manual_seed(99999) - - outputs1 = m.generate( - prompts=[ - { - "prompt": "A beautiful landscape", - "multi_modal_data": {"image": input_image, "mask_image": mask_image}, - } - ], - sampling_params_list=OmniDiffusionSamplingParams( - height=_HEIGHT, - width=_WIDTH, - num_inference_steps=_NUM_INFERENCE_STEPS, - guidance_scale=0.0, - generator=gen1, - num_outputs_per_prompt=1, - ), - ) - - outputs2 = m.generate( - prompts=[ - { - "prompt": "A beautiful landscape", - "multi_modal_data": {"image": input_image, "mask_image": mask_image}, - } - ], - sampling_params_list=OmniDiffusionSamplingParams( - height=_HEIGHT, - width=_WIDTH, - num_inference_steps=_NUM_INFERENCE_STEPS, - guidance_scale=0.0, - generator=gen2, - num_outputs_per_prompt=1, - ), - ) - - images1 = _extract_images_from_output(list(outputs1)) - images2 = _extract_images_from_output(list(outputs2)) - - assert len(images1) == 1 - assert len(images2) == 1 - - different_pixel_count = sum(1 for p1, p2 in zip(images1[0].getdata(), images2[0].getdata()) if p1 != p2) - assert different_pixel_count > 0, "Different seeds should produce different outputs" - 
finally: - if m is not None and hasattr(m, "close"): - m.close() diff --git a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py index ef5d6f9e051..42aab7f26a8 100644 --- a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py +++ b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py @@ -8,22 +8,31 @@ """ import gc -import os as _os +import sys +from pathlib import Path import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from tests.helpers.env import DeviceMemoryMonitor -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner +from tests.utils import DeviceMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform +# ruff: noqa: E402 +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from vllm_omni import Omni + QUANTIZED_MODEL = "vllm-project-org/FLUX.1-dev-AutoRound-w4a16" BASELINE_MODEL = "black-forest-labs/FLUX.1-dev" +# Allow overriding via environment for local testing +import os as _os + QUANTIZED_MODEL = _os.environ.get("FLUX_AUTOROUND_MODEL", QUANTIZED_MODEL) BASELINE_MODEL = _os.environ.get("FLUX_BASELINE_MODEL", BASELINE_MODEL) @@ -42,18 +51,19 @@ def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - with OmniRunner(model_name, enforce_eager=True, **extra_kwargs) as runner: - current_omni_platform.reset_peak_memory_stats() - outputs = runner.omni.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=HEIGHT, - width=WIDTH, - num_inference_steps=NUM_STEPS, - guidance_scale=0.0, - 
generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) + m = Omni(model=model_name, enforce_eager=True, **extra_kwargs) + + current_omni_platform.reset_peak_memory_stats() + outputs = m.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_inference_steps=NUM_STEPS, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) peak = monitor.peak_used_mb monitor.stop() @@ -64,6 +74,7 @@ def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") images = req_out.images + del m gc.collect() current_omni_platform.empty_cache() diff --git a/tests/e2e/offline_inference/test_flux_kontext.py b/tests/e2e/offline_inference/test_flux_kontext.py index 057319c855f..93dca21c9ad 100644 --- a/tests/e2e/offline_inference/test_flux_kontext.py +++ b/tests/e2e/offline_inference/test_flux_kontext.py @@ -9,14 +9,23 @@ - Image editing with text guidance """ +import os +import sys +from pathlib import Path + import pytest from PIL import Image -from vllm.assets.image import ImageAsset -from tests.helpers.runtime import OmniRunner from vllm_omni.diffusion.data import DiffusionParallelConfig +from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" + MODEL = "black-forest-labs/FLUX.1-Kontext-dev" @@ -24,15 +33,17 @@ @pytest.mark.diffusion def test_flux_kontext_text_to_image(): """Test FluxKontext text-to-image generation with real model.""" - with OmniRunner( - MODEL, + omni = Omni( + model=MODEL, parallel_config=DiffusionParallelConfig( tensor_parallel_size=2, ), enable_cpu_offload=False, - ) as runner: + ) + + 
try: omni_outputs = list( - runner.omni.generate( + omni.generate( prompts=["A photo of a cat sitting on a laptop"], sampling_params_list=OmniDiffusionSamplingParams( height=512, @@ -43,37 +54,43 @@ def test_flux_kontext_text_to_image(): ) ) - assert len(omni_outputs) > 0 - output = omni_outputs[0] - images = None - if output.images: - images = output.images - elif hasattr(output, "request_output") and output.request_output: - for stage_out in output.request_output: - if hasattr(stage_out, "images") and stage_out.images: - images = stage_out.images - break + assert len(omni_outputs) > 0 + output = omni_outputs[0] + images = None + if output.images: + images = output.images + elif hasattr(output, "request_output") and output.request_output: + for stage_out in output.request_output: + if hasattr(stage_out, "images") and stage_out.images: + images = stage_out.images + break - assert images is not None - assert len(images) > 0 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) + assert images is not None + assert len(images) > 0 + assert isinstance(images[0], Image.Image) + assert images[0].size == (512, 512) + finally: + omni.close() @pytest.mark.core_model @pytest.mark.diffusion def test_flux_kontext_image_edit(): """Test FluxKontext image-to-image editing with real model.""" + from vllm.assets.image import ImageAsset + input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") - with OmniRunner( - MODEL, + omni = Omni( + model=MODEL, parallel_config=DiffusionParallelConfig( tensor_parallel_size=2, ), enable_cpu_offload=False, - ) as runner: + ) + + try: omni_outputs = list( - runner.omni.generate( + omni.generate( prompts=[ { "prompt": "Transform this image into a Vincent van Gogh style painting", @@ -90,18 +107,20 @@ def test_flux_kontext_image_edit(): ) ) - assert len(omni_outputs) > 0 - output = omni_outputs[0] - images = None - if output.images: - images = output.images - elif 
hasattr(output, "request_output") and output.request_output: - for stage_out in output.request_output: - if hasattr(stage_out, "images") and stage_out.images: - images = stage_out.images - break - - assert images is not None - assert len(images) > 0 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) + assert len(omni_outputs) > 0 + output = omni_outputs[0] + images = None + if output.images: + images = output.images + elif hasattr(output, "request_output") and output.request_output: + for stage_out in output.request_output: + if hasattr(stage_out, "images") and stage_out.images: + images = stage_out.images + break + + assert images is not None + assert len(images) > 0 + assert isinstance(images[0], Image.Image) + assert images[0].size == (512, 512) + finally: + omni.close() diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py deleted file mode 100644 index bd0d132d093..00000000000 --- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py +++ /dev/null @@ -1,343 +0,0 @@ -# ruff: noqa: E501 -from collections.abc import Generator -from pathlib import Path - -import pytest -import torch -import torch.nn.functional as F -from PIL import Image -from transformers import CLIPModel, CLIPProcessor - -from tests.helpers.runtime import OmniRunner -from vllm_omni import Omni -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.platforms import current_omni_platform - -PROMPT = "A brown and white dog is running on the grass" -MODEL_NAME = "tencent/HunyuanImage-3.0" -LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32" -REPO_ROOT = Path(__file__).resolve().parents[3] -STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_t2i.yaml" - -pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion] - -# System prompt type. 
Options: None, dynamic, en_vanilla, en_recaption, en_think_recaption, en_unified -# Below are the CLIP embedding tensors from the official HunyuanImage model (seed=1234, prompt: "A brown and white dog is running on the grass"). -# SEED_1234 denotes the output without system prompt, while the remaining entries correspond to outputs generated with different system prompts. -# fmt: off -SEED_1234 = torch.tensor( - [ - 0.027797, 0.028964, -0.005051, 0.001059, 0.017021, -0.034029, 0.021989, 0.033318, -0.000308, 0.016179, 0.010504, -0.034201, 0.050230, -0.021170, 0.083530, -0.003621, - 0.040758, 0.039913, 0.044305, -0.019285, -0.058387, -0.001099, 0.042782, -0.036136, -0.014955, 0.002147, 0.009439, 0.012943, -0.028732, -0.018349, 0.002861, 0.013019, - 0.014362, -0.038833, 0.029413, 0.020724, 0.002714, 0.010416, -0.020527, 0.050266, -0.081026, -0.006814, -0.007457, -0.032333, 0.008417, -0.122455, -0.006085, -0.025610, - 0.012614, 0.025817, -0.005419, 0.038657, 0.000789, 0.067111, 0.002818, 0.028696, 0.047305, -0.009993, -0.019508, 0.038604, 0.099657, 0.026728, 0.012361, 0.013626, - 0.023164, -0.037186, 0.007535, 0.054645, -0.009012, -0.019383, -0.005234, -0.018715, -0.000346, 0.051317, -0.028744, 0.029933, -0.006382, -0.018414, -0.033906, -0.028892, - -0.015301, -0.004276, 0.014626, -0.008505, 0.013717, -0.027323, -0.001332, -0.040227, 0.047021, -0.019082, -0.037260, -0.029780, -0.594026, 0.016573, -0.010523, 0.042616, - -0.013136, 0.030540, -0.151685, -0.005367, 0.016209, -0.034183, 0.009852, 0.038452, 0.005494, -0.017887, -0.007167, 0.017262, -0.038980, 0.011995, 0.021952, -0.031660, - 0.020507, -0.035880, 0.035183, -0.026975, -0.050788, -0.002553, 0.037774, -0.020082, -0.015403, 0.045022, 0.072167, -0.029237, 0.003895, -0.051250, 0.008581, 0.023545, - -0.026827, 0.020895, 0.041780, -0.040766, -0.008146, 0.080630, 0.000404, 0.032003, -0.005279, -0.090707, -0.013813, 0.010204, -0.001513, 0.016394, -0.001321, 0.020535, - -0.038645, 0.024858, 0.024378, 0.018717, -0.056314, 
0.024402, 0.018694, 0.029009, -0.008502, -0.014694, -0.028345, 0.005202, 0.046116, -0.032166, -0.030706, -0.038738, - -0.031356, -0.009683, 0.040069, 0.001596, -0.012621, 0.018590, -0.024138, 0.035330, 0.011546, 0.015791, -0.026932, 0.004531, 0.022455, -0.012871, 0.013915, -0.009567, - -0.010976, 0.013497, 0.042590, 0.002072, -0.052718, -0.045494, 0.013036, -0.005403, -0.005947, -0.003437, 0.016653, -0.016805, -0.040291, 0.007927, 0.001296, -0.008319, - 0.021514, -0.001452, -0.121998, 0.015396, -0.022594, -0.006977, -0.040108, -0.035550, -0.021872, -0.014721, 0.019799, 0.036556, 0.015072, -0.057988, -0.011684, -0.045220, - -0.026295, 0.052647, 0.013741, -0.013428, 0.061794, 0.021431, -0.011316, -0.009963, 0.008198, 0.027746, 0.074219, -0.019499, 0.042673, 0.016028, 0.007214, -0.010650, - -0.019682, 0.001902, 0.038867, -0.007333, 0.031749, 0.004391, 0.018688, 0.044654, 0.030615, -0.027816, 0.031711, -0.056952, -0.033499, -0.039368, 0.025801, -0.027610, - -0.009329, -0.001799, 0.024061, -0.012593, -0.050266, -0.012512, 0.019528, -0.083434, 0.018238, 0.034138, -0.020120, -0.009910, -0.002280, 0.035325, 0.034440, -0.055205, - -0.017698, -0.000439, -0.034703, 0.013356, -0.037287, 0.048494, -0.018570, 0.028069, 0.019269, -0.007263, -0.008521, 0.000426, -0.016677, 0.056162, -0.011944, 0.017322, - 0.022219, -0.014266, -0.009292, -0.009979, 0.014973, 0.011623, -0.017799, 0.032925, -0.024668, 0.007312, -0.025035, -0.008967, -0.026827, 0.011889, -0.138517, -0.009608, - -0.020592, -0.001272, 0.015676, -0.025706, 0.031775, -0.004195, 0.026876, -0.014748, -0.025966, -0.008741, 0.035437, 0.017139, -0.005140, -0.007101, -0.012510, -0.023600, - 0.032969, -0.005510, 0.020010, 0.032567, 0.015558, 0.004265, -0.036300, 0.048210, 0.080424, -0.052820, -0.002063, -0.020875, 0.052530, -0.001638, -0.020299, -0.035202, - 0.087818, 0.034614, -0.032735, 0.033201, -0.001751, 0.029574, 0.009926, 0.011619, -0.001267, -0.020149, -0.003826, -0.029860, 0.011437, -0.051276, 0.024344, 0.003096, - 
-0.011573, 0.038228, -0.005730, -0.052328, 0.001909, -0.025877, 0.019976, -0.010160, 0.023892, 0.049161, -0.028978, 0.018700, -0.026460, 0.001090, -0.072128, -0.008406, - 0.010828, 0.020621, -0.005706, 0.023797, 0.036231, -0.112069, 0.017601, 0.007496, 0.045999, 0.016771, 0.021977, 0.022305, 0.018377, 0.002036, -0.029815, -0.082922, - -0.012710, -0.026355, 0.003790, 0.017472, -0.023148, -0.002901, -0.057854, 0.028393, 0.230866, -0.023486, 0.051094, 0.047508, 0.018957, -0.037130, 0.001054, -0.026126, - 0.021970, -0.046915, -0.019419, -0.014077, 0.002502, -0.079454, -0.057149, -0.081701, 0.041979, -0.043074, -0.009425, -0.035776, -0.021794, -0.004826, -0.057263, -0.072940, - 0.037651, -0.013991, -0.043863, -0.020581, 0.034319, -0.052566, -0.010355, -0.022963, 0.027144, -0.017339, 0.088930, -0.000670, -0.026547, -0.026586, -0.032531, 0.040314, - 0.010148, 0.021104, 0.009228, -0.073227, 0.036650, -0.019337, 0.010211, -0.089620, -0.024676, -0.020729, -0.004070, 0.000784, -0.110561, 0.015390, 0.027151, -0.003228, - -0.066704, -0.004797, -0.026117, -0.018131, -0.090114, 0.020659, -0.007157, 0.013608, -0.022324, 0.027487, 0.018873, 0.027854, 0.045085, -0.039992, -0.017829, 0.011071, - -0.011393, -0.004454, -0.037189, -0.030299, 0.059668, 0.005064, 0.024655, -0.037239, 0.046882, -0.010356, -0.009690, 0.061909, -0.024736, 0.016849, 0.000784, 0.000201, - 0.066165, 0.010234, -0.012134, -0.002823, -0.060847, 0.008953, 0.010348, 0.022292, -0.044602, -0.020981, 0.038839, 0.006616, -0.016836, -0.043995, -0.005463, -0.036413, - 0.034895, -0.018008, -0.009543, -0.025080, -0.035243, 0.042696, -0.028911, -0.030676, -0.038542, -0.027798, -0.026607, 0.019467, 0.070629, -0.037356, -0.042648, -0.000284, - 0.033095, 0.077781, -0.052930, 0.022515, -0.029926, -0.033821, -0.003277, -0.000038, -0.026871, 0.018223, -0.004221, 0.023454, -0.030611, -0.006396, -0.009873, -0.008402, - ], - dtype=torch.float32, -) -SYSTEM_PROMPT_DYNAMIC = torch.tensor( - [ - 0.010809, 0.021177, -0.017600, -0.016814, 
0.012351, -0.024554, 0.018299, 0.039305, 0.003331, 0.030473, 0.005557, -0.040898, 0.047294, -0.016136, 0.076989, -0.002723, - 0.017622, 0.042330, 0.058266, -0.016232, -0.029502, 0.004529, 0.033543, -0.041481, -0.017631, 0.002727, 0.018874, 0.019932, -0.030052, -0.009997, 0.004582, 0.002135, - -0.003720, -0.030923, 0.021174, 0.034033, -0.007096, 0.011522, -0.009518, 0.055688, -0.092351, -0.003914, 0.004589, -0.032635, 0.012479, -0.140607, -0.014141, -0.031821, - 0.001396, 0.026780, -0.007623, 0.039957, 0.006434, 0.047516, 0.014377, 0.015237, 0.034212, 0.003576, -0.027357, 0.038888, 0.087272, 0.020248, 0.015165, 0.016002, - 0.020781, -0.040509, -0.008929, 0.080857, -0.002642, -0.009738, -0.005683, -0.000615, -0.012801, 0.046457, -0.045004, 0.024689, 0.002498, -0.017333, -0.027366, -0.023231, - -0.006064, -0.021505, 0.007405, -0.021249, 0.026252, -0.018690, 0.020093, -0.036954, 0.037510, -0.032027, -0.030871, -0.011173, -0.618627, 0.021213, -0.004366, 0.029555, - -0.004324, 0.020221, -0.143832, -0.021386, 0.010482, -0.042113, 0.016164, 0.040350, 0.014627, -0.011778, -0.018102, 0.035380, -0.020305, 0.010590, 0.009227, -0.011415, - 0.018623, -0.036384, 0.031003, -0.017073, -0.056456, -0.010423, 0.033029, -0.023511, -0.008717, 0.045716, 0.068273, -0.027886, 0.009665, -0.039801, 0.001465, 0.024361, - -0.015039, 0.022903, 0.033362, -0.022804, 0.008631, 0.076518, 0.000619, 0.022786, -0.015435, -0.095242, -0.006092, 0.015496, -0.009081, 0.015740, 0.004280, 0.013103, - -0.031836, 0.034241, 0.031836, 0.032636, -0.053721, 0.034370, 0.019172, 0.018383, 0.006907, -0.036039, -0.027927, 0.008646, 0.040496, -0.060314, -0.039116, -0.021488, - -0.031682, -0.005077, 0.034920, 0.002148, -0.008087, 0.002024, -0.008480, 0.041096, 0.011401, 0.020380, -0.025078, 0.005002, 0.022252, -0.014577, 0.008051, -0.014476, - -0.007078, 0.021075, 0.036965, 0.005343, -0.038671, -0.037222, 0.014052, -0.009952, -0.003958, -0.001878, 0.017848, -0.016608, -0.030813, 0.010921, 0.001068, 0.003095, - 
0.007076, -0.001936, -0.102996, 0.006838, -0.005243, -0.009140, -0.043796, -0.027227, -0.008426, -0.013177, 0.015602, 0.021036, 0.025484, -0.064836, -0.003593, -0.038036, - -0.023102, 0.064053, 0.007850, 0.000771, 0.039297, 0.011903, -0.015866, -0.017612, 0.006308, 0.024342, 0.086761, -0.016705, 0.039239, 0.025079, -0.006452, 0.003174, - -0.010146, 0.010787, 0.035932, -0.015346, 0.037191, 0.010990, 0.011573, 0.044958, 0.035560, -0.017339, 0.018878, -0.025394, -0.044339, -0.029852, 0.015951, -0.032248, - -0.012019, 0.013497, 0.012224, -0.001284, -0.034041, -0.015768, 0.000230, -0.086076, 0.024878, 0.031929, -0.016668, -0.019815, -0.001325, 0.007944, 0.017674, -0.036097, - -0.019651, -0.001272, -0.032842, 0.002056, -0.037140, 0.043191, -0.003710, 0.011767, 0.020313, -0.018396, -0.015935, 0.010228, -0.017349, 0.049363, -0.010007, 0.019533, - 0.018076, 0.016608, -0.005523, -0.007793, 0.016868, 0.019341, -0.008236, 0.026765, -0.025324, -0.007849, -0.023648, -0.007791, -0.018508, 0.015357, -0.166499, -0.003718, - -0.035447, -0.005229, 0.019327, -0.014207, 0.028433, -0.002619, 0.013888, -0.033146, -0.017015, 0.004677, 0.039554, 0.003803, -0.014592, -0.018886, -0.023868, -0.022708, - 0.033661, 0.008626, 0.015687, 0.046395, 0.014173, 0.015083, -0.025994, 0.039120, 0.076334, -0.061165, 0.001791, -0.017579, 0.067567, -0.002415, -0.032495, -0.025576, - 0.079027, 0.036370, -0.013303, 0.030510, -0.009061, 0.019135, 0.015627, 0.024864, 0.015093, -0.017066, -0.014075, -0.021907, 0.017388, -0.033492, 0.013317, -0.000040, - 0.003396, 0.044030, -0.009194, -0.049524, -0.005015, -0.040007, 0.009104, 0.000580, 0.005603, 0.035891, -0.038913, 0.023239, -0.017022, -0.002695, -0.095759, 0.018503, - 0.017365, 0.011104, -0.003433, 0.024113, 0.052609, -0.085274, 0.027565, -0.005833, 0.020700, 0.015842, 0.019148, 0.020203, -0.000698, -0.005337, -0.037400, -0.060144, - -0.031893, -0.038396, -0.001949, 0.018901, -0.014268, -0.004721, -0.055913, 0.013814, 0.215024, -0.011357, 0.057530, 0.050092, 
0.016513, -0.059254, 0.001494, -0.031472, - 0.032190, -0.047512, -0.020501, -0.002571, 0.007844, -0.063630, -0.043938, -0.079595, 0.032820, -0.021659, -0.003738, -0.035267, -0.013794, -0.021172, -0.046356, -0.077079, - 0.021526, -0.007447, -0.050276, -0.029743, 0.022208, -0.039137, -0.021426, -0.029825, 0.029390, -0.002943, 0.073158, -0.000435, -0.032029, -0.038524, -0.029886, 0.017473, - 0.013513, 0.022738, 0.000632, -0.073718, 0.029219, -0.018896, 0.007302, -0.116122, -0.013324, -0.012214, -0.005960, -0.003720, -0.155869, 0.019896, 0.016919, -0.021133, - -0.066911, -0.000926, -0.020871, -0.015295, -0.086108, 0.014918, -0.009284, 0.001689, -0.038155, 0.039163, 0.015988, 0.014413, 0.034205, -0.053273, 0.001687, 0.012227, - -0.007341, -0.006123, -0.005731, -0.026863, 0.060196, 0.028929, 0.019328, -0.033709, 0.038789, -0.015624, 0.013323, 0.053821, -0.015538, -0.001610, 0.012959, -0.013897, - 0.082010, 0.012866, -0.017269, 0.000017, -0.059458, 0.015870, 0.028455, 0.025234, -0.051163, -0.022976, 0.011866, -0.005613, -0.008738, -0.047658, -0.002155, -0.029432, - 0.039242, -0.013491, -0.001641, -0.024210, -0.019187, 0.026716, -0.025698, -0.027591, -0.034678, -0.002473, -0.019391, 0.017597, 0.064385, -0.029104, -0.034501, -0.004955, - 0.015008, 0.060749, -0.051693, 0.020279, -0.027170, -0.027003, 0.000254, 0.011352, -0.028116, 0.028938, -0.007224, 0.019978, -0.025379, -0.004874, -0.019361, -0.020278, - ], - dtype=torch.float32, -) -SYSTEM_EN_RECAPTION = torch.tensor( - [ - 0.007721, 0.015421, -0.019305, -0.000920, 0.016031, -0.019730, 0.029683, 0.026810, -0.010510, 0.021463, 0.008833, -0.040851, 0.043260, -0.007042, 0.057224, 0.011995, - 0.007818, 0.046369, 0.059838, -0.028548, -0.047399, -0.000983, 0.024343, -0.052259, -0.013638, 0.006856, 0.009186, 0.014235, -0.031497, -0.008644, -0.009349, 0.018900, - 0.002913, -0.022475, 0.039518, 0.019052, -0.007600, 0.010634, -0.011830, 0.075675, -0.071738, -0.014947, 0.004995, -0.025804, -0.002553, -0.093262, 0.002881, -0.033744, 
- -0.007234, 0.013659, 0.009897, 0.039185, -0.005366, 0.041534, -0.005924, 0.019786, 0.048566, -0.009356, -0.027360, 0.042557, 0.091286, 0.009286, 0.015410, 0.028166, - 0.022476, -0.025162, 0.012144, 0.084603, -0.003150, -0.008549, -0.002099, -0.014987, -0.019480, 0.046843, -0.030613, 0.015557, -0.008965, -0.008798, -0.027032, -0.014112, - 0.018703, -0.014749, -0.000928, -0.024660, 0.024004, 0.004560, 0.028156, -0.028467, 0.025444, -0.038699, -0.014927, -0.031593, -0.648498, 0.018529, 0.003378, 0.030188, - -0.002314, 0.014950, -0.146615, -0.009005, 0.016579, -0.037867, 0.020907, 0.033160, 0.007877, -0.026345, -0.056428, 0.031255, -0.018404, 0.013334, 0.009988, -0.022790, - 0.020803, -0.036862, 0.036222, -0.006646, -0.058084, -0.012036, 0.044199, -0.027665, -0.015779, 0.051554, 0.059970, -0.025977, 0.003967, -0.035247, -0.000488, 0.023182, - 0.000468, 0.019190, 0.047268, -0.032279, -0.005302, 0.078669, -0.001915, 0.024918, -0.014952, -0.078905, -0.018333, 0.001362, -0.015115, 0.005435, 0.002313, 0.018766, - -0.032773, 0.037344, 0.024061, 0.012143, -0.057106, 0.029490, 0.019537, 0.009099, 0.026064, -0.015927, -0.037047, 0.006002, 0.025191, -0.035318, -0.032245, -0.047822, - -0.023568, -0.004533, 0.025100, 0.002758, -0.002649, -0.012287, -0.012139, 0.043080, 0.003295, 0.024667, -0.021050, 0.006752, 0.025315, -0.011127, 0.009800, -0.021343, - -0.024866, 0.010098, 0.026954, 0.012467, -0.035866, -0.031780, 0.007479, -0.003388, -0.012619, -0.012099, 0.014974, -0.001908, -0.032700, 0.004703, 0.003238, -0.007498, - 0.023241, 0.002715, -0.111739, 0.003317, 0.006475, -0.019792, -0.046558, -0.032593, -0.020762, -0.005059, 0.016934, 0.029195, 0.028744, -0.050633, 0.001907, -0.028791, - -0.016695, 0.052143, 0.010439, 0.007204, 0.028502, 0.012607, -0.012414, -0.031238, 0.007305, 0.032309, 0.087924, -0.010530, 0.029925, 0.032666, -0.002202, 0.017539, - -0.009091, -0.001631, 0.024906, -0.013102, 0.031772, 0.018465, 0.012035, 0.031460, 0.030193, 0.005289, 0.025859, -0.038971, 
-0.046577, -0.025852, 0.035235, -0.038514, - 0.001042, 0.013012, 0.023701, -0.014630, -0.029269, -0.011981, 0.008219, -0.067347, -0.003456, 0.028198, -0.008657, -0.017773, 0.010540, 0.023964, 0.021012, -0.034465, - -0.023748, 0.004065, -0.021598, 0.008440, -0.031533, 0.038390, -0.007680, -0.003852, 0.016136, -0.017906, -0.008927, 0.006300, -0.001251, 0.029337, -0.008632, 0.020568, - 0.021560, -0.007222, 0.005313, -0.013089, 0.012299, 0.031303, -0.013951, 0.016547, -0.024771, -0.008753, -0.030908, -0.014421, -0.017656, 0.014044, -0.114986, 0.000956, - -0.035588, 0.003756, 0.015383, -0.013358, 0.009385, -0.001359, 0.012623, -0.028724, 0.001607, 0.012809, 0.032668, 0.011834, -0.015587, -0.007170, -0.021344, -0.019664, - 0.017690, -0.014538, 0.016511, 0.038037, 0.029919, 0.020907, -0.018565, 0.032964, 0.078548, -0.050386, -0.003012, -0.016965, 0.064131, 0.008077, -0.025879, -0.035820, - 0.095075, 0.019901, -0.019114, 0.022832, 0.003741, 0.027148, 0.018231, 0.027741, 0.020328, 0.001700, -0.006939, -0.024154, 0.018523, -0.029819, 0.008050, -0.004477, - 0.006087, 0.056878, -0.009083, -0.061537, -0.011531, -0.037551, 0.000434, -0.005843, 0.024739, 0.032020, -0.053119, 0.020704, -0.012385, -0.002726, -0.082489, 0.009072, - 0.013341, 0.000316, 0.001899, 0.022868, 0.034407, -0.066857, 0.020589, 0.012195, 0.023211, -0.001520, 0.000897, 0.029670, -0.015930, 0.006509, -0.035172, -0.061215, - -0.014099, -0.038584, -0.012213, 0.018613, -0.012365, -0.002777, -0.055184, 0.017146, 0.214358, -0.015750, 0.052488, 0.045205, 0.025334, -0.054615, 0.002117, -0.038122, - 0.012402, -0.053418, -0.025405, 0.007235, 0.013208, -0.092481, -0.048700, -0.085186, 0.029039, -0.036767, -0.000777, -0.017625, -0.012556, -0.004887, -0.033660, -0.082310, - 0.013387, -0.003256, -0.062981, -0.019886, 0.017624, -0.037421, -0.020743, -0.020894, 0.041974, -0.008502, 0.088413, -0.018697, -0.029398, -0.029389, -0.043721, 0.013872, - 0.003944, 0.030361, 0.005355, -0.081355, 0.041843, -0.016395, 0.011954, 
-0.060440, -0.000966, -0.019101, 0.006803, -0.011310, -0.148581, 0.020342, 0.012795, -0.016473, - -0.053300, -0.012340, -0.016640, -0.029834, -0.082405, 0.011859, -0.004255, -0.004396, -0.012515, 0.031962, 0.030438, 0.013792, 0.031557, -0.047200, 0.006485, 0.024815, - -0.019376, -0.011454, -0.034184, -0.021329, 0.050115, 0.021720, 0.002874, -0.047163, 0.044031, -0.014663, 0.020534, 0.056017, 0.007017, 0.003323, 0.005734, -0.002777, - 0.082836, 0.012048, -0.023236, -0.007401, -0.071598, 0.016760, 0.017282, 0.028306, -0.026220, -0.008016, -0.000202, -0.020271, -0.019828, -0.046986, -0.005805, -0.039647, - 0.042879, -0.004463, 0.007753, -0.028916, -0.020612, 0.028833, -0.039839, -0.052447, -0.013275, -0.002407, -0.018937, 0.033216, 0.075535, -0.045026, -0.009901, 0.016637, - -0.000322, 0.073925, -0.055701, 0.014912, -0.045671, -0.021189, 0.006761, -0.002015, -0.027410, 0.018250, -0.015916, 0.016254, -0.044964, 0.029261, -0.029319, -0.005222, - ], - dtype=torch.float32, -) -SYSTEM_EN_THINK_RECAPTION = torch.tensor( - [ - 0.011004, 0.017341, -0.019959, -0.018314, 0.016520, -0.027395, 0.017946, 0.039665, 0.000645, 0.035903, 0.002499, -0.045664, 0.039472, -0.013479, 0.081302, 0.000182, - 0.006947, 0.042845, 0.059741, -0.010796, -0.035240, 0.004176, 0.029557, -0.043467, -0.017271, 0.006896, 0.010997, 0.022498, -0.023308, -0.013046, -0.000742, 0.016209, - -0.007152, -0.029868, 0.028747, 0.033743, -0.000227, 0.018419, -0.015023, 0.050376, -0.098475, -0.002375, 0.007897, -0.023936, 0.007843, -0.122463, -0.011680, -0.027267, - -0.007270, 0.021869, -0.011415, 0.043770, 0.000551, 0.048573, 0.003132, 0.014233, 0.037080, -0.004818, -0.028738, 0.044468, 0.073843, 0.016947, 0.014484, 0.021931, - 0.020110, -0.032309, -0.003811, 0.095704, -0.006950, -0.007237, -0.005529, -0.020573, -0.016259, 0.041909, -0.038748, 0.018029, 0.005066, -0.021186, -0.020102, -0.019719, - 0.006239, -0.021284, 0.004213, -0.024963, 0.032345, -0.012557, 0.037268, -0.038075, 0.040998, -0.032766, -0.023509, 
-0.016426, -0.627412, 0.022675, 0.000101, 0.023162, - -0.002081, 0.015922, -0.138671, -0.027995, 0.011579, -0.042859, 0.019935, 0.038077, 0.012640, -0.017377, -0.027456, 0.035151, -0.015756, 0.018530, 0.004646, -0.002589, - 0.019645, -0.043736, 0.034947, -0.010166, -0.061165, -0.019195, 0.028909, -0.019415, -0.009485, 0.049566, 0.068621, -0.038644, 0.011278, -0.036133, 0.000564, 0.022611, - -0.013612, 0.020854, 0.030614, -0.025578, 0.005673, 0.076526, -0.004887, 0.027769, -0.022605, -0.092657, -0.013218, 0.008081, -0.015227, 0.018031, -0.005145, 0.015028, - -0.027193, 0.034767, 0.028710, 0.032007, -0.053175, 0.033528, 0.019437, 0.011517, 0.012107, -0.027679, -0.026937, 0.008612, 0.036909, -0.051484, -0.039971, -0.034372, - -0.023825, -0.003025, 0.033648, -0.001852, 0.007309, 0.000714, -0.001075, 0.038534, 0.007586, 0.016213, -0.025223, -0.001099, 0.015852, -0.011477, 0.020635, -0.010696, - -0.019634, 0.025613, 0.034374, 0.007169, -0.035000, -0.032268, 0.015114, -0.014217, -0.005229, -0.005495, 0.018189, -0.011360, -0.026755, 0.007036, -0.002333, -0.001174, - 0.014729, 0.001739, -0.108591, 0.004699, 0.002048, -0.014801, -0.042855, -0.028846, -0.009609, -0.004500, 0.019466, 0.021848, 0.022140, -0.063035, -0.004272, -0.030798, - -0.018452, 0.055169, 0.012240, -0.003555, 0.038293, 0.008503, -0.016608, -0.021309, 0.000690, 0.027093, 0.088054, -0.008881, 0.034087, 0.030647, 0.003284, 0.005038, - -0.008359, 0.006311, 0.032462, -0.009699, 0.035283, 0.015261, 0.012827, 0.038169, 0.033959, -0.018048, 0.018122, -0.025259, -0.040084, -0.030879, 0.019853, -0.042558, - -0.011938, 0.019602, 0.016537, -0.003378, -0.027890, -0.014909, -0.005464, -0.071862, 0.012335, 0.021899, -0.017008, -0.023228, 0.003263, 0.004571, 0.016447, -0.029446, - -0.022645, -0.001261, -0.018573, 0.007431, -0.027587, 0.035362, -0.006785, -0.000614, 0.026044, -0.009056, -0.009843, 0.010467, -0.011929, 0.042025, -0.014068, 0.023113, - 0.023880, 0.014948, 0.004370, -0.005262, 0.012587, 0.021608, -0.001783, 
0.023697, -0.024945, -0.011533, -0.020953, -0.007205, -0.024693, 0.012961, -0.168760, 0.001767, - -0.041265, -0.007044, 0.015021, -0.008407, 0.029642, -0.000956, 0.008607, -0.035365, -0.012187, 0.011744, 0.032612, 0.006226, -0.015891, -0.017747, -0.022565, -0.024505, - 0.031279, 0.004188, 0.011939, 0.038032, 0.008798, 0.012314, -0.024830, 0.034484, 0.076395, -0.060108, 0.001019, -0.016138, 0.067729, 0.003899, -0.029845, -0.019960, - 0.086663, 0.040965, -0.010458, 0.027808, -0.006394, 0.017343, 0.014788, 0.024756, 0.016446, -0.012537, -0.008406, -0.028109, 0.013369, -0.033571, 0.012170, -0.002199, - 0.005263, 0.052280, -0.018171, -0.047898, -0.010087, -0.038632, 0.006773, -0.000838, 0.011197, 0.038187, -0.049525, 0.021689, -0.007385, -0.005987, -0.094551, 0.019019, - 0.012760, 0.009617, -0.002262, 0.030228, 0.047823, -0.079764, 0.023391, -0.005561, 0.018866, 0.012817, 0.020878, 0.027037, -0.013905, -0.002874, -0.035522, -0.046266, - -0.032448, -0.036010, -0.007776, 0.016512, -0.012279, -0.005665, -0.057974, 0.016967, 0.202836, -0.009066, 0.066093, 0.045689, 0.018319, -0.048465, 0.000242, -0.040874, - 0.027824, -0.049045, -0.015616, -0.000307, 0.009163, -0.072975, -0.042979, -0.082254, 0.040549, -0.027049, 0.000725, -0.034118, -0.019604, -0.019097, -0.042483, -0.075446, - 0.019387, -0.005218, -0.053573, -0.029975, 0.008195, -0.036608, -0.018920, -0.025610, 0.028426, -0.002688, 0.074996, -0.003423, -0.032505, -0.030565, -0.028142, 0.014437, - 0.013359, 0.019376, 0.008356, -0.069731, 0.031824, -0.011103, 0.019327, -0.117090, -0.009352, -0.010290, -0.002129, -0.009198, -0.172915, 0.021232, 0.017274, -0.030060, - -0.061449, -0.006598, -0.013069, -0.012857, -0.081220, 0.019058, -0.004841, 0.003066, -0.037741, 0.041806, 0.018281, 0.009458, 0.036761, -0.044987, 0.003557, 0.008890, - -0.008011, -0.004063, -0.013474, -0.022090, 0.055398, 0.037475, 0.006991, -0.035962, 0.045503, -0.017162, 0.022391, 0.052754, -0.005924, -0.005936, 0.012673, -0.017922, - 0.084548, 0.014695, 
-0.013817, 0.000421, -0.065167, 0.018269, 0.023317, 0.023523, -0.034229, -0.019588, 0.007911, -0.002426, -0.017109, -0.050870, 0.002848, -0.033077, - 0.043451, -0.010609, -0.000375, -0.023206, -0.018155, 0.027102, -0.036006, -0.035115, -0.023922, 0.005989, -0.015372, 0.027123, 0.075210, -0.035302, -0.029799, 0.003642, - 0.007714, 0.063498, -0.053234, 0.015699, -0.040459, -0.027354, -0.002433, 0.010923, -0.020134, 0.029292, -0.010176, 0.013508, -0.032403, 0.004323, -0.017504, -0.015237, - ], - dtype=torch.float32, -) -SYSTEM_EN_VANILLA = torch.tensor( - [ - 0.010809, 0.021177, -0.017600, -0.016814, 0.012351, -0.024554, 0.018299, 0.039305, 0.003331, 0.030473, 0.005557, -0.040898, 0.047294, -0.016136, 0.076989, -0.002723, - 0.017622, 0.042330, 0.058266, -0.016232, -0.029502, 0.004529, 0.033543, -0.041481, -0.017631, 0.002727, 0.018874, 0.019932, -0.030052, -0.009997, 0.004582, 0.002135, - -0.003720, -0.030923, 0.021174, 0.034033, -0.007096, 0.011522, -0.009518, 0.055688, -0.092351, -0.003914, 0.004589, -0.032635, 0.012479, -0.140607, -0.014141, -0.031821, - 0.001396, 0.026780, -0.007623, 0.039957, 0.006434, 0.047516, 0.014377, 0.015237, 0.034212, 0.003576, -0.027357, 0.038888, 0.087272, 0.020248, 0.015165, 0.016002, - 0.020781, -0.040509, -0.008929, 0.080857, -0.002642, -0.009738, -0.005683, -0.000615, -0.012801, 0.046457, -0.045004, 0.024689, 0.002498, -0.017333, -0.027366, -0.023231, - -0.006064, -0.021505, 0.007405, -0.021249, 0.026252, -0.018690, 0.020093, -0.036954, 0.037510, -0.032027, -0.030871, -0.011173, -0.618627, 0.021213, -0.004366, 0.029555, - -0.004324, 0.020221, -0.143832, -0.021386, 0.010482, -0.042113, 0.016164, 0.040350, 0.014627, -0.011778, -0.018102, 0.035380, -0.020305, 0.010590, 0.009227, -0.011415, - 0.018623, -0.036384, 0.031003, -0.017073, -0.056456, -0.010423, 0.033029, -0.023511, -0.008717, 0.045716, 0.068273, -0.027886, 0.009665, -0.039801, 0.001465, 0.024361, - -0.015039, 0.022903, 0.033362, -0.022804, 0.008631, 0.076518, 0.000619, 
0.022786, -0.015435, -0.095242, -0.006092, 0.015496, -0.009081, 0.015740, 0.004280, 0.013103, - -0.031836, 0.034241, 0.031836, 0.032636, -0.053721, 0.034370, 0.019172, 0.018383, 0.006907, -0.036039, -0.027927, 0.008646, 0.040496, -0.060314, -0.039116, -0.021488, - -0.031682, -0.005077, 0.034920, 0.002148, -0.008087, 0.002024, -0.008480, 0.041096, 0.011401, 0.020380, -0.025078, 0.005002, 0.022252, -0.014577, 0.008051, -0.014476, - -0.007078, 0.021075, 0.036965, 0.005343, -0.038671, -0.037222, 0.014052, -0.009952, -0.003958, -0.001878, 0.017848, -0.016608, -0.030813, 0.010921, 0.001068, 0.003095, - 0.007076, -0.001936, -0.102996, 0.006838, -0.005243, -0.009140, -0.043796, -0.027227, -0.008426, -0.013177, 0.015602, 0.021036, 0.025484, -0.064836, -0.003593, -0.038036, - -0.023102, 0.064053, 0.007850, 0.000771, 0.039297, 0.011903, -0.015866, -0.017612, 0.006308, 0.024342, 0.086761, -0.016705, 0.039239, 0.025079, -0.006452, 0.003174, - -0.010146, 0.010787, 0.035932, -0.015346, 0.037191, 0.010990, 0.011573, 0.044958, 0.035560, -0.017339, 0.018878, -0.025394, -0.044339, -0.029852, 0.015951, -0.032248, - -0.012019, 0.013497, 0.012224, -0.001284, -0.034041, -0.015768, 0.000230, -0.086076, 0.024878, 0.031929, -0.016668, -0.019815, -0.001325, 0.007944, 0.017674, -0.036097, - -0.019651, -0.001272, -0.032842, 0.002056, -0.037140, 0.043191, -0.003710, 0.011767, 0.020313, -0.018396, -0.015935, 0.010228, -0.017349, 0.049363, -0.010007, 0.019533, - 0.018076, 0.016608, -0.005523, -0.007793, 0.016868, 0.019341, -0.008236, 0.026765, -0.025324, -0.007849, -0.023648, -0.007791, -0.018508, 0.015357, -0.166499, -0.003718, - -0.035447, -0.005229, 0.019327, -0.014207, 0.028433, -0.002619, 0.013888, -0.033146, -0.017015, 0.004677, 0.039554, 0.003803, -0.014592, -0.018886, -0.023868, -0.022708, - 0.033661, 0.008626, 0.015687, 0.046395, 0.014173, 0.015083, -0.025994, 0.039120, 0.076334, -0.061165, 0.001791, -0.017579, 0.067567, -0.002415, -0.032495, -0.025576, - 0.079027, 0.036370, -0.013303, 
0.030510, -0.009061, 0.019135, 0.015627, 0.024864, 0.015093, -0.017066, -0.014075, -0.021907, 0.017388, -0.033492, 0.013317, -0.000040, - 0.003396, 0.044030, -0.009194, -0.049524, -0.005015, -0.040007, 0.009104, 0.000580, 0.005603, 0.035891, -0.038913, 0.023239, -0.017022, -0.002695, -0.095759, 0.018503, - 0.017365, 0.011104, -0.003433, 0.024113, 0.052609, -0.085274, 0.027565, -0.005833, 0.020700, 0.015842, 0.019148, 0.020203, -0.000698, -0.005337, -0.037400, -0.060144, - -0.031893, -0.038396, -0.001949, 0.018901, -0.014268, -0.004721, -0.055913, 0.013814, 0.215024, -0.011357, 0.057530, 0.050092, 0.016513, -0.059254, 0.001494, -0.031472, - 0.032190, -0.047512, -0.020501, -0.002571, 0.007844, -0.063630, -0.043938, -0.079595, 0.032820, -0.021659, -0.003738, -0.035267, -0.013794, -0.021172, -0.046356, -0.077079, - 0.021526, -0.007447, -0.050276, -0.029743, 0.022208, -0.039137, -0.021426, -0.029825, 0.029390, -0.002943, 0.073158, -0.000435, -0.032029, -0.038524, -0.029886, 0.017473, - 0.013513, 0.022738, 0.000632, -0.073718, 0.029219, -0.018896, 0.007302, -0.116122, -0.013324, -0.012214, -0.005960, -0.003720, -0.155869, 0.019896, 0.016919, -0.021133, - -0.066911, -0.000926, -0.020871, -0.015295, -0.086108, 0.014918, -0.009284, 0.001689, -0.038155, 0.039163, 0.015988, 0.014413, 0.034205, -0.053273, 0.001687, 0.012227, - -0.007341, -0.006123, -0.005731, -0.026863, 0.060196, 0.028929, 0.019328, -0.033709, 0.038789, -0.015624, 0.013323, 0.053821, -0.015538, -0.001610, 0.012959, -0.013897, - 0.082010, 0.012866, -0.017269, 0.000017, -0.059458, 0.015870, 0.028455, 0.025234, -0.051163, -0.022976, 0.011866, -0.005613, -0.008738, -0.047658, -0.002155, -0.029432, - 0.039242, -0.013491, -0.001641, -0.024210, -0.019187, 0.026716, -0.025698, -0.027591, -0.034678, -0.002473, -0.019391, 0.017597, 0.064385, -0.029104, -0.034501, -0.004955, - 0.015008, 0.060749, -0.051693, 0.020279, -0.027170, -0.027003, 0.000254, 0.011352, -0.028116, 0.028938, -0.007224, 0.019978, -0.025379, -0.004874, 
-0.019361, -0.020278, - ], - dtype=torch.float32, -) -SYSTEM_EN_UNIFIED = torch.tensor( - [ - 0.011409, 0.014191, -0.023163, -0.020119, 0.019190, -0.029559, 0.019616, 0.035872, 0.010434, 0.028709, 0.011616, -0.039422, 0.038369, -0.004631, 0.081177, 0.007400, - 0.008903, 0.040408, 0.055323, -0.011950, -0.026940, 0.004916, 0.028101, -0.046200, -0.016732, 0.005115, 0.012100, 0.016136, -0.026057, -0.013827, -0.004914, 0.015261, - -0.010824, -0.028188, 0.022934, 0.026204, -0.003855, 0.013797, -0.014518, 0.050289, -0.100077, -0.002962, 0.009050, -0.028205, 0.016294, -0.128956, -0.012730, -0.023647, - -0.009306, 0.020066, 0.000033, 0.043619, 0.003250, 0.053425, 0.005889, 0.021529, 0.036032, -0.003254, -0.029715, 0.048345, 0.077978, 0.010674, 0.019296, 0.018721, - 0.019244, -0.040115, -0.004245, 0.085214, -0.005280, -0.010746, -0.000164, -0.023405, -0.015641, 0.040193, -0.038735, 0.018966, -0.004031, -0.017879, -0.023017, -0.030379, - 0.006468, -0.015959, 0.000532, -0.026530, 0.042640, -0.006095, 0.037899, -0.043658, 0.040965, -0.034682, -0.023729, -0.019291, -0.630840, 0.029658, 0.005462, 0.026650, - -0.000292, 0.013954, -0.149594, -0.019405, 0.015321, -0.045104, 0.030332, 0.031727, 0.012349, -0.009553, -0.022371, 0.034043, -0.014838, 0.015398, -0.003657, 0.000477, - 0.021084, -0.041406, 0.029946, -0.013832, -0.057358, -0.018086, 0.031598, -0.031835, -0.006697, 0.040866, 0.068602, -0.042203, 0.007362, -0.036959, 0.003794, 0.026533, - -0.011873, 0.017343, 0.028333, -0.021804, 0.004007, 0.075133, 0.003340, 0.025326, -0.015068, -0.092280, -0.011514, 0.006827, -0.008254, 0.021181, -0.005035, 0.022263, - -0.022443, 0.043919, 0.026637, 0.028568, -0.056881, 0.036740, 0.024430, 0.015891, 0.012257, -0.031126, -0.030108, 0.007229, 0.026998, -0.051685, -0.033003, -0.031170, - -0.024021, 0.004235, 0.030164, 0.002674, 0.008018, 0.005532, 0.001621, 0.044790, 0.006413, 0.027160, -0.015022, 0.000911, 0.019723, -0.016244, 0.020077, -0.006847, - -0.014110, 0.022461, 0.031656, 0.002760, 
-0.039078, -0.026893, 0.006628, -0.011775, -0.000240, -0.005908, 0.014943, -0.012131, -0.021755, 0.004732, -0.005297, -0.002922, - 0.014631, -0.002010, -0.112400, 0.000842, -0.002732, -0.014861, -0.052099, -0.034167, -0.011613, -0.006101, 0.013278, 0.018867, 0.026530, -0.068150, -0.003306, -0.032801, - -0.018523, 0.050875, 0.005488, -0.007241, 0.045707, 0.023119, -0.021519, -0.022683, 0.004806, 0.024827, 0.091371, -0.014424, 0.043836, 0.033094, 0.002390, 0.005450, - -0.004893, 0.013608, 0.031272, -0.002449, 0.031607, 0.014646, 0.014146, 0.043995, 0.028826, -0.012219, 0.021008, -0.020911, -0.036967, -0.036256, 0.013328, -0.038382, - -0.012084, 0.018183, 0.018782, -0.004697, -0.024284, -0.015474, -0.001463, -0.076015, 0.013923, 0.022125, -0.018765, -0.010793, 0.008409, 0.002067, 0.017961, -0.029716, - -0.020915, -0.001779, -0.009217, -0.001933, -0.036081, 0.042577, 0.000118, -0.013920, 0.014901, -0.016486, -0.010278, -0.000449, -0.017234, 0.042453, -0.009893, 0.021087, - 0.017671, 0.009861, -0.004210, 0.004944, 0.015627, 0.014370, -0.001128, 0.030247, -0.019552, -0.014017, -0.020859, -0.002614, -0.024405, 0.016532, -0.173204, -0.001196, - -0.037415, -0.010990, 0.010449, -0.006124, 0.019211, 0.003695, 0.011679, -0.031852, -0.009764, 0.005773, 0.035793, 0.003455, -0.011772, -0.020532, -0.027434, -0.024761, - 0.027483, -0.001554, 0.010411, 0.037888, 0.015619, 0.019186, -0.021204, 0.038158, 0.074991, -0.064521, -0.002503, -0.014499, 0.068165, 0.006145, -0.032891, -0.021540, - 0.091385, 0.047584, -0.009590, 0.028004, -0.002962, 0.021061, 0.014854, 0.025840, 0.016068, -0.014364, -0.016418, -0.033454, 0.011734, -0.036518, 0.013015, -0.003966, - 0.000855, 0.051373, -0.010960, -0.047078, -0.011048, -0.042015, 0.006818, 0.005483, 0.010251, 0.034951, -0.046162, 0.021258, -0.013397, -0.005259, -0.093775, 0.019974, - 0.014992, 0.004043, -0.005931, 0.035662, 0.050723, -0.083293, 0.028047, -0.008042, 0.020763, 0.016763, 0.022913, 0.027129, -0.014314, -0.009854, -0.039019, 
-0.044870, - -0.028101, -0.038026, -0.006294, 0.018265, -0.015425, -0.007866, -0.052784, 0.010470, 0.200260, -0.007798, 0.064482, 0.046612, 0.025353, -0.059695, -0.001831, -0.039643, - 0.025148, -0.042752, -0.014928, -0.010216, 0.014195, -0.069149, -0.041424, -0.078360, 0.036999, -0.021357, 0.011032, -0.026564, -0.016214, -0.023440, -0.044723, -0.064498, - 0.018283, -0.007165, -0.051802, -0.026299, 0.005867, -0.034691, -0.020621, -0.030512, 0.024458, -0.011330, 0.066558, -0.004069, -0.031624, -0.030639, -0.037451, 0.013079, - 0.015152, 0.008058, 0.009223, -0.069514, 0.030702, -0.009681, 0.014826, -0.115441, -0.005514, -0.011925, 0.001046, -0.007148, -0.164128, 0.018043, 0.017001, -0.026352, - -0.049691, -0.011637, -0.013045, -0.014851, -0.079469, 0.017692, -0.006575, 0.001063, -0.028299, 0.038777, 0.019930, 0.010641, 0.036955, -0.039004, -0.006477, 0.004278, - -0.001006, -0.002514, -0.017242, -0.023927, 0.049113, 0.038393, 0.011633, -0.031537, 0.041725, -0.012146, 0.023445, 0.049999, -0.008538, 0.001319, 0.012732, -0.021170, - 0.082096, 0.009610, -0.025717, 0.002566, -0.060849, 0.017403, 0.032650, 0.018658, -0.030629, -0.025032, 0.005555, 0.000522, -0.009667, -0.043099, 0.005939, -0.027156, - 0.045634, -0.011986, 0.002713, -0.032225, -0.015494, 0.028734, -0.036528, -0.033101, -0.027174, 0.009490, -0.016537, 0.029435, 0.065709, -0.037711, -0.020497, -0.005578, - 0.011768, 0.061035, -0.044676, 0.016113, -0.042945, -0.022579, 0.002430, 0.012474, -0.018198, 0.030468, -0.016646, 0.019020, -0.035804, 0.001175, -0.018312, -0.010760, - ], - dtype=torch.float32, -) -# fmt: on -SYSTEM_PROMPT_CASES = [ - pytest.param("none", None, SEED_1234, id="none"), - pytest.param("dynamic", "dynamic", SYSTEM_PROMPT_DYNAMIC, id="dynamic"), - pytest.param("en_vanilla", "en_vanilla", SYSTEM_EN_VANILLA, id="en_vanilla"), - pytest.param("en_recaption", "en_recaption", SYSTEM_EN_RECAPTION, id="en_recaption"), - pytest.param("en_think_recaption", "en_think_recaption", SYSTEM_EN_THINK_RECAPTION, 
id="en_think_recaption"), - pytest.param("en_unified", "en_unified", SYSTEM_EN_UNIFIED, id="en_unified"), -] - - -@pytest.fixture(scope="session") -def clip_bundle() -> tuple[CLIPModel, CLIPProcessor]: - try: - model = CLIPModel.from_pretrained(LOCAL_CLIP_PATH, local_files_only=True) - processor = CLIPProcessor.from_pretrained(LOCAL_CLIP_PATH, local_files_only=True) - except OSError as exc: - pytest.skip(f"Could not load CLIP model from local cache ({LOCAL_CLIP_PATH}): {exc}") - - model.eval() - return model, processor - - -@pytest.fixture(scope="module") -def omni() -> Generator[Omni, None, None]: - with OmniRunner( - MODEL_NAME, - stage_configs_path=str(STAGE_CONFIG_PATH), - ) as runner: - yield runner.omni - - -def _extract_generated_image(outputs: list[object]) -> Image.Image: - if not outputs: - raise AssertionError("No outputs were returned from Omni.generate()") - - first_output = outputs[0] - if images := getattr(first_output, "images", None): - return images[0] - - request_output = getattr(first_output, "request_output", None) - if request_output is not None and (images := getattr(request_output, "images", None)): - return images[0] - - raise AssertionError("No generated image found in Omni output") - - -def extract_embedding(image: Image.Image, clip_model: CLIPModel, clip_processor: CLIPProcessor) -> torch.Tensor: - inputs = clip_processor(images=image.convert("RGB"), return_tensors="pt") - with torch.inference_mode(): - features = clip_model.get_image_features(**inputs) - features = F.normalize(features, p=2, dim=-1) - return features.squeeze(0) - - -def compare_semantic( - expected_embedding: torch.Tensor, - image: Image.Image, - clip_model: CLIPModel, - clip_processor: CLIPProcessor, -) -> float: - features = extract_embedding(image, clip_model, clip_processor) - expected = F.normalize(expected_embedding, p=2, dim=-1) - return torch.dot(expected, features).item() - - -def _generate_image(omni: Omni, use_system_prompt: str | None) -> Image.Image: - 
generator_device = current_omni_platform.device_type or "cuda" - sampling_params = OmniDiffusionSamplingParams( - seed=1234, - generator=torch.Generator(device=generator_device).manual_seed(1234), - num_outputs_per_prompt=1, - ) - if use_system_prompt is not None: - sampling_params.extra_args = {"use_system_prompt": use_system_prompt} - - outputs = omni.generate({"prompt": PROMPT}, sampling_params) - return _extract_generated_image(outputs) - - -@pytest.mark.skipif(torch.cuda.device_count() < 8, reason="Need at least 8 CUDA GPUs for this test.") -@pytest.mark.parametrize("system_prompt_name,use_system_prompt,expected_embedding", SYSTEM_PROMPT_CASES) -def test_system_prompt_scores( - omni: Omni, - clip_bundle: tuple[CLIPModel, CLIPProcessor], - system_prompt_name: str, - use_system_prompt: str | None, - expected_embedding: torch.Tensor, -) -> None: - clip_model, clip_processor = clip_bundle - generated_image = _generate_image(omni, use_system_prompt) - score = compare_semantic(expected_embedding, generated_image, clip_model, clip_processor) - - print(f"{system_prompt_name}: CLIP cosine similarity = {score:.6f}") diff --git a/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py b/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py deleted file mode 100644 index 07aa5a647be..00000000000 --- a/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py +++ /dev/null @@ -1,243 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import hashlib -import os -import subprocess -import sys -from pathlib import Path - -import numpy as np -import pytest -from PIL import Image - -from tests.helpers.mark import hardware_test - -REPO_ROOT = Path(__file__).resolve().parents[3] -T2V_EXAMPLE = REPO_ROOT / "examples" / "offline_inference" / "text_to_video" / "text_to_video.py" -I2V_EXAMPLE = REPO_ROOT / "examples" / "offline_inference" / "image_to_video" / "image_to_video.py" - -T2V_PROMPT = ( - "At 
sunrise, a glowing paper lantern boat drifts through a narrow canal between mossy stone walls, " - "soft fog above the water, the camera slowly gliding forward as golden reflections shimmer across " - "the ripples, cinematic, realistic, highly detailed." -) -T2V_NEGATIVE_PROMPT = "worst quality, blurry, jittery motion, distorted, oversaturated, artifacts" -I2V_PROMPT = "A cinematic dolly shot of a boat drifting on calm water at sunset" -I2V_NEGATIVE_PROMPT = "worst quality, blurry, jittery motion" - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - - -def _get_ltx2_model() -> str: - return os.environ.get("VLLM_TEST_LTX2_MODEL", "Lightricks/LTX-2") - - -def _md5(path: Path) -> str: - digest = hashlib.md5(usedforsecurity=False) - with path.open("rb") as f: - for chunk in iter(lambda: f.read(1024 * 1024), b""): - digest.update(chunk) - return digest.hexdigest() - - -def _make_deterministic_test_image(path: Path) -> None: - """Create a deterministic 256x256 test image for I2V tests.""" - rng = np.random.RandomState(42) - img = Image.fromarray(rng.randint(0, 255, (256, 256, 3), dtype=np.uint8)) - img.save(path) - - -def _run_and_check(cmd: list[str], env: dict, output_path: Path, expected_md5: str) -> None: - result = subprocess.run(cmd, cwd=REPO_ROOT, env=env, capture_output=True, text=True, check=False) - assert result.returncode == 0, ( - f"Command failed (exit {result.returncode}).\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" - ) - generated_md5 = _md5(output_path) - assert generated_md5 == expected_md5, ( - f"Unexpected output md5: {generated_md5} != {expected_md5}.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" - ) - - -# ── T2V tests ── - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@pytest.mark.parallel -@pytest.mark.slow -@hardware_test(res={"cuda": "L4"}, num_cards=2) -def test_ltx2_t2v_cfg_parallel(tmp_path: Path): - """T2V with CFG=4.0, cfg-parallel-size=2.""" - output = tmp_path / "t2v_cfg4.mp4" - env = os.environ.copy() - 
env.setdefault("CUDA_VISIBLE_DEVICES", "0,1") - cmd = [ - sys.executable, - str(T2V_EXAMPLE), - "--model", - _get_ltx2_model(), - "--prompt", - T2V_PROMPT, - "--negative-prompt", - T2V_NEGATIVE_PROMPT, - "--height", - "256", - "--width", - "256", - "--num-frames", - "145", - "--num-inference-steps", - "6", - "--guidance-scale", - "4.0", - "--frame-rate", - "24", - "--fps", - "24", - "--seed", - "42", - "--cfg-parallel-size", - "2", - "--enforce-eager", - "--output", - str(output), - ] - _run_and_check(cmd, env, output, expected_md5="08e606b9c522fee4b6f30cee8b77db40") - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@pytest.mark.slow -@hardware_test(res={"cuda": "L4"}, num_cards=1) -def test_ltx2_t2v_no_cfg(tmp_path: Path): - """T2V with CFG=1.0 (no classifier-free guidance).""" - output = tmp_path / "t2v_nocfg.mp4" - env = os.environ.copy() - env.setdefault("CUDA_VISIBLE_DEVICES", "0") - cmd = [ - sys.executable, - str(T2V_EXAMPLE), - "--model", - _get_ltx2_model(), - "--prompt", - T2V_PROMPT, - "--height", - "256", - "--width", - "256", - "--num-frames", - "145", - "--num-inference-steps", - "6", - "--guidance-scale", - "1.0", - "--frame-rate", - "24", - "--fps", - "24", - "--seed", - "42", - "--enforce-eager", - "--output", - str(output), - ] - _run_and_check(cmd, env, output, expected_md5="a83994b94b6e67c54a524e0383c45ce8") - - -# ── I2V tests ── - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@pytest.mark.parallel -@pytest.mark.slow -@hardware_test(res={"cuda": "L4"}, num_cards=2) -def test_ltx2_i2v_cfg_parallel(tmp_path: Path): - """I2V with CFG=4.0, cfg-parallel-size=2.""" - test_image = tmp_path / "test_input.png" - _make_deterministic_test_image(test_image) - output = tmp_path / "i2v_cfg4.mp4" - env = os.environ.copy() - env.setdefault("CUDA_VISIBLE_DEVICES", "0,1") - cmd = [ - sys.executable, - str(I2V_EXAMPLE), - "--model", - _get_ltx2_model(), - "--model-class-name", - "LTX2ImageToVideoPipeline", - "--image", - str(test_image), - 
"--prompt", - I2V_PROMPT, - "--negative-prompt", - I2V_NEGATIVE_PROMPT, - "--height", - "256", - "--width", - "256", - "--num-frames", - "73", - "--num-inference-steps", - "6", - "--guidance-scale", - "4.0", - "--frame-rate", - "24", - "--fps", - "24", - "--seed", - "42", - "--cfg-parallel-size", - "2", - "--enforce-eager", - "--output", - str(output), - ] - _run_and_check(cmd, env, output, expected_md5="aed7e56084b36373244d8f839b16d115") - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@pytest.mark.slow -@hardware_test(res={"cuda": "L4"}, num_cards=1) -def test_ltx2_i2v_no_cfg(tmp_path: Path): - """I2V with CFG=1.0 (no classifier-free guidance).""" - test_image = tmp_path / "test_input.png" - _make_deterministic_test_image(test_image) - output = tmp_path / "i2v_nocfg.mp4" - env = os.environ.copy() - env.setdefault("CUDA_VISIBLE_DEVICES", "0") - cmd = [ - sys.executable, - str(I2V_EXAMPLE), - "--model", - _get_ltx2_model(), - "--model-class-name", - "LTX2ImageToVideoPipeline", - "--image", - str(test_image), - "--prompt", - I2V_PROMPT, - "--height", - "256", - "--width", - "256", - "--num-frames", - "73", - "--num-inference-steps", - "6", - "--guidance-scale", - "1.0", - "--frame-rate", - "24", - "--fps", - "24", - "--seed", - "42", - "--enforce-eager", - "--output", - str(output), - ] - _run_and_check(cmd, env, output, expected_md5="81b21ede12753e9e14a357a6c548b666") diff --git a/tests/e2e/offline_inference/test_magi_human.py b/tests/e2e/offline_inference/test_magi_human.py deleted file mode 100644 index 6d46141729e..00000000000 --- a/tests/e2e/offline_inference/test_magi_human.py +++ /dev/null @@ -1,141 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""End-to-end tests for MagiHuman pipeline via vLLM-Omni.""" - -import io - -import av -import numpy as np -import pytest - -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner -from 
vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes -from vllm_omni.inputs.data import OmniDiffusionSamplingParams - - -def _validate_mp4(video_bytes: bytes, min_frames: int = 10) -> None: - """Validate that the MP4 contains meaningful video and audio tracks.""" - container = av.open(io.BytesIO(video_bytes)) - - v_streams = [s for s in container.streams if s.type == "video"] - assert len(v_streams) >= 1, "No video stream found in MP4" - - a_streams = [s for s in container.streams if s.type == "audio"] - assert len(a_streams) >= 1, "No audio stream found in MP4" - - v_stream = v_streams[0] - assert v_stream.width >= 1080, f"Unexpected video width: {v_stream.width}" - assert v_stream.height >= 1056, f"Unexpected video height: {v_stream.height}" - - frame_count = 0 - for frame in container.decode(video=0): - frame_count += 1 - if frame_count >= min_frames: - break - assert frame_count >= min_frames, f"Video has only {frame_count} frames (expected >= {min_frames})" - - container.close() - - -@pytest.mark.core_model -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}, num_cards=2) -def test_magi_human_e2e(run_level): - """End-to-end test for MagiHuman generating video and audio.""" - if run_level != "advanced_model": - pytest.skip("MagiHuman e2e test requires advanced_model run level with real weights.") - - model_path = "SII-GAIR/daVinci-MagiHuman-Base-1080p" - - prompt = ( - "A young woman with long, wavy golden blonde hair and bright blue eyes, " - "wearing a fitted ivory silk blouse with a delicate lace collar, sits " - "stationary in front of a softly lit, blurred warm-toned interior. Her " - "overall disposition is warm, composed, and gently confident. The camera " - "holds a static medium close-up, framing her from the shoulders up, " - "with shallow depth of field keeping her face in sharp focus. 
Soft " - "directional key light falls from the upper left, casting a gentle " - "highlight along her cheekbone and nose bridge. She draws a quiet breath, " - "the levator labii superiors relaxing as her lips part. She speaks in " - "clear, warm, unhurried American English: " - "\"The most beautiful things in life aren't things at all — " - "they're moments, feelings, and the people who make you feel truly alive.\" " - "Her jaw descends smoothly on each stressed syllable; the orbicularis oris " - "shapes each vowel with precision. A faint, genuine smile engages the " - "zygomaticus major, lifting her lip corners fractionally. Her brows rest " - "in a soft, neutral arch throughout. She maintains steady, forward-facing " - "eye contact. Head position remains level; no torso displacement occurs.\n\n" - "Dialogue:\n" - ": " - "\"The most beautiful things in life aren't things at all — " - "they're moments, feelings, and the people who make you feel truly alive.\"\n\n" - "Background Sound:\n" - "" - ) - - sampling_params = OmniDiffusionSamplingParams( - height=256, - width=448, - num_inference_steps=8, - seed=52, - extra_args={ - "seconds": 5, - "sr_height": 1080, - "sr_width": 1920, - "sr_num_inference_steps": 5, - }, - ) - - with OmniRunner( - model_path, - init_timeout=1200, - tensor_parallel_size=2, - ) as runner: - omni = runner.omni - outputs = list( - omni.generate( - prompts=[prompt], - sampling_params_list=[sampling_params], - ) - ) - - assert len(outputs) > 0, "No outputs returned" - first = outputs[0] - - assert hasattr(first, "images") and first.images, "No video frames in output" - video_frames = first.images[0] - assert isinstance(video_frames, np.ndarray), f"Expected numpy array, got {type(video_frames)}" - assert video_frames.ndim == 4, f"Expected 4D array (T,H,W,3), got shape {video_frames.shape}" - - mm = first.multimodal_output - assert mm, "multimodal_output is empty or missing" - - audio_waveform = mm.get("audio") - assert audio_waveform is not None, 
"No audio waveform in multimodal_output" - - audio_sample_rate = mm.get("audio_sample_rate") - assert audio_sample_rate is not None, ( - "audio_sample_rate not found in multimodal_output; model post-process must propagate it" - ) - assert isinstance(audio_sample_rate, (int, float)), ( - f"audio_sample_rate should be numeric, got {type(audio_sample_rate)}" - ) - assert int(audio_sample_rate) > 0, f"audio_sample_rate must be positive, got {audio_sample_rate}" - - fps = mm.get("fps") - assert fps is not None, "fps not found in multimodal_output; model post-process must propagate it" - assert isinstance(fps, (int, float)), f"fps should be numeric, got {type(fps)}" - assert int(fps) > 0, f"fps must be positive, got {fps}" - - video_bytes = mux_video_audio_bytes( - video_frames, - audio_waveform, - fps=float(fps), - audio_sample_rate=int(audio_sample_rate), - ) - assert isinstance(video_bytes, bytes), f"Expected MP4 bytes, got {type(video_bytes)}" - assert len(video_bytes) > 1000, f"MP4 too small ({len(video_bytes)} bytes)" - - _validate_mp4(video_bytes) diff --git a/tests/e2e/offline_inference/test_mammoth_moda2.py b/tests/e2e/offline_inference/test_mammoth_moda2.py index c3d95844c11..5293b5ed1b7 100644 --- a/tests/e2e/offline_inference/test_mammoth_moda2.py +++ b/tests/e2e/offline_inference/test_mammoth_moda2.py @@ -23,8 +23,7 @@ import torch from vllm.sampling_params import SamplingParams -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner +from tests.utils import hardware_test os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" @@ -117,6 +116,8 @@ def test_mammothmoda2_t2i_e2e(): - A fixed set of pixel values matches a golden reference (regenerate with ``UPDATE_GOLDEN=1``). 
""" + from vllm_omni import Omni + if not Path(MODEL_PATH).exists(): pytest.skip(f"Model weights not found at {MODEL_PATH}") if not Path(T2I_STAGE_CONFIG).exists(): @@ -134,8 +135,8 @@ def test_mammothmoda2_t2i_e2e(): prompt_text = "A cat sitting on a laptop keyboard" formatted_prompt = _format_t2i_prompt(prompt_text, ar_width, ar_height) - with OmniRunner(MODEL_PATH, stage_configs_path=T2I_STAGE_CONFIG, trust_remote_code=True) as runner: - omni = runner.omni + omni = Omni(model=MODEL_PATH, stage_configs_path=T2I_STAGE_CONFIG, trust_remote_code=True) + try: # Greedy / deterministic sampling so pixel values are reproducible. ar_sampling = SamplingParams( temperature=0.0, @@ -210,3 +211,5 @@ def test_mammothmoda2_t2i_e2e(): found_image = True assert found_image, "No image tensor found in pipeline output" + finally: + omni.close() diff --git a/tests/e2e/offline_inference/test_ming_flash_omni.py b/tests/e2e/offline_inference/test_ming_flash_omni.py deleted file mode 100644 index ca0b0fe0d96..00000000000 --- a/tests/e2e/offline_inference/test_ming_flash_omni.py +++ /dev/null @@ -1,195 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" - -from pathlib import Path - -import pytest - -from tests.helpers.mark import hardware_test -from tests.helpers.media import ( - generate_synthetic_audio, - generate_synthetic_image, - generate_synthetic_video, -) -from tests.helpers.stage_config import modify_stage_config - -models = ["Jonathan1909/Ming-flash-omni-2.0"] - -# Ming-specific -SYSTEM_PROMPT = "你是一个友好的AI助手。\n\ndetailed thinking off" -EOS_TOKEN = "<|role_end|>" -IMAGE_TOKEN = "" -VIDEO_TOKEN = "
", "STOP"]) - result = omni._prepare_prefill_sampling_params("req-1", sp) - assert result.stop == [] - - def test_clears_stop_token_ids(self, monkeypatch): - omni = _make_pd_omni( - monkeypatch, - [ - _prefill_stage_cfg(), - _decode_stage_cfg(engine_input_source=[0]), - ], - ) - sp = SamplingParams(max_tokens=2048, stop_token_ids=[151643, 151644]) - result = omni._prepare_prefill_sampling_params("req-1", sp) - assert result.stop_token_ids == [] - - def test_clears_include_stop_str_in_output(self, monkeypatch): - omni = _make_pd_omni( - monkeypatch, - [ - _prefill_stage_cfg(), - _decode_stage_cfg(engine_input_source=[0]), - ], - ) - sp = SamplingParams(max_tokens=2048, include_stop_str_in_output=True) - result = omni._prepare_prefill_sampling_params("req-1", sp) - assert result.include_stop_str_in_output is False - - def test_original_sp_unchanged(self, monkeypatch): - omni = _make_pd_omni( - monkeypatch, - [ - _prefill_stage_cfg(), - _decode_stage_cfg(engine_input_source=[0]), - ], - ) - sp = SamplingParams(max_tokens=2048, stop=["
"], stop_token_ids=[151643]) - _ = omni._prepare_prefill_sampling_params("req-1", sp) - assert sp.stop == [""] - assert sp.stop_token_ids == [151643] - - -# =================================================================== -# Tests: Failure mode & memory leak prevention -# =================================================================== -# NOTE: Full generate()-level failure mode tests are removed for now. -# The _run_generation error handler (line 1344-1350 in omni.py) calls -# _drop_pd_kv_params but does not increment completed_requests, causing -# the while-loop to hang. These tests need to be revisited once the -# production error-handling path is fixed to properly terminate on -# stage errors. - - -# =================================================================== -# Tests: TP size validation -# =================================================================== - - -class TestTPSizeValidation: - """Tests that _validate_pd_separation_config checks tensor_parallel_size.""" - - def test_matching_tp_passes(self, monkeypatch): - """Same TP size should not raise.""" - prefill_cfg = _prefill_stage_cfg() - prefill_cfg["engine_args"]["tensor_parallel_size"] = 2 - decode_cfg = _decode_stage_cfg(engine_input_source=[0]) - decode_cfg["engine_args"]["tensor_parallel_size"] = 2 - omni = _make_pd_omni(monkeypatch, [prefill_cfg, decode_cfg]) - assert omni._pd_separation_pair == (0, 1) - - def test_mismatched_tp_raises(self, monkeypatch): - """Different TP sizes should raise ValueError.""" - prefill_cfg = _prefill_stage_cfg() - prefill_cfg["engine_args"]["tensor_parallel_size"] = 2 - decode_cfg = _decode_stage_cfg(engine_input_source=[0]) - decode_cfg["engine_args"]["tensor_parallel_size"] = 4 - with pytest.raises(ValueError, match="tensor_parallel_size"): - _make_pd_omni(monkeypatch, [prefill_cfg, decode_cfg]) - - def test_default_tp_no_error(self, monkeypatch): - """Stages without explicit TP (defaults to 1) should pass.""" - omni = _make_pd_omni( - monkeypatch, - [ 
- _prefill_stage_cfg(), - _decode_stage_cfg(engine_input_source=[0]), - ], - ) - assert omni._pd_separation_pair == (0, 1) diff --git a/tests/entrypoints/test_realtime_connection_helpers.py b/tests/entrypoints/test_realtime_connection_helpers.py deleted file mode 100644 index e795aa92d0f..00000000000 --- a/tests/entrypoints/test_realtime_connection_helpers.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for realtime streaming helpers (PR #2581 /v1/realtime path).""" - -from __future__ import annotations - -import base64 - -import numpy as np -import pytest -import torch -from vllm.sampling_params import RequestOutputKind, SamplingParams - -from vllm_omni.entrypoints.async_omni import AsyncOmni -from vllm_omni.entrypoints.openai.realtime_connection import RealtimeConnection - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -@pytest.fixture -def realtime_conn() -> RealtimeConnection: - return RealtimeConnection.__new__(RealtimeConnection) - - -class TestRealtimeConnectionTensorAndPcm: - def test_tensor_to_numpy_none(self) -> None: - assert RealtimeConnection._tensor_to_numpy(None) is None - - def test_tensor_to_numpy_1d_numpy(self) -> None: - arr = np.array([1.0, 2.0], dtype=np.float64) - out = RealtimeConnection._tensor_to_numpy(arr) - assert out is not None - assert out.dtype == np.float32 - assert out.shape == (2,) - - def test_tensor_to_numpy_2d_numpy_flattened(self) -> None: - arr = np.array([[0.5], [-0.5]], dtype=np.float32) - out = RealtimeConnection._tensor_to_numpy(arr) - assert out is not None - assert out.shape == (2,) - - def test_tensor_to_numpy_torch(self) -> None: - t = torch.tensor([[0.25, -0.25]], dtype=torch.float32) - out = RealtimeConnection._tensor_to_numpy(t) - assert out is not None - assert out.shape == (2,) - np.testing.assert_allclose(out, [0.25, -0.25], rtol=1e-5) - - def test_pcm16_b64_roundtrip(self) -> None: - audio = 
np.array([0.0, 1.0, -1.0], dtype=np.float32) - b64 = RealtimeConnection._pcm16_b64(audio) - raw = base64.b64decode(b64) - assert len(raw) == 6 - pcm = np.frombuffer(raw, dtype=np.int16) - assert pcm[0] == 0 - assert pcm[1] == 32767 - assert pcm[2] == -32767 - - -class TestAsyncOmniStreamingParamsValidation: - def test_accepts_streaming_friendly_params(self) -> None: - p = SamplingParams( - n=1, - stop=[], - output_kind=RequestOutputKind.DELTA, - ) - AsyncOmni._validate_streaming_input_sampling_params(p) - - def test_rejects_non_sampling_params(self) -> None: - with pytest.raises(ValueError, match="Input streaming"): - AsyncOmni._validate_streaming_input_sampling_params(object()) # type: ignore[arg-type] - - def test_rejects_n_greater_than_one(self) -> None: - p = SamplingParams(n=2, stop=[], output_kind=RequestOutputKind.DELTA) - with pytest.raises(ValueError, match="Input streaming"): - AsyncOmni._validate_streaming_input_sampling_params(p) - - def test_rejects_final_only(self) -> None: - p = SamplingParams(n=1, stop=[], output_kind=RequestOutputKind.FINAL_ONLY) - with pytest.raises(ValueError, match="Input streaming"): - AsyncOmni._validate_streaming_input_sampling_params(p) - - def test_rejects_stop_strings(self) -> None: - p = SamplingParams(n=1, stop=["\n"], output_kind=RequestOutputKind.DELTA) - with pytest.raises(ValueError, match="Input streaming"): - AsyncOmni._validate_streaming_input_sampling_params(p) diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py deleted file mode 100644 index e60afc9cd7b..00000000000 --- a/tests/entrypoints/test_serve.py +++ /dev/null @@ -1,211 +0,0 @@ -"""Unit tests for the Omni serve CLI helpers.""" - -from __future__ import annotations - -import argparse - -import pytest -from pytest_mock import MockerFixture - -from vllm_omni.entrypoints.cli.serve import OmniServeCommand, run_headless -from vllm_omni.entrypoints.utils import detect_explicit_cli_keys - -pytestmark = [pytest.mark.core_model, 
pytest.mark.cpu] - - -def test_serve_parser_accepts_no_async_chunk_and_marks_it_explicit() -> None: - """``--no-async-chunk`` should parse to ``async_chunk=False`` and mark the - shared deploy-level dest as explicitly provided by the user.""" - try: - from vllm.utils.argparse_utils import FlexibleArgumentParser - except Exception as exc: - pytest.skip(f"Cannot build parser in this environment: {exc}") - - root = FlexibleArgumentParser() - subparsers = root.add_subparsers(dest="subcommand") - cmd = OmniServeCommand() - serve_parser = cmd.subparser_init(subparsers) - - argv = ["serve", "fake-model", "--omni", "--no-async-chunk"] - args = root.parse_args(argv) - - assert args.async_chunk is False - explicit = detect_explicit_cli_keys(argv, serve_parser) - assert "async_chunk" in explicit - - -def _make_headless_args() -> argparse.Namespace: - return argparse.Namespace( - model="fake-model", - stage_id=3, - omni_master_address="127.0.0.1", - omni_master_port=26000, - api_server_count=0, - worker_backend="multi_process", - stage_configs_path=None, - log_stats=False, - disable_log_stats=False, - ) - - -def test_run_headless_registers_stage_once_and_launches_all_local_engines(mocker: MockerFixture) -> None: - args = _make_headless_args() - stage_cfg = mocker.Mock(stage_id=3) - stage_cfgs = [stage_cfg] - parallel_config = mocker.Mock( - data_parallel_size_local=2, - data_parallel_rank=4, - data_parallel_rank_local=1, - node_rank_within_dp=0, - ) - vllm_config = mocker.Mock(parallel_config=parallel_config) - executor_class = mocker.Mock() - engine_manager = mocker.Mock() - - mocker.patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - return_value=("/fake/stages.yaml", stage_cfgs), - ) - mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") - mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=mocker.Mock()) - mocker.patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", 
return_value={}) - mocker.patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}) - mocker.patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ) - mock_build_vllm_config = mocker.patch( - "vllm_omni.engine.stage_init_utils.build_vllm_config", - return_value=(vllm_config, executor_class), - ) - mock_register = mocker.patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ) - mock_manager_cls = mocker.patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) - mocker.patch("signal.signal") - run_headless(args) - - mock_build_vllm_config.assert_called_once_with( - stage_cfg, - "fake-model", - stage_connector_spec={}, - engine_args_dict={}, - headless=True, - ) - mock_register.assert_called_once_with( - omni_master_address="127.0.0.1", - omni_master_port=26000, - omni_stage_id=3, - omni_stage_config=stage_cfg, - coordinator=None, - ) - mock_manager_cls.assert_called_once() - manager_kwargs = mock_manager_cls.call_args.kwargs - assert manager_kwargs["local_engine_count"] == 2 - assert manager_kwargs["start_index"] == 4 - assert manager_kwargs["local_start_index"] == 0 - assert manager_kwargs["local_client"] is False - assert manager_kwargs["handshake_address"] == "tcp://127.0.0.1:26001" - assert manager_kwargs["log_stats"] is False - engine_manager.join_first.assert_called_once_with() - engine_manager.shutdown.assert_called_once_with() - - -def test_run_headless_honors_explicit_log_stats_flag(mocker: MockerFixture) -> None: - args = _make_headless_args() - args.log_stats = True - stage_cfg = mocker.Mock(stage_id=3) - stage_cfgs = [stage_cfg] - parallel_config = mocker.Mock( - data_parallel_size_local=2, - data_parallel_rank=4, - data_parallel_rank_local=1, - node_rank_within_dp=0, - ) - vllm_config = mocker.Mock(parallel_config=parallel_config) - executor_class = 
mocker.Mock() - engine_manager = mocker.Mock() - - mocker.patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - return_value=("/fake/stages.yaml", stage_cfgs), - ) - mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") - mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=mocker.Mock()) - mocker.patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}) - mocker.patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}) - mocker.patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ) - mocker.patch( - "vllm_omni.engine.stage_init_utils.build_vllm_config", - return_value=(vllm_config, executor_class), - ) - mocker.patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ) - mock_manager_cls = mocker.patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) - mocker.patch("signal.signal") - run_headless(args) - - manager_kwargs = mock_manager_cls.call_args.kwargs - assert manager_kwargs["log_stats"] is True - - -def test_run_headless_launches_diffusion_stage_via_omni_master(mocker: MockerFixture) -> None: - args = _make_headless_args() - stage_cfg = mocker.Mock(stage_id=3, stage_type="diffusion") - stage_cfg.engine_args = mocker.Mock() - stage_cfg.engine_input_source = [] - stage_cfgs = [stage_cfg] - metadata = mocker.Mock(stage_id=3) - od_config = mocker.Mock() - proc = mocker.Mock() - proc.exitcode = 0 - proc.is_alive.return_value = False - - mocker.patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - return_value=("/fake/stages.yaml", stage_cfgs), - ) - mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") - mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", 
return_value=mocker.Mock()) - mocker.patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ) - mocker.patch("vllm_omni.engine.stage_init_utils.extract_stage_metadata", return_value=metadata) - mock_inject_stage_info = mocker.patch("vllm_omni.engine.stage_init_utils.inject_kv_stage_info") - mocker.patch("vllm_omni.engine.stage_init_utils.build_diffusion_config", return_value=od_config) - mock_register = mocker.patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value=("tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), - ) - mock_spawn = mocker.patch( - "vllm_omni.diffusion.stage_diffusion_proc.spawn_diffusion_proc", - return_value=(proc, "tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), - ) - mock_handshake = mocker.patch("vllm_omni.diffusion.stage_diffusion_proc.complete_diffusion_handshake") - mocker.patch("signal.signal") - run_headless(args) - - mock_inject_stage_info.assert_called_once_with(stage_cfg, 3) - mock_register.assert_called_once_with( - omni_master_address="127.0.0.1", - omni_master_port=26000, - omni_stage_id=3, - omni_stage_config=stage_cfg, - return_addresses=True, - ) - mock_spawn.assert_called_once_with( - "fake-model", - od_config, - handshake_address="tcp://127.0.0.1:26001", - request_address="tcp://127.0.0.1:26002", - response_address="tcp://127.0.0.1:26003", - ) - mock_handshake.assert_called_once_with(proc, "tcp://127.0.0.1:26001") - proc.join.assert_called_once_with() diff --git a/tests/entrypoints/test_stage_utils.py b/tests/entrypoints/test_stage_utils.py index 15ee9c32a4e..2bb2231ccb8 100644 --- a/tests/entrypoints/test_stage_utils.py +++ b/tests/entrypoints/test_stage_utils.py @@ -6,6 +6,8 @@ from vllm_omni.entrypoints.stage_utils import set_stage_devices +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + def _make_dummy_torch(call_log): class _Props: @@ 
-53,8 +55,6 @@ def _make_mock_platform(mocker, device_type: str = "cuda", env_var: str = "CUDA_ return mock_platform -@pytest.mark.core_model -@pytest.mark.cpu @pytest.mark.usefixtures("clean_gpu_memory_between_tests") def test_set_stage_devices_respects_logical_ids(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): # Preserve an existing logical mapping and ensure devices "0,1" map through it. @@ -75,8 +75,6 @@ def test_set_stage_devices_respects_logical_ids(mocker: MockerFixture, monkeypat assert os.environ["CUDA_VISIBLE_DEVICES"] == "6,7" -@pytest.mark.core_model -@pytest.mark.cpu @pytest.mark.usefixtures("clean_gpu_memory_between_tests") def test_set_stage_devices_handles_not_enough_devices(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): # Preserve an existing logical mapping and ensure devices "0,1" map through it. @@ -92,10 +90,9 @@ def test_set_stage_devices_handles_not_enough_devices(mocker: MockerFixture, mon mock_platform, ) - # Keep the logical mapping and resolve to the visible subset. 
- set_stage_devices(stage_id=0, devices="0,1,2,3") - - assert os.environ["CUDA_VISIBLE_DEVICES"] == "6,7" + # Raise since we need 4 GPUs, but we only have 2 visible + with pytest.raises(ValueError): + set_stage_devices(stage_id=0, devices="0,1,2,3") @pytest.mark.usefixtures("clean_gpu_memory_between_tests") diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/test_utils.py index 6e52e4c6c0c..352ed2aad9b 100644 --- a/tests/entrypoints/test_utils.py +++ b/tests/entrypoints/test_utils.py @@ -5,21 +5,14 @@ from dataclasses import dataclass import pytest -import torch from pytest_mock import MockerFixture -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm_omni.config.yaml_util import create_config from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.engine.arg_utils import OmniEngineArgs -from vllm_omni.engine.async_omni_engine import AsyncOmniEngine from vllm_omni.entrypoints.utils import ( _convert_dataclasses_to_dict, _filter_dict_like_object, - coerce_param_message_types, filter_dataclass_kwargs, - load_and_resolve_stage_configs, - load_stage_configs_from_yaml, resolve_model_config_path, ) @@ -311,159 +304,3 @@ def mock_exists(path): assert result is not None assert "glm_image.yaml" in result - - def test_voxcpm_transformers_format_resolution(self, mocker: MockerFixture): - """Test VoxCPM transformers config resolves to the voxcpm stage config.""" - mocker.patch( - "vllm_omni.entrypoints.utils.get_config", - side_effect=ValueError("missing transformers config"), - ) - mocker.patch( - "vllm_omni.entrypoints.utils.file_or_path_exists", - side_effect=lambda _model, filename, revision=None: filename == "config.json", - ) - mocker.patch( - "vllm_omni.entrypoints.utils.get_hf_file_to_dict", - return_value={"model_type": "voxcpm"}, - ) - mocker.patch( - "vllm_omni.entrypoints.utils.current_omni_platform.get_default_stage_config_path", - return_value="vllm_omni/model_executor/stage_configs", - ) - - original_exists 
= os.path.exists - - def mock_exists(path): - if "voxcpm.yaml" in str(path): - return True - return original_exists(path) - - mocker.patch("os.path.exists", side_effect=mock_exists) - - result = resolve_model_config_path("OpenBMB/VoxCPM1.5") - - assert result is not None - assert "voxcpm.yaml" in result - - -class TestLoadAndResolveStageConfigs: - def test_load_and_resolve_with_kwargs(self): - """Ensure that dtype survives default stage creation.""" - kwargs = {"dtype": torch.float32} - config_path, stage_configs = load_and_resolve_stage_configs( - model="black-forest-labs/FLUX.2-klein-4B", - stage_configs_path=None, - kwargs=kwargs, - default_stage_cfg_factory=lambda: AsyncOmniEngine._create_default_diffusion_stage_cfg(kwargs), - ) - assert config_path is None - assert len(stage_configs) == 1 - assert "dtype" in stage_configs[0]["engine_args"] - - -class TestLoadStageConfigsFromYaml: - """Regression tests for stage-config loading and merging.""" - - def test_deep_merges_stage_engine_args(self, mocker: MockerFixture): - yaml_config = create_config( - { - "async_chunk": True, - "stage_args": [ - { - "stage_id": 0, - "runtime": {"device": 0}, - "engine_args": { - "parallel_config": {"tensor_parallel_size": 4}, - }, - } - ], - } - ) - mocker.patch( - "vllm_omni.entrypoints.utils.load_yaml_config", - return_value=yaml_config, - ) - - stages = load_stage_configs_from_yaml( - "fake.yaml", - base_engine_args={ - "parallel_config": { - "tensor_parallel_size": 1, - "pipeline_parallel_size": 2, - }, - "model": "base-model", - }, - ) - - merged_engine_args = stages[0]["engine_args"] - assert merged_engine_args["parallel_config"]["tensor_parallel_size"] == 4 - assert merged_engine_args["parallel_config"]["pipeline_parallel_size"] == 2 - assert merged_engine_args["model"] == "base-model" - assert merged_engine_args["async_chunk"] is True - - def test_merges_nested_stage_engine_args(self, mocker: MockerFixture): - yaml_config = create_config( - { - "stage_args": [ - { - 
"stage_id": 0, - "engine_args": { - "nested": {"override": 2}, - }, - } - ], - } - ) - mocker.patch( - "vllm_omni.entrypoints.utils.load_yaml_config", - return_value=yaml_config, - ) - - stages = load_stage_configs_from_yaml( - "fake.yaml", - base_engine_args={"nested": {"base": 1}}, - ) - - assert stages[0]["engine_args"]["nested"]["base"] == 1 - assert stages[0]["engine_args"]["nested"]["override"] == 2 - - -class TestCumulativeStreamingCoercion: - @pytest.mark.parametrize("skip_clone", [True, False]) - def test_cumulative_default_becomes_delta_if_stream(self, skip_clone): - """Ensure cumulative messages are coercible to delta if streaming.""" - sp = SamplingParams(output_kind=RequestOutputKind.CUMULATIVE) - sp.skip_clone = skip_clone - result = coerce_param_message_types([sp], is_streaming=True)[0] - assert isinstance(result, SamplingParams) - assert result.output_kind == RequestOutputKind.DELTA - assert (skip_clone and sp is result) or (not skip_clone and sp is not result) - - @pytest.mark.parametrize("skip_clone", [True, False]) - def test_cumulative_default_becomes_final_only_if_not_stream(self, skip_clone): - """Ensure cumulative messages are coercible to final only if not streaming.""" - sp = SamplingParams(output_kind=RequestOutputKind.CUMULATIVE) - sp.skip_clone = skip_clone - result = coerce_param_message_types([sp], is_streaming=False)[0] - assert isinstance(result, SamplingParams) - assert result.output_kind == RequestOutputKind.FINAL_ONLY - assert (skip_clone and sp is result) or (not skip_clone and sp is not result) - - @pytest.mark.parametrize("is_streaming", [True, False]) - @pytest.mark.parametrize("output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) - def test_non_cumulative_are_coerced(self, output_kind, is_streaming): - """Ensure non-cumulative params are coerced to the target type.""" - sp = SamplingParams(output_kind=output_kind) - expected = RequestOutputKind.DELTA if is_streaming else RequestOutputKind.FINAL_ONLY - result 
= coerce_param_message_types([sp], is_streaming=is_streaming)[0] - assert isinstance(result, SamplingParams) - assert result.output_kind == expected - - def test_coercion_applies_to_all_stages(self): - """Ensure all stages are coerced to DELTA for streaming.""" - sp0 = SamplingParams(output_kind=RequestOutputKind.CUMULATIVE) - sp1 = SamplingParams(output_kind=RequestOutputKind.CUMULATIVE) - result = coerce_param_message_types([sp0, sp1], is_streaming=True) - assert all([isinstance(r, SamplingParams) for r in result]) - assert result[0].output_kind == RequestOutputKind.DELTA - assert result[1].output_kind == RequestOutputKind.DELTA diff --git a/tests/examples/conftest.py b/tests/examples/conftest.py index 867731b21f9..137d15f163f 100644 --- a/tests/examples/conftest.py +++ b/tests/examples/conftest.py @@ -1,3 +1,353 @@ -"""Pytest fixtures for tests/examples.""" +""" +Shared fixtures, helpers, and path constants for tests/examples/. +""" -from tests.examples.helpers import example_runner # noqa: F401 +import json +import os +import re +import shlex +import subprocess +import sys +import tempfile +from collections import defaultdict +from collections.abc import Callable +from pathlib import Path +from typing import Any, NamedTuple, cast + +import pytest +import torch +from safetensors.torch import save_file + +# --------------------------------------------------------------------------- +# Path constants and fixtures +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXAMPLES = REPO_ROOT / "examples" + +# Use Python tempfile instead of pytest's tmp_path_factory because +# OUTPUT_DIR is needed in test collection time, but tmp_path_factory is only available in test running time. 
+# It is needed during test collection because extract_readme_snippets replaces LoRA path with a generated one under OUTPUT_DIR, +# and extract_readme_snippets is called at collection time to generate separate test cases for each README code block. +OUTPUT_DIR = ( + REPO_ROOT / prefix + if (prefix := os.environ.get("OUTPUT_DIR")) + else Path(tempfile.mkdtemp(prefix="vllm_omni_test_examples_")) +) + + +# --------------------------------------------------------------------------- +# Code snippet extraction and asset file helpers +# --------------------------------------------------------------------------- + +# parameters: language, code, h2_title +ReadmeSnippetExtractionSkipPredicate = Callable[[str, str, str], tuple[bool, str]] + + +class ReadmeSnippet(NamedTuple): + language: str + code: str + h2_title: str + index_in_section: int + output_file_path: Path | None = None + skip: tuple[bool, str] = (False, "") + + @property + def test_id(self) -> str: + return f"{ReadmeSnippet._slug(self.h2_title)}_{self.index_in_section:03d}" + + @staticmethod + def extract_readme_snippets( + readme_path: Path, + skipif: ReadmeSnippetExtractionSkipPredicate | None = None, + ) -> list["ReadmeSnippet"]: + import mistune + + markdown = mistune.create_markdown(renderer="ast") + tokens = markdown(readme_path.read_text(encoding="utf-8")) + tokens = cast(list[dict[str, Any]], tokens) # mistune's AST renderer always produces a list, not a str + + h2_title = "" + section_counts: defaultdict[str, int] = defaultdict(int) + snippets: list[ReadmeSnippet] = [] + + for token in tokens: + token_type = token.get("type") + + if token_type == "heading": + level = (token.get("attrs") or {}).get("level") + title = ReadmeSnippet._heading_text(token) + if level == 2: + h2_title = title + continue + + if token_type != "block_code": + continue + + try: + info = token.get("attrs").get("info") # type: ignore[reportOptionalMemberAccess] + language = info.strip().split()[0].lower() # type: 
ignore[reportOptionalMemberAccess] + + # Common shell aliases to "bash" in several markdown renderers. + if language in {"shell", "sh", "ksh", "zsh"}: + language = "bash" + + if language not in {"bash", "python"}: + continue + except AttributeError: + # The fence is missing explicit language info; skip it. + continue + + key = h2_title + section_counts[key] += 1 + code = token.get("raw", "") + output_file_path = None + if language == "bash": + argv = ReadmeSnippet._normalize_bash_command(code, Path(readme_path.parent)) + code = shlex.join(argv) + output_file_path = ReadmeSnippet._output_file_path_from_argv(argv) + if skipif is not None: + skip_config = skipif(language, code, h2_title) + else: + skip_config = (False, "") + snippet = ReadmeSnippet( + language=language, + code=code, + h2_title=h2_title, + index_in_section=section_counts[key], + output_file_path=output_file_path, + skip=skip_config, + ) + snippets.append(snippet) + + return snippets + + @staticmethod + def _normalize_bash_command(command: str, readme_dir: Path) -> list[str]: + line_joined_command = re.sub(r"\\\s*\n", " ", command).strip() + argv = shlex.split(line_joined_command, comments=True) + assert argv, "README bash fence produced an empty command" + + # Normalize python directory and example script location + if argv[0] in {"python", "python3"}: + argv[0] = sys.executable + if len(argv) > 1 and argv[1].endswith(".py"): + script_arg = argv[1] + script_path = Path(script_arg) + if script_path.is_absolute(): + resolved_script = script_path + else: + # Take the file name only, and append script_dir to its front + resolved_script = readme_dir / script_path.name + assert resolved_script.exists(), ( + f"README bash snippet references a script that does not exist: {script_arg} (resolved to {resolved_script})" + ) + argv[1] = str(resolved_script) + + # Normalize LoRA adapter path and ensure README LoRA assets exist. 
+ try: + lora_arg_idx = argv.index("--lora-path") # Raise ValueError if not found + assert len(argv) > lora_arg_idx + 1, "README bash snippet uses --lora-path without a following value" + + lora_dir = OUTPUT_DIR / "lora" + adapter_model = lora_dir / "adapter_model.safetensors" + adapter_config = lora_dir / "adapter_config.json" + if not adapter_model.exists() or not adapter_config.exists(): + write_zimage_lora(lora_dir, v_scale=8.0) + + argv[lora_arg_idx + 1] = str(lora_dir) + except ValueError: + pass + + return argv + + @staticmethod + def _output_file_path_from_argv(argv: list[str]) -> Path | None: + if "--output" not in argv: + return None + output_param_idx = argv.index("--output") + assert len(argv) > output_param_idx + 1, "README bash snippet uses --output without a following value" + output_arg = argv[output_param_idx + 1] + return Path(output_arg) + + @staticmethod + def _slug(text: str) -> str: + return "".join(ch.lower() if ch.isalnum() else "_" for ch in text).strip("_") + + @staticmethod + def _heading_text(token: dict) -> str: + return "".join(child.get("raw", "") for child in token.get("children", [])).strip() + + +# [TODO] Duplicate `_write_zimage_lora` in tests/e2e/online_serving/test_images_generations_lora.py. Combine these helpers and tests/e2e/offline_inference/test_diffusion_lora.py to test/utils later +def write_zimage_lora(adapter_dir: Path, *, q_scale: float = 0.0, k_scale: float = 0.0, v_scale: float = 0.0): + adapter_dir.mkdir(parents=True, exist_ok=True) + + # Z-Image transformer uses dim=3840 by default. + dim = 3840 + module_name = "transformer.layers.0.attention.to_qkv" + rank = 1 + + lora_a = torch.zeros((rank, dim), dtype=torch.float32) + lora_a[0, 0] = 1.0 + + # QKVParallelLinear packs (Q, K, V) => out dim is 3 * dim (tp=1). 
+ lora_b = torch.zeros((3 * dim, rank), dtype=torch.float32) + if q_scale: + lora_b[:dim, 0] = q_scale + if k_scale: + lora_b[dim : 2 * dim, 0] = k_scale + if v_scale: + lora_b[2 * dim :, 0] = v_scale + + save_file( + { + f"base_model.model.{module_name}.lora_A.weight": lora_a, + f"base_model.model.{module_name}.lora_B.weight": lora_b, + }, + str(adapter_dir / "adapter_model.safetensors"), + ) + (adapter_dir / "adapter_config.json").write_text( + json.dumps( + { + "r": rank, + "lora_alpha": rank, + "target_modules": [module_name], + } + ), + encoding="utf-8", + ) + + +# --------------------------------------------------------------------------- +# Code runner and subprocess helpers +# --------------------------------------------------------------------------- + + +class ExampleRunResult(NamedTuple): + run_dir: Path + assets: list[Path] + + +class ExampleRunner: + """Run extracted README snippets and return generated assets. + + The output materials are organized in a three-level directory structure: + - Set at init: `self.output_root` for all tests (from env OUTPUT_DIR) + - Set at `self.run(...)`: `output_subfolder` for a specific example page (e.g., `example_offline_t2i`) + - Generated by `extract_readme_snippets`: `snippet.test_id` for a specific code block (matching H2 titles, e.g., `basic_usage_001`) + """ + + IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp"} + + def __init__(self, output_root: Path) -> None: + self.output_root = output_root + + def run( + self, snippet: ReadmeSnippet, *, output_subfolder: Path = Path("."), env: dict[str, str] | None = None + ) -> ExampleRunResult: + run_dir = self.output_root / output_subfolder / snippet.test_id + run_dir.mkdir(parents=True, exist_ok=True) + + if snippet.language == "python": + assets = self._run_python_snippet(snippet, run_dir, env) + return ExampleRunResult(run_dir=run_dir, assets=assets) + + if snippet.language == "bash": + asset = self._run_bash_snippet(snippet, run_dir, env) + return 
ExampleRunResult(run_dir=run_dir, assets=[asset]) + + raise AssertionError(f"Unsupported snippet language: {snippet.language}") + + def _run_python_snippet( + self, snippet: ReadmeSnippet, run_dir: Path, env: dict[str, str] | None = None + ) -> list[Path]: + # Saving the script to a temporary file and `run_cmd` it. + # Not using `exec(snippet.code)` because the output is lost. + script_path = run_dir / "snippet.py" + script_path.write_text(snippet.code, encoding="utf-8") + + before = self._collect_images(run_dir) + run_cmd([sys.executable, str(script_path)], cwd=run_dir, env=env) + after = self._collect_images(run_dir) + + assets = sorted(after - before) + return assets + + def _run_bash_snippet(self, snippet: ReadmeSnippet, run_dir: Path, env: dict[str, str] | None = None) -> Path: + run_cmd(snippet.code, shell=True, cwd=run_dir, env=env) + + assert snippet.output_file_path is not None, ( + f"README bash snippet is missing --output argument: {snippet.test_id}. " + "The test script cannot guess the output file path." + ) + + # If the code snippet declares a relative path for the output file, append this path to the parent output collection directory. + # If the code snippet declares an absolute path (not likely but just in case), the return value resolution removes `run_dir`, also correctly pointing to this file. + return run_dir / snippet.output_file_path + + def _collect_images(self, root: Path) -> set[Path]: + return {path for path in root.rglob("*") if path.suffix.lower() in self.IMAGE_SUFFIXES} + + +@pytest.fixture +def example_runner() -> ExampleRunner: + return ExampleRunner(output_root=OUTPUT_DIR) + + +def run_cmd( + command: list[str] | str, + *, + shell: bool = False, + env: dict[str, str] | None = None, + cwd: Path | str | None = None, +) -> str: + """Run a command as a subprocess; assert zero exit code and return stdout. + + Output is fully captured and returned as a string so callers can parse it + (e.g. with :func:`extract_content_after_keyword`). 
+ Use this for scripts whose printed output is part of the test assertion. + """ + if env is not None: + env = {**os.environ.copy(), **env} + result = subprocess.run(command, capture_output=True, text=True, shell=shell, env=env, cwd=cwd) + + if result.returncode != 0: + print(f"STDERR: {result.stderr}") + raise subprocess.CalledProcessError(result.returncode, command) + + all_output = result.stdout + print(f"All output:\n{all_output}") + return all_output + + +# --------------------------------------------------------------------------- +# Output validation helpers +# --------------------------------------------------------------------------- + + +def extract_content_after_keyword(keywords: str, text: str) -> str: + """Return the text that follows *keywords* in *text* (regex match). + + Raises ``AssertionError`` if the keyword is not found, so test failures + produce a clear message pointing at the missing keyword. + """ + matches = re.findall(rf"{keywords}\s*(.+)", text, re.DOTALL) + + if not matches: + raise AssertionError(f"Keywords {keywords} not found in provided text output") + return matches[0] + + +def strip_trailing_audio_saved_line(text: str) -> str: + """Drop trailing ``Audio saved to ...`` lines from captured client stdout. + + ``openai_chat_completion_client_for_multimodal_generation.py`` may print + ``Chat completion output from text:`` for one choice and ``Audio saved to`` + for another; :func:`extract_content_after_keyword` uses ``re.DOTALL`` and + would otherwise keep the audio progress line inside the *text* segment. + """ + lines = text.splitlines() + while lines and lines[-1].strip().startswith("Audio saved to"): + lines.pop() + return "\n".join(lines).strip() diff --git a/tests/examples/helpers.py b/tests/examples/helpers.py deleted file mode 100644 index 137d15f163f..00000000000 --- a/tests/examples/helpers.py +++ /dev/null @@ -1,353 +0,0 @@ -""" -Shared fixtures, helpers, and path constants for tests/examples/. 
-""" - -import json -import os -import re -import shlex -import subprocess -import sys -import tempfile -from collections import defaultdict -from collections.abc import Callable -from pathlib import Path -from typing import Any, NamedTuple, cast - -import pytest -import torch -from safetensors.torch import save_file - -# --------------------------------------------------------------------------- -# Path constants and fixtures -# --------------------------------------------------------------------------- - -REPO_ROOT = Path(__file__).resolve().parents[2] -EXAMPLES = REPO_ROOT / "examples" - -# Use Python tempfile instead of pytest's tmp_path_factory because -# OUTPUT_DIR is needed in test collection time, but tmp_path_factory is only available in test running time. -# It is needed during test collection because extract_readme_snippets replaces LoRA path with a generated one under OUTPUT_DIR, -# and extract_readme_snippets is called at collection time to generate separate test cases for each README code block. 
-OUTPUT_DIR = ( - REPO_ROOT / prefix - if (prefix := os.environ.get("OUTPUT_DIR")) - else Path(tempfile.mkdtemp(prefix="vllm_omni_test_examples_")) -) - - -# --------------------------------------------------------------------------- -# Code snippet extraction and asset file helpers -# --------------------------------------------------------------------------- - -# parameters: language, code, h2_title -ReadmeSnippetExtractionSkipPredicate = Callable[[str, str, str], tuple[bool, str]] - - -class ReadmeSnippet(NamedTuple): - language: str - code: str - h2_title: str - index_in_section: int - output_file_path: Path | None = None - skip: tuple[bool, str] = (False, "") - - @property - def test_id(self) -> str: - return f"{ReadmeSnippet._slug(self.h2_title)}_{self.index_in_section:03d}" - - @staticmethod - def extract_readme_snippets( - readme_path: Path, - skipif: ReadmeSnippetExtractionSkipPredicate | None = None, - ) -> list["ReadmeSnippet"]: - import mistune - - markdown = mistune.create_markdown(renderer="ast") - tokens = markdown(readme_path.read_text(encoding="utf-8")) - tokens = cast(list[dict[str, Any]], tokens) # mistune's AST renderer always produces a list, not a str - - h2_title = "" - section_counts: defaultdict[str, int] = defaultdict(int) - snippets: list[ReadmeSnippet] = [] - - for token in tokens: - token_type = token.get("type") - - if token_type == "heading": - level = (token.get("attrs") or {}).get("level") - title = ReadmeSnippet._heading_text(token) - if level == 2: - h2_title = title - continue - - if token_type != "block_code": - continue - - try: - info = token.get("attrs").get("info") # type: ignore[reportOptionalMemberAccess] - language = info.strip().split()[0].lower() # type: ignore[reportOptionalMemberAccess] - - # Common shell aliases to "bash" in several markdown renderers. 
- if language in {"shell", "sh", "ksh", "zsh"}: - language = "bash" - - if language not in {"bash", "python"}: - continue - except AttributeError: - # The fence is missing explicit language info; skip it. - continue - - key = h2_title - section_counts[key] += 1 - code = token.get("raw", "") - output_file_path = None - if language == "bash": - argv = ReadmeSnippet._normalize_bash_command(code, Path(readme_path.parent)) - code = shlex.join(argv) - output_file_path = ReadmeSnippet._output_file_path_from_argv(argv) - if skipif is not None: - skip_config = skipif(language, code, h2_title) - else: - skip_config = (False, "") - snippet = ReadmeSnippet( - language=language, - code=code, - h2_title=h2_title, - index_in_section=section_counts[key], - output_file_path=output_file_path, - skip=skip_config, - ) - snippets.append(snippet) - - return snippets - - @staticmethod - def _normalize_bash_command(command: str, readme_dir: Path) -> list[str]: - line_joined_command = re.sub(r"\\\s*\n", " ", command).strip() - argv = shlex.split(line_joined_command, comments=True) - assert argv, "README bash fence produced an empty command" - - # Normalize python directory and example script location - if argv[0] in {"python", "python3"}: - argv[0] = sys.executable - if len(argv) > 1 and argv[1].endswith(".py"): - script_arg = argv[1] - script_path = Path(script_arg) - if script_path.is_absolute(): - resolved_script = script_path - else: - # Take the file name only, and append script_dir to its front - resolved_script = readme_dir / script_path.name - assert resolved_script.exists(), ( - f"README bash snippet references a script that does not exist: {script_arg} (resolved to {resolved_script})" - ) - argv[1] = str(resolved_script) - - # Normalize LoRA adapter path and ensure README LoRA assets exist. 
- try: - lora_arg_idx = argv.index("--lora-path") # Raise ValueError if not found - assert len(argv) > lora_arg_idx + 1, "README bash snippet uses --lora-path without a following value" - - lora_dir = OUTPUT_DIR / "lora" - adapter_model = lora_dir / "adapter_model.safetensors" - adapter_config = lora_dir / "adapter_config.json" - if not adapter_model.exists() or not adapter_config.exists(): - write_zimage_lora(lora_dir, v_scale=8.0) - - argv[lora_arg_idx + 1] = str(lora_dir) - except ValueError: - pass - - return argv - - @staticmethod - def _output_file_path_from_argv(argv: list[str]) -> Path | None: - if "--output" not in argv: - return None - output_param_idx = argv.index("--output") - assert len(argv) > output_param_idx + 1, "README bash snippet uses --output without a following value" - output_arg = argv[output_param_idx + 1] - return Path(output_arg) - - @staticmethod - def _slug(text: str) -> str: - return "".join(ch.lower() if ch.isalnum() else "_" for ch in text).strip("_") - - @staticmethod - def _heading_text(token: dict) -> str: - return "".join(child.get("raw", "") for child in token.get("children", [])).strip() - - -# [TODO] Duplicate `_write_zimage_lora` in tests/e2e/online_serving/test_images_generations_lora.py. Combine these helpers and tests/e2e/offline_inference/test_diffusion_lora.py to test/utils later -def write_zimage_lora(adapter_dir: Path, *, q_scale: float = 0.0, k_scale: float = 0.0, v_scale: float = 0.0): - adapter_dir.mkdir(parents=True, exist_ok=True) - - # Z-Image transformer uses dim=3840 by default. - dim = 3840 - module_name = "transformer.layers.0.attention.to_qkv" - rank = 1 - - lora_a = torch.zeros((rank, dim), dtype=torch.float32) - lora_a[0, 0] = 1.0 - - # QKVParallelLinear packs (Q, K, V) => out dim is 3 * dim (tp=1). 
- lora_b = torch.zeros((3 * dim, rank), dtype=torch.float32) - if q_scale: - lora_b[:dim, 0] = q_scale - if k_scale: - lora_b[dim : 2 * dim, 0] = k_scale - if v_scale: - lora_b[2 * dim :, 0] = v_scale - - save_file( - { - f"base_model.model.{module_name}.lora_A.weight": lora_a, - f"base_model.model.{module_name}.lora_B.weight": lora_b, - }, - str(adapter_dir / "adapter_model.safetensors"), - ) - (adapter_dir / "adapter_config.json").write_text( - json.dumps( - { - "r": rank, - "lora_alpha": rank, - "target_modules": [module_name], - } - ), - encoding="utf-8", - ) - - -# --------------------------------------------------------------------------- -# Code runner and subprocess helpers -# --------------------------------------------------------------------------- - - -class ExampleRunResult(NamedTuple): - run_dir: Path - assets: list[Path] - - -class ExampleRunner: - """Run extracted README snippets and return generated assets. - - The output materials are organized in a three-level directory structure: - - Set at init: `self.output_root` for all tests (from env OUTPUT_DIR) - - Set at `self.run(...)`: `output_subfolder` for a specific example page (e.g., `example_offline_t2i`) - - Generated by `extract_readme_snippets`: `snippet.test_id` for a specific code block (matching H2 titles, e.g., `basic_usage_001`) - """ - - IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp"} - - def __init__(self, output_root: Path) -> None: - self.output_root = output_root - - def run( - self, snippet: ReadmeSnippet, *, output_subfolder: Path = Path("."), env: dict[str, str] | None = None - ) -> ExampleRunResult: - run_dir = self.output_root / output_subfolder / snippet.test_id - run_dir.mkdir(parents=True, exist_ok=True) - - if snippet.language == "python": - assets = self._run_python_snippet(snippet, run_dir, env) - return ExampleRunResult(run_dir=run_dir, assets=assets) - - if snippet.language == "bash": - asset = self._run_bash_snippet(snippet, run_dir, env) - return 
ExampleRunResult(run_dir=run_dir, assets=[asset]) - - raise AssertionError(f"Unsupported snippet language: {snippet.language}") - - def _run_python_snippet( - self, snippet: ReadmeSnippet, run_dir: Path, env: dict[str, str] | None = None - ) -> list[Path]: - # Saving the script to a temporary file and `run_cmd` it. - # Not using `exec(snippet.code)` because the output is lost. - script_path = run_dir / "snippet.py" - script_path.write_text(snippet.code, encoding="utf-8") - - before = self._collect_images(run_dir) - run_cmd([sys.executable, str(script_path)], cwd=run_dir, env=env) - after = self._collect_images(run_dir) - - assets = sorted(after - before) - return assets - - def _run_bash_snippet(self, snippet: ReadmeSnippet, run_dir: Path, env: dict[str, str] | None = None) -> Path: - run_cmd(snippet.code, shell=True, cwd=run_dir, env=env) - - assert snippet.output_file_path is not None, ( - f"README bash snippet is missing --output argument: {snippet.test_id}. " - "The test script cannot guess the output file path." - ) - - # If the code snippet declares a relative path for the output file, append this path to the parent output collection directory. - # If the code snippet declares an absolute path (not likely but just in case), the return value resolution removes `run_dir`, also correctly pointing to this file. - return run_dir / snippet.output_file_path - - def _collect_images(self, root: Path) -> set[Path]: - return {path for path in root.rglob("*") if path.suffix.lower() in self.IMAGE_SUFFIXES} - - -@pytest.fixture -def example_runner() -> ExampleRunner: - return ExampleRunner(output_root=OUTPUT_DIR) - - -def run_cmd( - command: list[str] | str, - *, - shell: bool = False, - env: dict[str, str] | None = None, - cwd: Path | str | None = None, -) -> str: - """Run a command as a subprocess; assert zero exit code and return stdout. - - Output is fully captured and returned as a string so callers can parse it - (e.g. with :func:`extract_content_after_keyword`). 
- Use this for scripts whose printed output is part of the test assertion. - """ - if env is not None: - env = {**os.environ.copy(), **env} - result = subprocess.run(command, capture_output=True, text=True, shell=shell, env=env, cwd=cwd) - - if result.returncode != 0: - print(f"STDERR: {result.stderr}") - raise subprocess.CalledProcessError(result.returncode, command) - - all_output = result.stdout - print(f"All output:\n{all_output}") - return all_output - - -# --------------------------------------------------------------------------- -# Output validation helpers -# --------------------------------------------------------------------------- - - -def extract_content_after_keyword(keywords: str, text: str) -> str: - """Return the text that follows *keywords* in *text* (regex match). - - Raises ``AssertionError`` if the keyword is not found, so test failures - produce a clear message pointing at the missing keyword. - """ - matches = re.findall(rf"{keywords}\s*(.+)", text, re.DOTALL) - - if not matches: - raise AssertionError(f"Keywords {keywords} not found in provided text output") - return matches[0] - - -def strip_trailing_audio_saved_line(text: str) -> str: - """Drop trailing ``Audio saved to ...`` lines from captured client stdout. - - ``openai_chat_completion_client_for_multimodal_generation.py`` may print - ``Chat completion output from text:`` for one choice and ``Audio saved to`` - for another; :func:`extract_content_after_keyword` uses ``re.DOTALL`` and - would otherwise keep the audio progress line inside the *text* segment. 
- """ - lines = text.splitlines() - while lines and lines[-1].strip().startswith("Audio saved to"): - lines.pop() - return "\n".join(lines).strip() diff --git a/tests/examples/offline_inference/test_text_to_image.py b/tests/examples/offline_inference/test_text_to_image.py index 041c32dc4ef..a08d16f1614 100644 --- a/tests/examples/offline_inference/test_text_to_image.py +++ b/tests/examples/offline_inference/test_text_to_image.py @@ -7,12 +7,11 @@ import pytest -from tests.examples.helpers import EXAMPLES, ExampleRunner, ReadmeSnippet -from tests.helpers.assertions import assert_image_valid -from tests.helpers.mark import hardware_marks - -pytestmark = [pytest.mark.full_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})] +from tests.conftest import assert_image_valid +from tests.examples.conftest import EXAMPLES, ExampleRunner, ReadmeSnippet +from tests.utils import hardware_marks +pytestmark = [pytest.mark.advanced_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})] T2I_SCRIPT = EXAMPLES / "offline_inference" / "text_to_image" / "text_to_image.py" README_PATH = T2I_SCRIPT.with_name("README.md") diff --git a/tests/examples/online_serving/test_qwen2_5_omni.py b/tests/examples/online_serving/test_qwen2_5_omni.py index b3e49b8d9ad..a78ccf5924a 100644 --- a/tests/examples/online_serving/test_qwen2_5_omni.py +++ b/tests/examples/online_serving/test_qwen2_5_omni.py @@ -4,29 +4,34 @@ """ import os + +from vllm_omni.platforms import current_omni_platform + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + from pathlib import Path import pytest -from tests.examples.helpers import ( +from tests.conftest import OmniServerParams, convert_audio_file_to_text, cosine_similarity_text +from tests.examples.conftest import ( extract_content_after_keyword, run_cmd, strip_trailing_audio_saved_line, ) -from tests.helpers.mark import hardware_test -from tests.helpers.media import convert_audio_file_to_text, cosine_similarity_text -from 
tests.helpers.runtime import OmniServerParams -from tests.helpers.stage_config import get_deploy_config_path +from tests.utils import hardware_test -pytestmark = [pytest.mark.full_model, pytest.mark.example, pytest.mark.omni] - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +pytestmark = [pytest.mark.advanced_model, pytest.mark.example] models = ["Qwen/Qwen2.5-Omni-7B"] -# Single CI deploy YAML; rocm/xpu deltas are picked automatically via the -# platforms: section in vllm_omni/deploy/ci/qwen2_5_omni.yaml. -stage_configs = [get_deploy_config_path("ci/qwen2_5_omni.yaml")] + +stage_configs = [str(Path(__file__).parent.parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")] + +if current_omni_platform.is_xpu(): + stage_configs = [ + str(Path(__file__).parent.parent.parent / "e2e" / "stage_configs" / "xpu" / "qwen2_5_omni_ci.yaml") + ] example_dir = str(Path(__file__).parent.parent.parent.parent / "examples" / "online_serving") # Create parameter combinations for model and stage config @@ -39,6 +44,8 @@ common_args = ["python", os.path.join(example_dir, "openai_chat_completion_client_for_multimodal_generation.py")] +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_send_multimodal_request_001(omni_server) -> None: @@ -74,6 +81,8 @@ def test_send_multimodal_request_001(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_send_multimodal_request_002(omni_server) -> None: @@ -109,6 +118,8 @@ def test_send_multimodal_request_002(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. 
+@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_send_multimodal_request_003(omni_server) -> None: @@ -134,6 +145,8 @@ def test_send_multimodal_request_003(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_modality_control_001(omni_server) -> None: @@ -162,6 +175,8 @@ def test_modality_control_001(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_modality_control_002(omni_server) -> None: @@ -189,6 +204,8 @@ def test_modality_control_002(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_modality_control_003(omni_server) -> None: @@ -225,6 +242,8 @@ def test_modality_control_003(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. 
+@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_stream_001(omni_server) -> None: diff --git a/tests/examples/online_serving/test_qwen3_omni.py b/tests/examples/online_serving/test_qwen3_omni.py index e52a2bf5a67..65f99d7bf28 100644 --- a/tests/examples/online_serving/test_qwen3_omni.py +++ b/tests/examples/online_serving/test_qwen3_omni.py @@ -4,28 +4,32 @@ """ import os + +from vllm_omni.platforms import current_omni_platform + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + from pathlib import Path import pytest -from tests.examples.helpers import ( +from tests.conftest import OmniServerParams, convert_audio_file_to_text, cosine_similarity_text +from tests.examples.conftest import ( extract_content_after_keyword, run_cmd, strip_trailing_audio_saved_line, ) -from tests.helpers.mark import hardware_test -from tests.helpers.media import convert_audio_file_to_text, cosine_similarity_text -from tests.helpers.runtime import OmniServerParams -from tests.helpers.stage_config import get_deploy_config_path +from tests.utils import hardware_test -pytestmark = [pytest.mark.full_model, pytest.mark.example, pytest.mark.omni] - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +pytestmark = [pytest.mark.advanced_model, pytest.mark.example] models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] -stage_configs = [get_deploy_config_path("ci/qwen3_omni_moe.yaml")] +stage_configs = [str(Path(__file__).parent.parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")] + +if current_omni_platform.is_xpu(): + stage_configs = [str(Path(__file__).parent.parent.parent / "e2e" / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml")] example_dir = str(Path(__file__).parent.parent.parent.parent / "examples" / "online_serving") @@ -38,6 +42,8 @@ common_args = ["python", os.path.join(example_dir, 
"openai_chat_completion_client_for_multimodal_generation.py")] +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_send_multimodal_request_001(omni_server) -> None: @@ -66,6 +72,8 @@ def test_send_multimodal_request_001(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_send_multimodal_request_002(omni_server) -> None: @@ -97,6 +105,8 @@ def test_send_multimodal_request_002(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_send_multimodal_request_003(omni_server) -> None: @@ -112,6 +122,8 @@ def test_send_multimodal_request_003(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_modality_control_001(omni_server) -> None: @@ -134,6 +146,8 @@ def test_modality_control_001(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_modality_control_002(omni_server) -> None: @@ -156,6 +170,8 @@ def test_modality_control_002(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. 
+@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_modality_control_003(omni_server) -> None: @@ -186,6 +202,8 @@ def test_modality_control_003(omni_server) -> None: # TODO: Verify the E2E latency after confirmation baseline. +@pytest.mark.advanced_model +@pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_stream_001(omni_server) -> None: diff --git a/tests/examples/online_serving/test_text_to_image.py b/tests/examples/online_serving/test_text_to_image.py index ee0a1fedba7..51b7ff61bc9 100644 --- a/tests/examples/online_serving/test_text_to_image.py +++ b/tests/examples/online_serving/test_text_to_image.py @@ -13,12 +13,11 @@ import pytest -from tests.examples.helpers import EXAMPLES, OUTPUT_DIR, run_cmd, write_zimage_lora -from tests.helpers.assertions import assert_image_valid -from tests.helpers.mark import hardware_marks -from tests.helpers.runtime import OmniServer, OmniServerParams +from tests.conftest import OmniServer, OmniServerParams, assert_image_valid +from tests.examples.conftest import EXAMPLES, OUTPUT_DIR, run_cmd, write_zimage_lora +from tests.utils import hardware_marks -pytestmark = [pytest.mark.full_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})] +pytestmark = [pytest.mark.advanced_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})] T2I_ONLINE_CLIENT = EXAMPLES / "online_serving" / "text_to_image" / "openai_chat_client.py" EXAMPLE_OUTPUT_SUBFOLDER = "example_online_t2i" diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py deleted file mode 100644 index a3348b07fe0..00000000000 --- a/tests/helpers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Shared, importable test helper utilities. 
- -Submodules (``assertions``, ``env``, ``media``, ``runtime``, …) are imported -explicitly by callers. Avoid star-importing everything here: that ran before -refactor only inside the old monolithic ``conftest``; a greedy ``__init__`` -changes import order and can affect in-process Omni (``OmniRunner`` / offline -e2e) vs subprocess-based ``OmniServer`` tests. -""" diff --git a/tests/helpers/assertions.py b/tests/helpers/assertions.py deleted file mode 100644 index 604b76b62ec..00000000000 --- a/tests/helpers/assertions.py +++ /dev/null @@ -1,522 +0,0 @@ -"""Assertion and response validation helpers for tests.""" - -import io -import tempfile -import threading -from io import BytesIO -from pathlib import Path -from typing import Any - -import numpy as np -import soundfile as sf -from PIL import Image - -from tests.helpers.media import ( - cosine_similarity_text, -) - -_GENDER_PIPELINE = None -_GENDER_PIPELINE_LOCK = threading.Lock() -_PCM_SPEECH_SAMPLE_RATE_HZ = 24_000 -_MIN_PCM_SPEECH_HNR_DB = 1.0 -_PRESET_VOICE_GENDER_MAP: dict[str, str] = { - "serena": "female", - "uncle_fu": "male", - "chelsie": "female", - "clone": "female", - "ethan": "male", -} - - -def assert_image_diffusion_response( - response, - request_config: dict[str, Any], - run_level: str = None, -) -> None: - """ - Validate image diffusion response. - - Expected request_config schema: - { - "request_type": "image", - "extra_body": { - "num_outputs_per_prompt": 1, - "width": ..., - "height": ..., - ... 
- } - } - """ - assert response.images is not None, "Image response is None" - assert len(response.images) > 0, "No images in response" - - extra_body = request_config.get("extra_body") or {} - - num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt") - if num_outputs_per_prompt is not None: - assert len(response.images) == num_outputs_per_prompt, ( - f"Expected {num_outputs_per_prompt} images, got {len(response.images)}" - ) - - if run_level in {"advanced_model", "full_model"}: - width = extra_body.get("width") - height = extra_body.get("height") - - if width is not None or height is not None: - for img in response.images: - assert_image_valid(img, width=width, height=height) - - -def assert_video_diffusion_response( - response, - request_config: dict[str, Any], - run_level: str = None, -) -> None: - """ - Validate video diffusion response. - - Expected request_config schema: - { - "request_type": "video", - "form_data": { - "prompt": "...", - "num_frames": ..., - "width": ..., - "height": ..., - "fps": ..., - ... - } - } - """ - form_data = request_config.get("form_data", {}) - - assert response.videos is not None, "Video response is None" - assert len(response.videos) > 0, "No videos in response" - - expected_frames = _maybe_int(form_data.get("num_frames")) - expected_width = _maybe_int(form_data.get("width")) - expected_height = _maybe_int(form_data.get("height")) - expected_fps = _maybe_int(form_data.get("fps")) - - for vid_bytes in response.videos: - assert_video_valid( - vid_bytes, - num_frames=expected_frames, - width=expected_width, - height=expected_height, - fps=expected_fps, - ) - - -def assert_audio_diffusion_response( - response, - request_config: dict[str, Any], - run_level: str = None, -) -> None: - """ - Validate audio diffusion response. 
- """ - raise NotImplementedError("Audio validation is not implemented yet") - - -def _maybe_int(value: Any) -> int | None: - if value is None: - return None - return int(value) - - -def assert_image_valid(image: Path | Image.Image, *, width: int | None = None, height: int | None = None): - """Assert the file is a loadable image with optional exact dimensions.""" - if isinstance(image, Path): - assert image.exists(), f"Image not found: {image}" - image = Image.open(image) - image.load() - assert image.width > 0 and image.height > 0 - if width is not None: - assert image.width == width, f"Expected width={width}, got {image.width}" - if height is not None: - assert image.height == height, f"Expected height={height}, got {image.height}" - return image - - -def assert_video_valid( - video: Path | bytes | BytesIO, - *, - num_frames: int | None = None, - width: int | None = None, - height: int | None = None, - fps: float | None = None, -) -> dict[str, int | float]: - """Assert the MP4 has the expected resolution and frame count. - - For several diffusion backends, encoded MP4 frame count follows a codec-aligned - convention (e.g. request `num_frames=8` can produce 9 encoded frames). Keep - this compatibility behavior to avoid false negatives in online-serving tests. 
- """ - temp_path = None - cap = None - try: - import cv2 - - if isinstance(video, Path): - if not video.exists(): - raise AssertionError(f"Video file not found: {video}") - video_path = str(video) - else: - suffix = ".mp4" - with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, mode="wb") as tmp: - if isinstance(video, bytes): - tmp.write(video) - elif isinstance(video, BytesIO): - tmp.write(video.getvalue()) - else: - raise TypeError(f"Unsupported video type: {type(video)}") - temp_path = Path(tmp.name) - video_path = str(temp_path) - - cap = cv2.VideoCapture(video_path) - if not cap.isOpened(): - raise AssertionError("Failed to open video capture") - - actual_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - actual_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - actual_fps = float(cap.get(cv2.CAP_PROP_FPS)) - actual_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - - if width is not None: - assert actual_width == width, f"Expected width={width}, got {actual_width}" - if height is not None: - assert actual_height == height, f"Expected height={height}, got {actual_height}" - if fps is not None and actual_fps: - assert abs(actual_fps - float(fps)) < 1.0, f"Expected fps~={fps}, got {actual_fps}" - if num_frames is not None: - expected_frames = (int(num_frames) // 4) * 4 + 1 - assert actual_frames == expected_frames, f"Expected frames={expected_frames}, got {actual_frames}" - - return { - "width": actual_width, - "height": actual_height, - "fps": actual_fps, - "num_frames": actual_frames, - } - except Exception as e: - print(f"ERROR: {type(e).__name__}: {e}", flush=True) - raise - finally: - if cap is not None: - cap.release() - if temp_path and temp_path.exists(): - try: - temp_path.unlink() - except OSError: - pass - - -def assert_audio_valid( - audio_or_path: Path | np.ndarray, - *, - sample_rate: int, - channels: int, - duration_s: float, -) -> None: - """Assert WAV file or (batch, channels, samples) ndarray matches expected audio format.""" - 
expected_samples = int(duration_s * sample_rate) - if isinstance(audio_or_path, np.ndarray): - audio = audio_or_path - assert audio.ndim == 3, f"Expected audio ndim=3 (batch, channels, samples), got shape {audio.shape}" - assert audio.shape[0] == 1, f"Expected batch size 1, got {audio.shape[0]}" - assert audio.shape[1] == channels, f"Expected {channels} channels, got {audio.shape[1]}" - assert audio.shape[2] == expected_samples, ( - f"Expected {expected_samples} samples ({duration_s}s @ {sample_rate} Hz), got {audio.shape[2]}" - ) - return - - path = audio_or_path - assert path.exists(), f"Audio not found: {path}" - info = sf.info(str(path)) - assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}" - assert info.channels == channels, f"Expected {channels} channel(s), got {info.channels}" - assert info.frames == expected_samples, ( - f"Expected {expected_samples} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" - ) - - -def _load_gender_pipeline(): - global _GENDER_PIPELINE - if _GENDER_PIPELINE is not None: - return _GENDER_PIPELINE - model_name = "7wolf/wav2vec2-base-gender-classification" - try: - from transformers import pipeline - - _GENDER_PIPELINE = pipeline(task="audio-classification", model=model_name, device=-1) - return _GENDER_PIPELINE - except Exception as exc: # pragma: no cover - print(f"Warning: failed to create gender pipeline '{model_name}': {exc}") - _GENDER_PIPELINE = None - return None - - -def _median_pitch_hz_from_autocorr(mono: np.ndarray, sr: int) -> float | None: - x = np.asarray(mono, dtype=np.float64) - x = x - np.mean(x) - if x.size < int(0.15 * sr): - return None - frame_len = int(0.04 * sr) - hop = max(frame_len // 2, 1) - f0_min_hz, f0_max_hz = 70.0, 400.0 - lag_min = max(1, int(sr / f0_max_hz)) - lag_max = min(frame_len - 2, int(sr / f0_min_hz)) - if lag_max <= lag_min: - return None - win = np.hamming(frame_len) - pitches: list[float] = [] - for start in range(0, 
int(x.shape[0]) - frame_len, hop): - frame = x[start : start + frame_len] * win - frame = frame - np.mean(frame) - if float(np.sqrt(np.mean(frame**2))) < 1e-4: - continue - ac = np.correlate(frame, frame, mode="full")[frame_len - 1 :] - ac = ac / (float(ac[0]) + 1e-12) - region = ac[lag_min : lag_max + 1] - peak_rel = int(np.argmax(region)) - peak_lag = peak_rel + lag_min - if peak_lag <= 0: - continue - f0 = float(sr) / float(peak_lag) - if f0_min_hz <= f0 <= f0_max_hz: - pitches.append(f0) - if len(pitches) < 4: - return None - return float(np.median(np.asarray(pitches, dtype=np.float64))) - - -def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str: - data, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=True) - if data.size == 0: - raise ValueError("Empty audio") - mono = np.mean(data, axis=1) - try: - target_sr = 16000 - if int(sr) != target_sr and mono.size > 1: - src_len = int(mono.shape[0]) - dst_len = max(1, int(round(src_len * float(target_sr) / float(sr)))) - src_idx = np.arange(src_len, dtype=np.float32) - dst_idx = np.linspace(0, src_len - 1, dst_len, dtype=np.float32) - mono = np.interp(dst_idx, src_idx, mono.astype(np.float32, copy=False)).astype(np.float32) - sr = target_sr - - median_f0 = _median_pitch_hz_from_autocorr(mono, sr) - clf = _load_gender_pipeline() - if clf is None: - print("gender model not available, returning 'unknown'") - return "unknown" - with _GENDER_PIPELINE_LOCK: - outputs = clf(mono, sampling_rate=sr) - if not outputs: - return "unknown" - top = outputs[0] - label = str(top.get("label", "")).lower() - conf = float(top.get("score", 0.0)) - if conf < 0.5: - gender = "unknown" - elif ("female" in label) or ("жен" in label): - gender = "female" - elif ("male" in label) or ("муж" in label): - gender = "male" - else: - gender = "unknown" - - if gender == "female" and median_f0 is not None and median_f0 < 165.0 and conf < 0.88: - print(f"gender pitch assist: reclassifying female->male 
(median_f0={median_f0:.1f} Hz, conf={conf:.3f})") - gender = "male" - elif gender == "male" and median_f0 is not None and median_f0 > 230.0 and conf < 0.88: - print(f"gender pitch assist: reclassifying male->female (median_f0={median_f0:.1f} Hz, conf={conf:.3f})") - gender = "female" - print( - f"gender classifier: label={label}, conf={conf:.3f}, gender={gender}" - + (f", median_f0={median_f0:.1f}Hz" if median_f0 is not None else "") - ) - return gender - except Exception as exc: # pragma: no cover - print(f"Warning: gender classification failed, returning 'unknown': {exc}") - return "unknown" - - -def _assert_preset_voice_gender_from_audio(audio_bytes: bytes | None, voice_name: str | None) -> None: - """If ``voice_name`` matches a known preset, assert classifier gender matches (skip when unknown).""" - if not voice_name or not audio_bytes: - return - key = str(voice_name).lower() - expected_gender = _PRESET_VOICE_GENDER_MAP.get(key) - if expected_gender is None: - return - estimated_gender = _estimate_voice_gender_from_audio(audio_bytes) - print(f"Preset voice gender check: preset={key!r}, estimated={estimated_gender!r}, expected={expected_gender!r}") - if estimated_gender != "unknown": - assert estimated_gender == expected_gender, ( - f"{voice_name!r} is expected {expected_gender}, but estimated gender is {estimated_gender!r}" - ) - - -def _compute_pcm_hnr_db(pcm_samples: np.ndarray, sr: int = _PCM_SPEECH_SAMPLE_RATE_HZ) -> float: - frame_len = int(0.03 * sr) - hop = frame_len // 2 - hnr_values: list[float] = [] - for start in range(0, len(pcm_samples) - frame_len, hop): - frame = pcm_samples[start : start + frame_len].astype(np.float32, copy=False) - frame = frame - np.mean(frame) - if np.max(np.abs(frame)) < 0.01: - continue - ac = np.correlate(frame, frame, mode="full")[len(frame) - 1 :] - ac = ac / (ac[0] + 1e-10) - min_lag = int(sr / 400) - max_lag = min(int(sr / 80), len(ac)) - if min_lag >= max_lag: - continue - peak = float(np.max(ac[min_lag:max_lag])) - 
if 0 < peak < 1: - hnr_values.append(10 * np.log10(peak / (1 - peak + 1e-10))) - return float(np.mean(hnr_values)) if hnr_values else 0.0 - - -def _assert_pcm_int16_speech_hnr(audio_bytes: bytes) -> None: - assert audio_bytes is not None and len(audio_bytes) >= 2, "missing PCM bytes" - assert len(audio_bytes) % 2 == 0, "PCM byte length must be aligned to int16" - pcm_samples = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 - hnr = _compute_pcm_hnr_db(pcm_samples) - print(f"PCM speech HNR: {hnr:.2f} dB (threshold: {_MIN_PCM_SPEECH_HNR_DB} dB)") - assert hnr >= _MIN_PCM_SPEECH_HNR_DB, ( - f"Audio distortion detected: HNR={hnr:.2f} dB < {_MIN_PCM_SPEECH_HNR_DB} dB. " - "Voice clone decoder may be losing ref_code speaker context on later chunks." - ) - - -def assert_omni_response(response: Any, request_config: dict[str, Any], run_level): - """ - Validate response results. - - Args: - response: OmniResponse object - - Raises: - AssertionError: When the response does not meet validation criteria - """ - assert response.success, "The request failed." 
- e2e_latency = response.e2e_latency - if e2e_latency is not None: - print(f"the e2e latency is: {e2e_latency}") - - modalities = request_config.get("modalities", ["text", "audio"]) - - if run_level in {"advanced_model", "full_model"}: - # Verify output success - if "audio" in modalities: - assert response.audio_content is not None, "No audio output is generated" - print(f"audio content is: {response.audio_content}") - speaker = request_config.get("speaker") - if speaker: - _assert_preset_voice_gender_from_audio( - response.audio_bytes, - speaker, - ) - if "text" in modalities: - assert response.text_content is not None, "No text output is generated" - print(f"text content is: {response.text_content}") - - # Verify keywords in output - word_types = ["text", "image", "audio", "video"] - keywords_dict = request_config.get("key_words", {}) - for word_type in word_types: - keywords = keywords_dict.get(word_type) - if "text" in modalities: - if keywords: - text_lower = response.text_content.lower() - assert any(str(kw).lower() in text_lower for kw in keywords), ( - "The output does not contain any of the keywords." - ) - else: - if keywords: - audio_lower = response.audio_content.lower() - assert any(str(kw).lower() in audio_lower for kw in keywords), ( - "The output does not contain any of the keywords." 
- ) - - # Verify similarity (Whisper transcript vs streamed/detokenized text) - if "audio" in modalities: - audio_ref_text = request_config.get("audio_ref_text") - if "text" in modalities: - transcript = (response.audio_content or "").strip() - text_output = (response.text_content or "").strip() - similarity = cosine_similarity_text( - transcript.lower(), - text_output.lower(), - ) - assert similarity > 0.9, "The audio content is not same as the text" - print(f"similarity is: {similarity}") - if audio_ref_text: - audio_similarity = cosine_similarity_text( - response.audio_content.lower(), - str(audio_ref_text).lower(), - ) - assert audio_similarity > 0.9, ( - f"The audio content does not match reference text: similarity={audio_similarity:.3f}" - ) - - -def assert_audio_speech_response(response: Any, request_config: dict[str, Any], run_level: str) -> None: - assert response.success, "The request failed." - e2e_latency = getattr(response, "e2e_latency", None) - if e2e_latency is not None: - print(f"the avg e2e latency is: {e2e_latency}") - - req_fmt = request_config.get("response_format") - if req_fmt == "pcm" and response.audio_bytes: - _assert_pcm_int16_speech_hnr(response.audio_bytes) - if response.audio_format: - assert "pcm" in response.audio_format.lower(), ( - f"Expected audio/pcm content-type, got {response.audio_format!r}" - ) - elif req_fmt == "wav" and response.audio_format: - assert req_fmt in response.audio_format - - if run_level in {"advanced_model", "full_model"} and req_fmt != "pcm": - expected_text = request_config.get("input") - if expected_text: - transcript = (response.audio_content or "").strip() - print(f"audio content is: {transcript}") - print(f"input text is: {expected_text}") - similarity = cosine_similarity_text(transcript.lower(), expected_text.lower()) - print(f"Cosine similarity: {similarity:.3f}") - assert similarity > 0.9, ( - f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'" - ) - 
_assert_preset_voice_gender_from_audio(response.audio_bytes, request_config.get("voice")) - - -def assert_diffusion_response(response: Any, request_config: dict[str, Any], run_level: str = None): - assert response.success, "The request failed." - e2e_latency = getattr(response, "e2e_latency", None) - if e2e_latency is not None: - print(f"the avg e2e is: {e2e_latency}") - has_any_content = any(content is not None for content in (response.images, response.videos, response.audios)) - assert has_any_content, "Response contains no images, videos, or audios" - if response.images is not None: - assert_image_diffusion_response(response=response, request_config=request_config, run_level=run_level) - if response.videos is not None: - assert_video_diffusion_response(response=response, request_config=request_config, run_level=run_level) - if response.audios is not None: - assert_audio_diffusion_response(response=response, request_config=request_config, run_level=run_level) - - -__all__ = [ - "assert_audio_diffusion_response", - "assert_audio_speech_response", - "assert_diffusion_response", - "assert_image_diffusion_response", - "assert_image_valid", - "assert_omni_response", - "assert_video_diffusion_response", - "assert_video_valid", - "assert_audio_valid", -] diff --git a/tests/helpers/env.py b/tests/helpers/env.py deleted file mode 100644 index 22ec9a78626..00000000000 --- a/tests/helpers/env.py +++ /dev/null @@ -1,280 +0,0 @@ -"""Test environment / lifecycle helpers (GPU cleanup hooks and memory monitoring for tests). 
- -``vllm.platforms`` / ``vllm_omni.platforms`` are imported only inside functions that need them -so importing this module at pytest plugin load does not run before session autouse fixtures -""" - -from __future__ import annotations - -import gc -import os -import subprocess -import threading -import time -from contextlib import contextmanager - -import torch - - -def run_forced_gpu_cleanup_round() -> None: - run_pre_test_cleanup(enable_force=True) - run_post_test_cleanup(enable_force=True) - - -def get_physical_device_indices(devices): - visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES") - if visible_devices is None: - return devices - visible_indices = [int(x) for x in visible_devices.split(",")] - index_mapping = {i: physical for i, physical in enumerate(visible_indices)} - return [index_mapping[i] for i in devices if i in index_mapping] - - -def wait_for_gpu_memory_to_clear( - *, - devices: list[int], - threshold_bytes: int | None = None, - threshold_ratio: float | None = None, - timeout_s: float = 120, -) -> None: - from vllm.platforms import current_platform - - assert threshold_bytes is not None or threshold_ratio is not None - devices = get_physical_device_indices(devices) - start_time = time.time() - - device_list = ", ".join(str(d) for d in devices) - if threshold_bytes is not None: - threshold_str = f"{threshold_bytes / 2**30:.2f} GiB" - condition_str = f"Memory usage ≤ {threshold_str}" - else: - threshold_percent = threshold_ratio * 100 - threshold_str = f"{threshold_percent:.1f}%" - condition_str = f"Memory usage ratio ≤ {threshold_str}" - - print(f"[GPU Memory Monitor] Waiting for GPU {device_list} to free memory, Condition: {condition_str}") - - if threshold_bytes is not None: - - def is_free(used, total): - return used <= threshold_bytes / 2**30 - else: - - def is_free(used, total): - return used / total <= threshold_ratio - - @contextmanager - def nvml_scope(): - if current_platform.is_rocm(): - from amdsmi import amdsmi_init, 
amdsmi_shut_down - - amdsmi_init() - try: - yield - finally: - amdsmi_shut_down() - elif current_platform.is_cuda(): - from vllm.third_party.pynvml import nvmlInit, nvmlShutdown - - nvmlInit() - try: - yield - finally: - nvmlShutdown() - else: - yield - - is_rocm = current_platform.is_rocm() - - with nvml_scope(): - if is_rocm: - from amdsmi import amdsmi_get_gpu_vram_usage, amdsmi_get_processor_handles - elif current_platform.is_cuda(): - from vllm.third_party.pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo - - while True: - output: dict[int, str] = {} - output_raw: dict[int, tuple[float, float]] = {} - for device in devices: - if is_rocm: - dev_handle = amdsmi_get_processor_handles()[device] - mem_info = amdsmi_get_gpu_vram_usage(dev_handle) - gb_used = mem_info["vram_used"] / 2**10 - gb_total = mem_info["vram_total"] / 2**10 - else: - dev_handle = nvmlDeviceGetHandleByIndex(device) - mem_info = nvmlDeviceGetMemoryInfo(dev_handle) - gb_used = mem_info.used / 2**30 - gb_total = mem_info.total / 2**30 - output_raw[device] = (gb_used, gb_total) - usage_percent = (gb_used / gb_total) * 100 if gb_total > 0 else 0 - output[device] = f"{gb_used:.1f}GiB/{gb_total:.1f}GiB ({usage_percent:.1f}%)" - - print("[GPU Memory Status] Current usage:") - for device_id, mem_info in output.items(): - print(f" GPU {device_id}: {mem_info}") - - dur_s = time.time() - start_time - elapsed_minutes = dur_s / 60 - if all(is_free(used, total) for used, total in output_raw.values()): - print(f"[GPU Memory Freed] Devices {device_list} meet memory condition") - print(f" Condition: {condition_str}") - print(f" Wait time: {dur_s:.1f} seconds ({elapsed_minutes:.1f} minutes)") - break - - if dur_s >= timeout_s: - raise ValueError( - f"[GPU Memory Timeout] Devices {device_list} still don't meet memory condition after {dur_s:.1f} seconds\n" - f"Condition: {condition_str}\n" - f"Current status:\n" + "\n".join(f" GPU {device}: {output[device]}" for device in devices) - ) - - 
gc.collect() - torch.cuda.empty_cache() - time.sleep(5) - - -def _print_gpu_processes() -> None: - """Print GPU information including nvidia-smi and system processes.""" - - print("\n" + "=" * 80) - print("NVIDIA GPU Information (nvidia-smi)") - print("=" * 80) - - try: - nvidia_result = subprocess.run( - ["nvidia-smi"], - capture_output=True, - text=True, - timeout=5, - ) - - if nvidia_result.returncode == 0: - lines = nvidia_result.stdout.strip().split("\n") - for line in lines[:20]: - print(line) - - if len(lines) > 20: - print(f"... (showing first 20 of {len(lines)} lines)") - else: - print("nvidia-smi command failed") - - except (subprocess.TimeoutExpired, FileNotFoundError): - print("nvidia-smi not available or timed out") - except Exception as e: - print(f"Error running nvidia-smi: {e}") - - print("\n" + "=" * 80) - print("Detailed GPU Processes (nvidia-smi pmon)") - print("=" * 80) - - try: - pmon_result = subprocess.run( - ["nvidia-smi", "pmon", "-c", "1"], - capture_output=True, - text=True, - timeout=3, - ) - - if pmon_result.returncode == 0 and pmon_result.stdout.strip(): - print(pmon_result.stdout) - else: - print("No active GPU processes found via nvidia-smi pmon") - - except Exception: - print("nvidia-smi pmon not available") - - print("\n" + "=" * 80) - print("System Processes with GPU keywords") - print("=" * 80) - - -_SKIPPED_GPU_CLEANUP_MSG = ( - "\nSkipping GPU memory cleanup check (typically: instance already up; no check needed between tests)\n" -) - - -def run_pre_test_cleanup(enable_force: bool = False) -> None: - if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force: - print(_SKIPPED_GPU_CLEANUP_MSG) - return - - print("Pre-test GPU status:") - - num_gpus = torch.cuda.device_count() - if num_gpus > 0: - try: - wait_for_gpu_memory_to_clear( - devices=list(range(num_gpus)), - threshold_ratio=0.05, - ) - except Exception as e: - print(f"Pre-test cleanup note: {e}") - - -def run_post_test_cleanup(enable_force: bool = 
False) -> None: - if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force: - print(_SKIPPED_GPU_CLEANUP_MSG) - return - - if torch.cuda.is_available(): - gc.collect() - torch.cuda.empty_cache() - - print("Post-test GPU status:") - _print_gpu_processes() - - -class DeviceMemoryMonitor: - """Poll global device memory usage.""" - - def __init__(self, device_index: int, interval: float = 0.05): - self.device_index = device_index - self.interval = interval - self._peak_used_mb = 0.0 - self._stop_event = threading.Event() - self._thread: threading.Thread | None = None - - def start(self) -> None: - from vllm_omni.platforms import current_omni_platform - - def monitor_loop() -> None: - while not self._stop_event.is_set(): - try: - with current_omni_platform.device(self.device_index): - free_bytes, total_bytes = current_omni_platform.mem_get_info() - used_mb = (total_bytes - free_bytes) / (1024**2) - self._peak_used_mb = max(self._peak_used_mb, used_mb) - except Exception: - pass - time.sleep(self.interval) - - self._thread = threading.Thread(target=monitor_loop, daemon=False) - self._thread.start() - - def stop(self) -> None: - if self._thread is None: - return - self._stop_event.set() - self._thread.join(timeout=2.0) - - @property - def peak_used_mb(self) -> float: - from vllm_omni.platforms import current_omni_platform - - fallback_alloc = current_omni_platform.max_memory_allocated(device=self.device_index) / (1024**2) - fallback_reserved = current_omni_platform.max_memory_reserved(device=self.device_index) / (1024**2) - return max(self._peak_used_mb, fallback_alloc, fallback_reserved) - - def __del__(self): - self.stop() - - -__all__ = [ - "DeviceMemoryMonitor", - "get_physical_device_indices", - "run_post_test_cleanup", - "run_pre_test_cleanup", - "run_forced_gpu_cleanup_round", - "wait_for_gpu_memory_to_clear", -] diff --git a/tests/helpers/fixtures/__init__.py b/tests/helpers/fixtures/__init__.py deleted file mode 100644 index 
8bd090b7824..00000000000 --- a/tests/helpers/fixtures/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Pytest fixture modules under tests.helpers.""" diff --git a/tests/helpers/fixtures/env.py b/tests/helpers/fixtures/env.py deleted file mode 100644 index 939bad02ca4..00000000000 --- a/tests/helpers/fixtures/env.py +++ /dev/null @@ -1,59 +0,0 @@ -import os - -import pytest -import torch - - -@pytest.fixture(scope="session", autouse=True) -def default_env(): - # Keep behavior but avoid import-time side effects (RFC #2299). - keys = ("VLLM_WORKER_MULTIPROC_METHOD", "VLLM_TARGET_DEVICE") - previous = {key: os.environ.get(key) for key in keys} - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = previous["VLLM_WORKER_MULTIPROC_METHOD"] or "spawn" - os.environ["VLLM_TARGET_DEVICE"] = previous["VLLM_TARGET_DEVICE"] or ( - "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 0 else "cpu" - ) - yield - for key, value in previous.items(): - if value is None: - os.environ.pop(key, None) - else: - os.environ[key] = value - - -@pytest.fixture(scope="session") -def model_prefix() -> str: - prefix = os.environ.get("MODEL_PREFIX", "") - return f"{prefix.rstrip('/')}/" if prefix else "" - - -@pytest.fixture(autouse=True) -def clean_gpu_memory_between_tests(): - # Import here so ``tests.helpers.env`` (and vLLM platform modules) load only - # after session autouse fixtures like ``default_env`` have run (RFC #2299). - from tests.helpers.env import run_post_test_cleanup, run_pre_test_cleanup - - print("\n=== PRE-TEST GPU CLEANUP ===") - run_pre_test_cleanup() - yield - run_post_test_cleanup() - - -@pytest.fixture(scope="session", autouse=True) -def default_vllm_config(): - """Set a default VllmConfig for the whole test session. - - Session scope ensures module-scoped fixtures (e.g. ``omni_runner``) and - deferred imports of ``tests.helpers.runtime`` both see the same context. 
- Function-scoped autouse ran too late for ``OmniRunner`` setup and could - desynchronize vLLM init vs request preprocessing (e.g. renderer state). - """ - from vllm.config import DeviceConfig, VllmConfig, set_current_vllm_config - - # Use CPU device if no GPU is available (e.g., in CI environments) - has_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 0 - device = "cuda" if has_gpu else "cpu" - device_config = DeviceConfig(device=device) - - with set_current_vllm_config(VllmConfig(device_config=device_config)): - yield diff --git a/tests/helpers/fixtures/log.py b/tests/helpers/fixtures/log.py deleted file mode 100644 index 798fa4ae6c7..00000000000 --- a/tests/helpers/fixtures/log.py +++ /dev/null @@ -1,7 +0,0 @@ -import pytest - - -@pytest.fixture(autouse=True) -def log_test_name_before_test(request: pytest.FixtureRequest): - print(f"--- Running test: {request.node.name}") - yield diff --git a/tests/helpers/fixtures/run_args.py b/tests/helpers/fixtures/run_args.py deleted file mode 100644 index 975584d206b..00000000000 --- a/tests/helpers/fixtures/run_args.py +++ /dev/null @@ -1,17 +0,0 @@ -import pytest - - -def pytest_addoption(parser): - parser.addoption( - "--run-level", - action="store", - default="core_model", - choices=["core_model", "advanced_model", "full_model"], - help="Test level to run: L2, L3, L4", - ) - - -@pytest.fixture(scope="session") -def run_level(request) -> str: - """Session test level from ``--run-level`` (see CI five-level docs).""" - return request.config.getoption("--run-level") diff --git a/tests/helpers/fixtures/runtime.py b/tests/helpers/fixtures/runtime.py deleted file mode 100644 index 4cae13cd6eb..00000000000 --- a/tests/helpers/fixtures/runtime.py +++ /dev/null @@ -1,137 +0,0 @@ -"""Runtime fixtures (OmniRunner / OmniServer). Imports are deferred to fixture time. 
- -Loading ``tests.helpers.runtime`` at plugin import time (before session fixtures) -pulls in vLLM/vllm_omni too early and breaks initialization order vs the legacy -monolithic conftest. Defer imports until fixtures run so ``default_env`` / -``default_vllm_config`` run first. -""" - -from __future__ import annotations - -import threading -from collections.abc import Generator -from typing import Any - -import pytest -import yaml - -from tests.helpers.runtime import OmniServer -from tests.helpers.stage_config import modify_stage_config - -omni_fixture_lock = threading.Lock() - - -@pytest.fixture(scope="module") -def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: str) -> Generator[OmniServer, Any, None]: - """Start vLLM-Omni through the standard or stage-CLI launcher. - - The fixture stays module-scoped because multi-stage initialization is costly. - The ``use_stage_cli`` flag on ``OmniServerParams`` routes the setup through the - stage-CLI harness while still reusing the same fixture grouping semantics. - """ - with omni_fixture_lock: - from tests.helpers.runtime import OmniServer, OmniServerParams, OmniServerStageCli - - params: OmniServerParams = request.param - model = model_prefix + params.model - port = params.port - stage_config_path = params.stage_config_path - if run_level in {"advanced_model", "full_model"} and stage_config_path is not None: - with open(stage_config_path, encoding="utf-8") as f: - cfg = yaml.safe_load(f) or {} - # Strip ``load_format: dummy`` (CI overlay default) so advanced_model - # tests use real weights. New schema (``stages:``) writes the field - # flat at stage level; legacy schema (``stage_args:``) nests it as - # ``engine_args.load_format``. Handle both. 
- new_schema_stages = cfg.get("stages") - stage_key = "stages" if new_schema_stages is not None else "stage_args" - delete_path = "load_format" if new_schema_stages is not None else "engine_args.load_format" - stage_entries = cfg.get(stage_key, []) - stage_ids = [stage["stage_id"] for stage in stage_entries if "stage_id" in stage] - stage_config_path = modify_stage_config( - stage_config_path, - deletes={stage_key: {stage_id: [delete_path] for stage_id in stage_ids}}, - ) - - server_args = params.server_args or [] - if params.use_omni and params.stage_init_timeout is not None: - server_args = [*server_args, "--stage-init-timeout", str(params.stage_init_timeout)] - else: - server_args = [*server_args, "--stage-init-timeout", "600"] - if params.init_timeout is not None: - server_args = [*server_args, "--init-timeout", str(params.init_timeout)] - else: - server_args = [*server_args, "--init-timeout", "900"] - if params.use_stage_cli: - if not params.use_omni: - raise ValueError("omni_server with use_stage_cli=True requires use_omni=True") - if stage_config_path is None: - raise ValueError("omni_server with use_stage_cli=True requires a stage_config_path") - server_args += ["--stage-configs-path", stage_config_path] - - with OmniServerStageCli( - model, - stage_config_path, - server_args, - port=port, - env_dict=params.env_dict, - ) as server: - print("OmniServer started successfully") - yield server - print("OmniServer stopping...") - else: - if stage_config_path is not None: - server_args += ["--stage-configs-path", stage_config_path] - - with ( - OmniServer( - model, - server_args, - port=port, - env_dict=params.env_dict, - use_omni=params.use_omni, - ) - if port - else OmniServer( - model, - server_args, - env_dict=params.env_dict, - use_omni=params.use_omni, - ) - ) as server: - print("OmniServer started successfully") - yield server - print("OmniServer stopping...") - - print("OmniServer stopped") - - -@pytest.fixture -def openai_client(request: 
pytest.FixtureRequest, run_level: str): - """Resolve ``omni_server`` lazily so parametrized server fixtures work like upstream.""" - from tests.helpers.runtime import OpenAIClientHandler - - server = request.getfixturevalue("omni_server") - return OpenAIClientHandler(host=server.host, port=server.port, api_key="EMPTY", run_level=run_level) - - -@pytest.fixture(scope="module") -def omni_runner(request: pytest.FixtureRequest, model_prefix: str): - from tests.helpers.runtime import OmniRunner - - with omni_fixture_lock: - model, stage_config_path = request.param - model = model_prefix + model - with OmniRunner(model, seed=42, stage_configs_path=stage_config_path) as runner: - print("OmniRunner started successfully") - yield runner - print("OmniRunner stopping...") - - print("OmniRunner stopped") - - -@pytest.fixture -def omni_runner_handler(omni_runner: Any): - from tests.helpers.runtime import OmniRunnerHandler - - return OmniRunnerHandler(omni_runner) diff --git a/tests/helpers/mark.py b/tests/helpers/mark.py deleted file mode 100644 index ed45dd7e9a1..00000000000 --- a/tests/helpers/mark.py +++ /dev/null @@ -1,135 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Pytest marks and decorators for hardware / resource selection (CUDA, ROCm, …).""" - -import pytest -from vllm.utils.torch_utils import cuda_device_count_stateless - -# Re-exported from tests.helpers.env (GPU wait + DeviceMemoryMonitor). - - -def cuda_marks(*, res: str, num_cards: int): - test_platform_detail = pytest.mark.cuda - if res == "L4": - test_resource = pytest.mark.L4 - elif res == "H100": - test_resource = pytest.mark.H100 - else: - raise ValueError(f"Invalid CUDA resource type: {res}. 
Supported: L4, H100") - marks = [test_resource, test_platform_detail] - if num_cards == 1: - return marks - test_distributed = pytest.mark.distributed_cuda(num_cards=num_cards) - test_skipif = pytest.mark.skipif_cuda( - cuda_device_count_stateless() < num_cards, - reason=f"Need at least {num_cards} CUDA GPUs to run the test.", - ) - return marks + [test_distributed, test_skipif] - - -def rocm_marks(*, res: str, num_cards: int): - test_platform_detail = pytest.mark.rocm - if res == "MI325": - test_resource = pytest.mark.MI325 - else: - raise ValueError(f"Invalid ROCm resource type: {res}. Supported: MI325") - marks = [test_resource, test_platform_detail] - if num_cards == 1: - return marks - test_distributed = pytest.mark.distributed_rocm(num_cards=num_cards) - return marks + [test_distributed] - - -def xpu_marks(*, res: str, num_cards: int): - test_platform_detail = pytest.mark.xpu - if res == "B60": - test_resource = pytest.mark.B60 - else: - raise ValueError(f"Invalid XPU resource type: {res}. Supported: B60") - marks = [test_resource, test_platform_detail] - if num_cards == 1: - return marks - test_distributed = pytest.mark.distributed_rocm(num_cards=num_cards) - return marks + [test_distributed] - - -def musa_marks(*, res: str, num_cards: int): - test_platform_detail = pytest.mark.musa - if res == "S5000": - test_resource = pytest.mark.S5000 - else: - raise ValueError(f"Invalid MUSA resource type: {res}. 
Supported: S5000") - marks = [test_resource, test_platform_detail] - if num_cards == 1: - return marks - test_distributed = pytest.mark.distributed_musa(num_cards=num_cards) - return marks + [test_distributed] - - -def gpu_marks(*, res: str, num_cards: int): - test_platform = pytest.mark.gpu - if res in ("L4", "H100"): - return [test_platform] + cuda_marks(res=res, num_cards=num_cards) - if res == "MI325": - return [test_platform] + rocm_marks(res=res, num_cards=num_cards) - if res == "B60": - return [test_platform] + xpu_marks(res=res, num_cards=num_cards) - if res == "S5000": - return [test_platform] + musa_marks(res=res, num_cards=num_cards) - raise ValueError(f"Invalid resource type: {res}. Supported: L4, H100, MI325, B60, S5000") - - -def npu_marks(*, res: str, num_cards: int): - test_platform = pytest.mark.npu - if res == "A2": - test_resource = pytest.mark.A2 - elif res == "A3": - test_resource = pytest.mark.A3 - else: - test_resource = None - if num_cards == 1: - return [mark for mark in [test_platform, test_resource] if mark is not None] - test_distributed = pytest.mark.distributed_npu(num_cards=num_cards) - return [mark for mark in [test_platform, test_resource, test_distributed] if mark is not None] - - -def hardware_marks(*, res: dict[str, str], num_cards: int | dict[str, int] = 1): - for platform, _ in res.items(): - if platform not in ("cuda", "rocm", "xpu", "npu", "musa"): - raise ValueError(f"Unsupported platform: {platform}") - if isinstance(num_cards, int): - num_cards_dict = {platform: num_cards for platform in res.keys()} - else: - num_cards_dict = num_cards - for platform in num_cards_dict.keys(): - if platform not in res: - raise ValueError(f"Platform '{platform}' in num_cards but not in res.") - for platform in res.keys(): - if platform not in num_cards_dict: - num_cards_dict[platform] = 1 - - all_marks: list[pytest.MarkDecorator] = [] - for platform, resource in res.items(): - cards = num_cards_dict[platform] - if platform in ("cuda", 
"rocm", "xpu"): - marks = gpu_marks(res=resource, num_cards=cards) - elif platform == "musa": - marks = musa_marks(res=resource, num_cards=cards) - elif platform == "npu": - marks = npu_marks(res=resource, num_cards=cards) - else: - raise ValueError(f"Unsupported platform: {platform}") - all_marks.extend(marks) - return all_marks - - -def hardware_test(*, res: dict[str, str], num_cards: int | dict[str, int] = 1): - all_marks = hardware_marks(res=res, num_cards=num_cards) - - def wrapper(f): - func = f - for mark in reversed(all_marks): - func = mark(func) - return func - - return wrapper diff --git a/tests/helpers/media.py b/tests/helpers/media.py deleted file mode 100644 index c0fb9717140..00000000000 --- a/tests/helpers/media.py +++ /dev/null @@ -1,657 +0,0 @@ -"""Synthetic media generation and media/text utilities for tests.""" - -import base64 -import concurrent.futures -import gc -import hashlib -import io -import logging -import math -import multiprocessing -import os -import random -import re -import subprocess -import tempfile -import time -import uuid -from contextlib import contextmanager -from pathlib import Path -from typing import Any - -import numpy as np -import soundfile as sf -from PIL import Image - -logger = logging.getLogger(__name__) - - -def _resolve_synthetic_media_cache_dir(cache_dir: Path | str | None) -> Path: - if cache_dir is not None: - return Path(cache_dir).expanduser().resolve() - return Path(tempfile.gettempdir()) / "vllm_omni_test_synthetic_media" - - -def _np_array_from_mp4_bytes(video_bytes: bytes) -> np.ndarray: - """Decode MP4 bytes to a (T, H, W, 3) uint8 RGB stack (matches in-memory synthetic frames).""" - import cv2 - - with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp: - tmp.write(video_bytes) - path = tmp.name - cap = None - try: - cap = cv2.VideoCapture(path) - if not cap.isOpened(): - raise RuntimeError("Failed to open cached synthetic video for decode") - frames: list[np.ndarray] = [] - while True: - 
ok, frame_bgr = cap.read() - if not ok: - break - frames.append(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)) - if not frames: - raise RuntimeError("Cached synthetic video has no decodable frames") - return np.stack(frames, axis=0) - finally: - if cap is not None: - cap.release() - try: - os.unlink(path) - except OSError: - pass - - -def generate_synthetic_audio( - duration: int, - num_channels: int, - sample_rate: int = 48000, - *, - phrase_text: str = "test", - force_regenerate: bool = False, - cache_dir: Path | str | None = None, -) -> dict[str, Any]: - """ - Generate TTS speech with pyttsx3 and return base64 string. - - Caches the WAV under ``cache_dir`` when given, else under the default temp - subdirectory. Reuses the file when the same - ``duration`` / ``num_channels`` / ``sample_rate`` / ``phrase_text`` are - requested unless ``force_regenerate`` is true. - - The cache filename includes a SHA-256 digest of ``phrase_text`` so different - phrases never share a WAV cache entry. - """ - root = _resolve_synthetic_media_cache_dir(cache_dir) - root.mkdir(parents=True, exist_ok=True) - phrase_key = hashlib.sha256(phrase_text.encode("utf-8")).hexdigest() - cache_path = root / f"synth_audio_d{duration}_ch{num_channels}_sr{sample_rate}_pt{phrase_key}.wav" - - if not force_regenerate and cache_path.is_file(): - data, _sr = sf.read(str(cache_path), dtype="float32", always_2d=True) - audio_bytes = cache_path.read_bytes() - return { - "np_array": np.asarray(data, dtype=np.float32), - "base64": base64.b64encode(audio_bytes).decode("utf-8"), - "file_path": str(cache_path.resolve()), - } - - import pyttsx3 - - def _pick_voice(engine: pyttsx3.Engine) -> str | None: - voices = engine.getProperty("voices") - if not voices: - return None - - preferred_tokens = ( - "natural", - "jenny", - "sonia", - "susan", - "zira", - "aria", - "hazel", - "samantha", - "ava", - "allison", - "female", - "woman", - "english-us", - "en-us", - "english", - ) - discouraged_tokens = ( - "espeak", - 
"robot", - "mbrola", - "microsoft david", - "male", - "man", - ) - - best_voice = voices[0] - best_score = float("-inf") - for voice in voices: - voice_text = f"{getattr(voice, 'id', '')} {getattr(voice, 'name', '')}".lower() - voice_languages = " ".join( - lang.decode(errors="ignore") if isinstance(lang, bytes) else str(lang) - for lang in getattr(voice, "languages", []) - ).lower() - combined_text = f"{voice_text} {voice_languages}" - score = 0 - for idx, token in enumerate(preferred_tokens): - if token in combined_text: - score += 20 - idx - for token in discouraged_tokens: - if token in combined_text: - score -= 10 - if "english" in combined_text or "en_" in combined_text or "en-" in combined_text: - score += 4 - if "en-us" in combined_text or "english-us" in combined_text: - score += 4 - if score > best_score: - best_score = score - best_voice = voice - - return best_voice.id - - def _resample_audio(audio: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray: - if src_sr == dst_sr or len(audio) == 0: - return audio.astype(np.float32) - src_len = audio.shape[0] - dst_len = max(1, int(round(src_len * float(dst_sr) / float(src_sr)))) - src_idx = np.arange(src_len, dtype=np.float32) - dst_idx = np.linspace(0, src_len - 1, dst_len, dtype=np.float32) - resampled_channels: list[np.ndarray] = [] - for ch in range(audio.shape[1]): - resampled_channels.append(np.interp(dst_idx, src_idx, audio[:, ch]).astype(np.float32)) - return np.stack(resampled_channels, axis=1) - - def _match_channels(audio: np.ndarray, target_channels: int) -> np.ndarray: - current_channels = audio.shape[1] - if current_channels == target_channels: - return audio.astype(np.float32) - if target_channels == 1: - return np.mean(audio, axis=1, keepdims=True, dtype=np.float32) - if current_channels == 1: - return np.repeat(audio, target_channels, axis=1).astype(np.float32) - collapsed = np.mean(audio, axis=1, keepdims=True, dtype=np.float32) - return np.repeat(collapsed, target_channels, 
axis=1).astype(np.float32) - - def _trim_silence(audio: np.ndarray, threshold: float = 0.01) -> np.ndarray: - if len(audio) == 0: - return audio - energy = np.max(np.abs(audio), axis=1) - voiced = np.where(energy > threshold)[0] - if len(voiced) == 0: - return audio - start = max(0, int(voiced[0]) - int(sample_rate * 0.02)) - end = min(len(audio), int(voiced[-1]) + int(sample_rate * 0.04) + 1) - return audio[start:end] - - def _enhance_speech(audio: np.ndarray) -> np.ndarray: - if len(audio) == 0: - return audio.astype(np.float32) - enhanced = audio.astype(np.float32).copy() - enhanced -= np.mean(enhanced, axis=0, keepdims=True, dtype=np.float32) - if len(enhanced) > 1: - preemphasis = enhanced.copy() - preemphasis[1:] = enhanced[1:] - 0.94 * enhanced[:-1] - enhanced = 0.7 * enhanced + 0.3 * preemphasis - enhanced = np.sign(enhanced) * np.sqrt(np.abs(enhanced)) - fade = min(len(enhanced) // 4, max(1, int(sample_rate * 0.01))) - if fade > 1: - ramp_in = np.linspace(0.0, 1.0, fade, dtype=np.float32) - ramp_out = np.linspace(1.0, 0.0, fade, dtype=np.float32) - enhanced[:fade] *= ramp_in[:, None] - enhanced[-fade:] *= ramp_out[:, None] - peak = float(np.max(np.abs(enhanced))) - if peak > 1e-8: - enhanced = enhanced / peak * 0.95 - return enhanced.astype(np.float32) - - num_samples = int(sample_rate * max(1, duration)) - audio_data = np.zeros((num_samples, num_channels), dtype=np.float32) - - engine = pyttsx3.init() - engine.setProperty("rate", 112) - engine.setProperty("volume", 1.0) - selected_voice = _pick_voice(engine) - if selected_voice is not None: - engine.setProperty("voice", selected_voice) - - temp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) - temp_wav.close() - try: - engine.save_to_file(phrase_text, temp_wav.name) - engine.runAndWait() - engine.stop() - - ready = False - for _ in range(50): - if os.path.exists(temp_wav.name) and os.path.getsize(temp_wav.name) > 44: - ready = True - break - time.sleep(0.1) - if not ready: - raise 
RuntimeError("pyttsx3 did not produce a WAV file in time.") - - tts_audio, tts_sr = sf.read(temp_wav.name, dtype="float32", always_2d=True) - finally: - if os.path.exists(temp_wav.name): - os.unlink(temp_wav.name) - - if len(tts_audio) == 0: - raise RuntimeError("pyttsx3 produced an empty WAV file.") - - tts_audio = _resample_audio(tts_audio, tts_sr, sample_rate) - tts_audio = _match_channels(tts_audio, num_channels) - tts_audio = _trim_silence(tts_audio, threshold=0.012) - tts_audio = _enhance_speech(tts_audio) - - lead_silence = min(int(sample_rate * 0.02), num_samples // 8) - pause_samples = int(sample_rate * 0.18) - start = lead_silence - phrase_len = tts_audio.shape[0] - while start < num_samples: - take = min(phrase_len, num_samples - start) - audio_data[start : start + take] = tts_audio[:take] - start += phrase_len + pause_samples - - max_amp = float(np.max(np.abs(audio_data))) - if max_amp > 0: - audio_data = audio_data / max_amp * 0.95 - - sf.write(str(cache_path), audio_data, sample_rate, format="WAV", subtype="PCM_16") - audio_bytes = cache_path.read_bytes() - - return { - "np_array": audio_data.copy(), - "base64": base64.b64encode(audio_bytes).decode("utf-8"), - "file_path": str(cache_path.resolve()), - } - - -def _mux_mp4_bytes_with_synthetic_audio( - video_mp4_bytes: bytes, - *, - num_frames: int, - fps: float = 30.0, - sample_rate: int = 48000, -) -> bytes: - duration_sec = num_frames / fps if fps > 0 else 0.0 - duration_int = max(1, int(math.ceil(duration_sec))) - - try: - audio_result = generate_synthetic_audio( - duration=duration_int, - num_channels=1, - sample_rate=sample_rate, - ) - audio_pcm = audio_result["np_array"] - except Exception as e: - logger.warning("Synthetic video: generate_synthetic_audio failed (%s); using video-only MP4.", e) - return video_mp4_bytes - - try: - import imageio_ffmpeg - - ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe() - except Exception: - ffmpeg_exe = "ffmpeg" - - try: - with 
tempfile.TemporaryDirectory(prefix="syn_vid_mux_") as tmp: - vid_path = os.path.join(tmp, "video.mp4") - wav_path = os.path.join(tmp, "audio.wav") - out_path = os.path.join(tmp, "out.mp4") - with open(vid_path, "wb") as f: - f.write(video_mp4_bytes) - sf.write(wav_path, audio_pcm, sample_rate, format="WAV", subtype="PCM_16") - cmd = [ - ffmpeg_exe, - "-y", - "-nostdin", - "-hide_banner", - "-loglevel", - "error", - "-i", - vid_path, - "-i", - wav_path, - "-c:v", - "copy", - "-c:a", - "aac", - "-b:a", - "128k", - "-shortest", - "-movflags", - "+faststart", - out_path, - ] - subprocess.run(cmd, check=True, stdin=subprocess.DEVNULL, timeout=300) - with open(out_path, "rb") as f: - return f.read() - except ( - FileNotFoundError, - subprocess.CalledProcessError, - subprocess.TimeoutExpired, - OSError, - ) as e: - logger.warning("Synthetic video: audio mux failed (%s); using video-only MP4.", e) - return video_mp4_bytes - - -def generate_synthetic_video( - width: int, - height: int, - num_frames: int, - *, - embed_audio: bool = False, - force_regenerate: bool = False, - cache_dir: Path | str | None = None, -) -> dict[str, Any]: - """ - Generate synthetic MP4 (optional AAC audio). Caches final bytes by - ``width`` / ``height`` / ``num_frames`` / ``embed_audio`` unless - ``force_regenerate`` is true. Cache root: ``cache_dir`` if given, else the - default temp subdirectory. 
- """ - root = _resolve_synthetic_media_cache_dir(cache_dir) - root.mkdir(parents=True, exist_ok=True) - cache_path = root / f"synth_video_w{width}_h{height}_nf{num_frames}_ea{int(embed_audio)}.mp4" - - if not force_regenerate and cache_path.is_file(): - video_bytes = cache_path.read_bytes() - return { - "np_array": _np_array_from_mp4_bytes(video_bytes), - "base64": base64.b64encode(video_bytes).decode("utf-8"), - "file_path": str(cache_path.resolve()), - } - - import cv2 - import imageio - - num_balls = random.randint(3, 8) - balls = [] - for _ in range(num_balls): - radius = min(width, height) // 8 - if radius < 1: - raise ValueError(f"Video dimensions ({width}x{height}) too small") - x = random.randint(radius, width - radius) - y = random.randint(radius, height - radius) - speed = random.uniform(3.0, 8.0) - angle = random.uniform(0, 2 * math.pi) - vx = speed * math.cos(angle) - vy = speed * math.sin(angle) - color_bgr = (random.randint(50, 255), random.randint(50, 255), random.randint(50, 255)) - balls.append({"x": x, "y": y, "vx": vx, "vy": vy, "radius": radius, "color_bgr": color_bgr}) - - video_frames = [] - for _ in range(num_frames): - frame_bgr = np.zeros((height, width, 3), dtype=np.uint8) - for ball in balls: - ball["x"] += ball["vx"] - ball["y"] += ball["vy"] - if ball["x"] - ball["radius"] <= 0 or ball["x"] + ball["radius"] >= width: - ball["vx"] = -ball["vx"] - ball["x"] = max(ball["radius"], min(width - ball["radius"], ball["x"])) - if ball["y"] - ball["radius"] <= 0 or ball["y"] + ball["radius"] >= height: - ball["vy"] = -ball["vy"] - ball["y"] = max(ball["radius"], min(height - ball["radius"], ball["y"])) - x, y = int(ball["x"]), int(ball["y"]) - radius = int(ball["radius"]) - cv2.circle(frame_bgr, (x, y), radius, ball["color_bgr"], -1) - frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB) - video_frames.append(frame_rgb) - - fps = 30 - buffer = io.BytesIO() - writer_kwargs = { - "format": "mp4", - "fps": fps, - "codec": "libx264", - "quality": 
7, - "pixelformat": "yuv420p", - "macro_block_size": 16, - "ffmpeg_params": ["-preset", "medium", "-crf", "23", "-movflags", "+faststart", "-pix_fmt", "yuv420p"], - } - try: - with imageio.get_writer(buffer, **writer_kwargs) as writer: - for frame in video_frames: - writer.append_data(frame) - buffer.seek(0) - video_only_bytes = buffer.read() - except Exception as e: - print(f"Warning: Failed to encode synthetic video: {e}") - raise - video_bytes = ( - _mux_mp4_bytes_with_synthetic_audio(video_only_bytes, num_frames=num_frames, fps=float(fps)) - if embed_audio - else video_only_bytes - ) - - cache_path.write_bytes(video_bytes) - - return { - "np_array": np.array(video_frames), - "base64": base64.b64encode(video_bytes).decode("utf-8"), - "file_path": str(cache_path.resolve()), - } - - -def generate_synthetic_image( - width: int, - height: int, - *, - force_regenerate: bool = False, - cache_dir: Path | str | None = None, - seed: int | None = None, -) -> dict[str, Any]: - """ - Random colored squares on white background. Caches JPEG by ``width`` / - ``height`` unless ``force_regenerate`` is true. Cache root: ``cache_dir`` - if given, else the default temp subdirectory. 
- """ - if seed is not None: - random.seed(seed) - - root = _resolve_synthetic_media_cache_dir(cache_dir) - root.mkdir(parents=True, exist_ok=True) - cache_path = root / f"synth_image_w{width}_h{height}.jpg" - - if not force_regenerate and cache_path.is_file(): - from PIL import Image as PILImage - - image = PILImage.open(cache_path) - image.load() - image_bytes = cache_path.read_bytes() - return { - "np_array": np.array(image).copy(), - "base64": base64.b64encode(image_bytes).decode("utf-8"), - "file_path": str(cache_path.resolve()), - } - - from PIL import ImageDraw - - image = Image.new("RGB", (width, height), (255, 255, 255)) - draw = ImageDraw.Draw(image) - num_squares = random.randint(3, 8) - for _ in range(num_squares): - square_size = random.randint(max(1, min(width, height) // 8), max(2, min(width, height) // 4)) - x = random.randint(0, max(0, width - square_size - 1)) - y = random.randint(0, max(0, height - square_size - 1)) - color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) - border_width = random.randint(1, 5) - draw.rectangle([x, y, x + square_size, y + square_size], fill=color, outline=(0, 0, 0), width=border_width) - - image.save(str(cache_path), format="JPEG", quality=85, optimize=True) - image_bytes = cache_path.read_bytes() - - return { - "np_array": np.array(image).copy(), - "base64": base64.b64encode(image_bytes).decode("utf-8"), - "file_path": str(cache_path.resolve()), - } - - -def decode_b64_image(b64: str): - img = Image.open(io.BytesIO(base64.b64decode(b64))) - img.load() - return img - - -def preprocess_text(text): - import opencc - - word_to_num = { - "zero": "0", - "one": "1", - "two": "2", - "three": "3", - "four": "4", - "five": "5", - "six": "6", - "seven": "7", - "eight": "8", - "nine": "9", - "ten": "10", - } - for word, num in word_to_num.items(): - pattern = r"\b" + re.escape(word) + r"\b" - text = re.sub(pattern, num, text, flags=re.IGNORECASE) - - text = re.sub(r"[^\w\s]", "", text) - text = 
re.sub(r"\s+", " ", text) - cc = opencc.OpenCC("t2s") - text = cc.convert(text) - text = re.sub(r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", text) - return text.lower().strip() - - -def cosine_similarity_text(text1, text2, n: int = 3): - from collections import Counter - - if not text1 or not text2: - return 0.0 - - text1 = preprocess_text(text1) - text2 = preprocess_text(text2) - print(f"cosine similarity text1 is: {text1}, text2 is: {text2}") - - ngrams1 = [text1[i : i + n] for i in range(len(text1) - n + 1)] - ngrams2 = [text2[i : i + n] for i in range(len(text2) - n + 1)] - counter1 = Counter(ngrams1) - counter2 = Counter(ngrams2) - - all_ngrams = set(counter1.keys()) | set(counter2.keys()) - vec1 = [counter1.get(ng, 0) for ng in all_ngrams] - vec2 = [counter2.get(ng, 0) for ng in all_ngrams] - dot_product = sum(a * b for a, b in zip(vec1, vec2)) - norm1 = sum(a * a for a in vec1) ** 0.5 - norm2 = sum(b * b for b in vec2) ** 0.5 - if norm1 == 0 or norm2 == 0: - return 0.0 - cosine = dot_product / (norm1 * norm2) - # Down-weight when lengths differ: repeated/hallucinated transcripts stay - # high in bag-of-ngrams cosine (e.g. ABCABCABC vs ABC) but should score low. 
- len1, len2 = len(text1), len(text2) - length_harmony = (2.0 * min(len1, len2)) / (len1 + len2) - return cosine * length_harmony - - -def _merge_base64_audio_to_segment(base64_list: list[str]): - from pydub import AudioSegment - - merged = None - for b64 in base64_list: - raw = base64.b64decode(b64.split(",", 1)[-1]) - seg = AudioSegment.from_file(io.BytesIO(raw)) - merged = seg if merged is None else merged + seg - return merged - - -@contextmanager -def _serialize_whisper_small_model_download(): - """Serialize Whisper ``small`` cache writes across processes (Linux/Unix).""" - import fcntl - - lock_path = Path.home() / ".cache" / "whisper" / ".small_model_download.lock" - lock_path.parent.mkdir(parents=True, exist_ok=True) - f = open(lock_path, "a+b") - try: - fcntl.flock(f.fileno(), fcntl.LOCK_EX) - yield - finally: - fcntl.flock(f.fileno(), fcntl.LOCK_UN) - f.close() - - -def _whisper_transcribe_in_current_process(output_path: str) -> str: - import whisper - - device_index = None - from vllm_omni.platforms import current_omni_platform - - if current_omni_platform.is_available(): - n = current_omni_platform.get_device_count() - if n == 1: - device_index = 0 - elif n > 1: - device_index = n - 1 - - if device_index is not None: - torch_device = current_omni_platform.get_torch_device(device_index) - current_omni_platform.set_device(torch_device) - device = str(torch_device) - use_accelerator = True - else: - use_accelerator = False - device = "cpu" - - with _serialize_whisper_small_model_download(): - model = whisper.load_model("small", device=device) - try: - text = model.transcribe( - output_path, - temperature=0.0, - word_timestamps=True, - condition_on_previous_text=False, - )["text"] - finally: - del model - gc.collect() - if use_accelerator: - current_omni_platform.synchronize() - current_omni_platform.empty_cache() - return text or "" - - -def convert_audio_file_to_text(output_path: str) -> str: - """Convert an audio file to text in an isolated 
subprocess.""" - ctx = multiprocessing.get_context("spawn") - with concurrent.futures.ProcessPoolExecutor(max_workers=1, mp_context=ctx) as executor: - future = executor.submit(_whisper_transcribe_in_current_process, output_path) - return future.result() - - -def convert_audio_bytes_to_text(raw_bytes: bytes) -> str: - output_path = f"./test_{uuid.uuid4().hex}.wav" - data, samplerate = sf.read(io.BytesIO(raw_bytes)) - sf.write(output_path, data, samplerate, format="WAV", subtype="PCM_16") - print(f"audio data is saved: {output_path}") - return convert_audio_file_to_text(output_path) - - -__all__ = [ - "_merge_base64_audio_to_segment", - "convert_audio_bytes_to_text", - "convert_audio_file_to_text", - "cosine_similarity_text", - "decode_b64_image", - "generate_synthetic_audio", - "generate_synthetic_image", - "generate_synthetic_video", - "preprocess_text", -] diff --git a/tests/helpers/runtime.py b/tests/helpers/runtime.py deleted file mode 100644 index 0cf0f9e480d..00000000000 --- a/tests/helpers/runtime.py +++ /dev/null @@ -1,1406 +0,0 @@ -"""Server/client/runner runtime primitives for tests.""" - -import base64 -import concurrent.futures -import io -import json -import os -import socket -import subprocess -import sys -import tempfile -import time -from dataclasses import dataclass -from io import BytesIO -from pathlib import Path -from typing import Any, NamedTuple - -import psutil -import requests -import soundfile as sf -import torch -import yaml -from openai import OpenAI, omit -from PIL import Image -from vllm import TextPrompt -from vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from vllm.logger import init_logger - -from tests.helpers.assertions import ( - assert_audio_speech_response, - assert_diffusion_response, - assert_omni_response, -) -from tests.helpers.env import run_forced_gpu_cleanup_round -from tests.helpers.media import ( - _merge_base64_audio_to_segment, - convert_audio_bytes_to_text, - decode_b64_image, -) -from 
vllm_omni.config.stage_config import resolve_deploy_yaml -from vllm_omni.platforms import current_omni_platform - -logger = init_logger(__name__) - -PromptAudioInput = list[tuple[Any, int]] | tuple[Any, int] | None -PromptImageInput = list[Any] | Any | None -PromptVideoInput = list[Any] | Any | None - -try: - from vllm.distributed.parallel_state import cleanup_dist_env_and_memory # type: ignore -except Exception: # pragma: no cover - - def cleanup_dist_env_and_memory() -> None: - return None - - -def get_open_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("127.0.0.1", 0)) - return int(s.getsockname()[1]) - - -def dummy_messages_from_mix_data( - system_prompt: dict[str, Any] = None, - video_data_url: Any = None, - audio_data_url: Any = None, - image_data_url: Any = None, - content_text: str = None, -): - """Create messages with video、image、audio data URL for OpenAI API.""" - if content_text is not None: - content = [{"type": "text", "text": content_text}] - else: - content = [] - - media_items = [] - if isinstance(video_data_url, list): - for video_url in video_data_url: - media_items.append((video_url, "video")) - else: - media_items.append((video_data_url, "video")) - - if isinstance(image_data_url, list): - for url in image_data_url: - media_items.append((url, "image")) - else: - media_items.append((image_data_url, "image")) - - if isinstance(audio_data_url, list): - for url in audio_data_url: - media_items.append((url, "audio")) - else: - media_items.append((audio_data_url, "audio")) - - content.extend( - {"type": f"{media_type}_url", f"{media_type}_url": {"url": url}} - for url, media_type in media_items - if url is not None - ) - messages = [{"role": "user", "content": content}] - if system_prompt is not None: - messages = [system_prompt] + messages - return messages - - -def _omni_subprocess_cwd() -> str: - """Repo root for ``python -m vllm_omni...`` (legacy conftest lived under ``tests/``; helpers under 
``tests/helpers/``).""" - return os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..")) - - -class OmniServerParams(NamedTuple): - model: str - port: int | None = None - stage_config_path: str | None = None - server_args: list[str] | None = None - env_dict: dict[str, str] | None = None - use_omni: bool = True - use_stage_cli: bool = False - init_timeout: int | None = None - stage_init_timeout: int | None = None # None: fixture supplies default (600 s) - - -class OmniServer: - """Omniserver for vLLM-Omni tests.""" - - def __init__( - self, - model: str, - serve_args: list[str], - *, - port: int | None = None, - env_dict: dict[str, str] | None = None, - use_omni: bool = True, - ) -> None: - run_forced_gpu_cleanup_round() - cleanup_dist_env_and_memory() - self.model = model - self.serve_args = serve_args - self.env_dict = env_dict - self.use_omni = use_omni - self.proc: subprocess.Popen | None = None - self.host = "127.0.0.1" - self.port = get_open_port() if port is None else port - - def _start_server(self) -> None: - env = os.environ.copy() - env.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - if self.env_dict is not None: - env.update(self.env_dict) - - cmd = [ - sys.executable, - "-m", - "vllm_omni.entrypoints.cli.main", - "serve", - self.model, - "--host", - self.host, - "--port", - str(self.port), - ] - if self.use_omni: - cmd.append("--omni") - cmd += self.serve_args - - print(f"Launching OmniServer with: {' '.join(cmd)}") - self.proc = subprocess.Popen( - cmd, - env=env, - cwd=_omni_subprocess_cwd(), - ) - - max_wait = 1200 - start_time = time.time() - while time.time() - start_time < max_wait: - ret = self.proc.poll() - if ret is not None: - raise RuntimeError(f"Server processes exited with code {ret} before becoming ready.") - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(1) - if sock.connect_ex((self.host, self.port)) == 0: - print(f"Server ready on {self.host}:{self.port}") - return - time.sleep(2) - 
raise RuntimeError(f"Server failed to start within {max_wait} seconds") - - def _kill_process_tree(self, pid): - try: - parent = psutil.Process(pid) - children = parent.children(recursive=True) - all_pids = [pid] + [child.pid for child in children] - - for child in children: - try: - child.terminate() - except psutil.NoSuchProcess: - pass - - _, still_alive = psutil.wait_procs(children, timeout=10) - - for child in still_alive: - try: - child.kill() - except psutil.NoSuchProcess: - pass - - try: - parent.terminate() - parent.wait(timeout=10) - except (psutil.NoSuchProcess, psutil.TimeoutExpired): - try: - parent.kill() - except psutil.NoSuchProcess: - pass - - time.sleep(1) - alive_processes = [] - for check_pid in all_pids: - if psutil.pid_exists(check_pid): - alive_processes.append(check_pid) - - if alive_processes: - print(f"Warning: Processes still alive: {alive_processes}") - for alive_pid in alive_processes: - try: - subprocess.run(["kill", "-9", str(alive_pid)], timeout=2) - except Exception as e: - print(f"Cleanup failed: {e}") - - except psutil.NoSuchProcess: - pass - - def __enter__(self): - self._start_server() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.proc: - self._kill_process_tree(self.proc.pid) - run_forced_gpu_cleanup_round() - cleanup_dist_env_and_memory() - - -class OmniServerStageCli(OmniServer): - """Omni server harness that exercises the stage CLI flow.""" - - def __init__( - self, - model: str, - stage_config_path: str, - serve_args: list[str] | None = None, - *, - stage_ids: list[int] | None = None, - port: int | None = None, - env_dict: dict[str, str] | None = None, - ) -> None: - super().__init__(model, serve_args or [], port=port, env_dict=env_dict, use_omni=True) - self.stage_config_path = stage_config_path - self.master_port = get_open_port() - self.visible_device_list = self._load_visible_device_list(env_dict) - resolved_cfg = resolve_deploy_yaml(stage_config_path) - # Dump the resolved deploy config so 
CI logs show each stage's - # gpu_memory_utilization / max_model_len / max_num_seqs after - # base_config inheritance and overlay merge — essential when - # diagnosing OOMs that depend on the merged values. - print( - f"[OmniServerStageCli] Resolved deploy config from {stage_config_path}:\n" - f"{yaml.safe_dump(resolved_cfg, sort_keys=False, default_flow_style=False)}", - flush=True, - ) - self.stage_runtime_devices = self._load_stage_runtime_devices(resolved_cfg) - self.stage_ids = stage_ids or self._load_stage_ids(resolved_cfg) - if 0 not in self.stage_ids: - raise ValueError(f"Stage CLI test requires stage_id=0 in config: {stage_config_path}") - self.stage_procs: dict[int, subprocess.Popen] = {} - self.proc = None - - @staticmethod - def _stage_entries(cfg: dict) -> list[dict]: - """Return the list of stage entries from either legacy (``stage_args``) - or new-schema (``stages``) deploy YAMLs.""" - return cfg.get("stage_args") or cfg.get("stages") or [] - - @staticmethod - def _load_stage_ids(resolved_config: dict) -> list[int]: - stage_ids = [ - stage["stage_id"] for stage in OmniServerStageCli._stage_entries(resolved_config) if "stage_id" in stage - ] - if not stage_ids: - raise ValueError("No stage IDs found in resolved config") - return stage_ids - - @staticmethod - def _load_stage_runtime_devices(resolved_config: dict) -> dict[int, str]: - runtime_devices: dict[int, str] = {} - for stage in OmniServerStageCli._stage_entries(resolved_config): - stage_id = stage.get("stage_id") - # New schema: stage.devices is flat at stage level. - # Legacy schema: stage.runtime.devices is nested. 
- devices = stage.get("devices") or stage.get("runtime", {}).get("devices") - if stage_id is not None and devices: - runtime_devices[int(stage_id)] = str(devices) - return runtime_devices - - @classmethod - def _parse_device_list(cls, devices: str | int) -> list[str]: - if isinstance(devices, int): - if devices < 0: - raise ValueError("Device IDs must be non-negative integers") - return [str(devices)] - return [token.strip() for token in str(devices).split(",") if token.strip()] - - @classmethod - def _load_visible_device_list(cls, env_dict: dict[str, str] | None) -> list[str] | None: - env = os.environ.copy() - if env_dict is not None: - env.update(env_dict) - - env_var = getattr(current_omni_platform, "device_control_env_var", None) - if env_var and env_var in env: - return [token.strip() for token in env[env_var].split(",") if token.strip()] - return None - - @classmethod - def _map_stage_devices(cls, stage_id: int, visible_device_list: list[str] | None, devices: str) -> str: - device_list = cls._parse_device_list(devices) - - if visible_device_list is None: - return ",".join(device_list) - - if not all(device.isdigit() for device in device_list): - raise ValueError("Logical devices must be non-negative integers") - - logical_ids = [int(device) for device in device_list] - if logical_ids and max(logical_ids) >= len(visible_device_list): - raise ValueError( - f"Stage {stage_id} has logical IDs {device_list}, one or more of which exceed the number of visible devices" - ) - - return ",".join(visible_device_list[idx] for idx in logical_ids) - - def _set_stage_device_env(self, stage_id: int, env: dict[str, str], devices: str) -> None: - mapped_devices = self._map_stage_devices(stage_id, self.visible_device_list, devices) - env_var = getattr(current_omni_platform, "device_control_env_var", None) - if env_var: - env[env_var] = mapped_devices - - def _build_stage_cmd(self, stage_id: int, *, headless: bool) -> list[str]: - cmd = [ - sys.executable, - "-m", - 
"vllm_omni.entrypoints.cli.main", - "serve", - self.model, - "--omni", - "--stage-configs-path", - self.stage_config_path, - "--stage-id", - str(stage_id), - "--omni-master-address", - self.host, - "--omni-master-port", - str(self.master_port), - ] - - if headless: - cmd.append("--headless") - else: - cmd += ["--host", self.host, "--port", str(self.port)] - - cmd += self.serve_args - return cmd - - def _launch_stage(self, stage_id: int, *, headless: bool) -> None: - env = os.environ.copy() - env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - if self.env_dict is not None: - env.update(self.env_dict) - - devices = self.stage_runtime_devices.get(stage_id) - if devices: - self._set_stage_device_env(stage_id, env, devices) - - cmd = self._build_stage_cmd(stage_id, headless=headless) - print(f"Launching OmniServerStageCli stage {stage_id}: {' '.join(cmd)}") - # Capture each subprocess's stdout+stderr to a per-stage log file so - # debugging "Stage N exited before API server ready" doesn't rely on - # guessing; the file is surfaced in the RuntimeError message. 
- log_path = Path(tempfile.gettempdir()) / f"omni_stage_{stage_id}_{self.master_port}.log" - self._stage_log_paths = getattr(self, "_stage_log_paths", {}) - self._stage_log_paths[stage_id] = log_path - log_fh = open(log_path, "w", buffering=1) # noqa: SIM115 - closed in __exit__ - self._stage_log_files = getattr(self, "_stage_log_files", {}) - self._stage_log_files[stage_id] = log_fh - proc = subprocess.Popen( - cmd, - env=env, - cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - stdout=log_fh, - stderr=subprocess.STDOUT, - ) - self.stage_procs[stage_id] = proc - if stage_id == 0: - self.proc = proc - - def _ensure_stage_processes_alive(self) -> None: - for stage_id, proc in self.stage_procs.items(): - ret = proc.poll() - if ret is not None: - log_path = getattr(self, "_stage_log_paths", {}).get(stage_id) - tail = "" - if log_path and log_path.exists(): - try: - with open(log_path, encoding="utf-8", errors="replace") as f: - lines = f.readlines() - tail = "\n=== Last 60 lines of stage {} log ({}) ===\n{}".format( - stage_id, log_path, "".join(lines[-60:]) or "" - ) - except Exception as exc: # pragma: no cover - diagnostic only - tail = f"\n" - raise RuntimeError(f"Stage {stage_id} exited with code {ret} before API server became ready.{tail}") - - def _start_server(self) -> None: - ordered_stage_ids = [0, *[stage_id for stage_id in self.stage_ids if stage_id != 0]] - - self._launch_stage(0, headless=False) - time.sleep(2) - self._ensure_stage_processes_alive() - - for stage_id in ordered_stage_ids[1:]: - self._launch_stage(stage_id, headless=True) - - max_wait = 1200 - start_time = time.time() - while time.time() - start_time < max_wait: - self._ensure_stage_processes_alive() - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(1) - result = sock.connect_ex((self.host, self.port)) - if result == 0: - print(f"OmniServerStageCli ready on {self.host}:{self.port}") - return - time.sleep(2) - - raise 
RuntimeError(f"OmniServerStageCli failed to start within {max_wait} seconds") - - def _dump_stage_logs_for_debug(self, head_lines: int = 300, tail_lines: int = 500) -> None: - """Tail each stage's subprocess log back to stdout on teardown. - - Stage subprocesses redirect stdout/stderr to ``/tmp/omni_stage_*.log`` - so we don't spam the main CI stream while tests run; but that also - hides engine init (KV cache size, Available KV cache memory, vLLM - engine config) when things go wrong. Dump them here so buildkite - captures them post-run. Head covers engine init; tail covers - whatever state the stage was in when it was torn down. - """ - log_paths = getattr(self, "_stage_log_paths", {}) or {} - for stage_id in sorted(log_paths): - log_path = log_paths[stage_id] - if not log_path or not log_path.exists(): - continue - try: - with open(log_path, encoding="utf-8", errors="replace") as f: - lines = f.readlines() - except Exception as exc: # pragma: no cover - diagnostic only - print(f"[OmniServerStageCli] stage {stage_id} log read failed: {exc}", flush=True) - continue - total = len(lines) - if total <= head_lines + tail_lines: - head_chunk = lines - tail_chunk = [] - elided = 0 - else: - head_chunk = lines[:head_lines] - tail_chunk = lines[-tail_lines:] - elided = total - head_lines - tail_lines - print(f"\n=== stage {stage_id} log HEAD ({log_path}) ===", flush=True) - print("".join(head_chunk).rstrip("\n"), flush=True) - if tail_chunk: - print(f"\n... 
[{elided} lines elided] ...", flush=True) - print(f"\n=== stage {stage_id} log TAIL ({log_path}) ===", flush=True) - print("".join(tail_chunk).rstrip("\n"), flush=True) - print(f"=== end stage {stage_id} log ===\n", flush=True) - - def __exit__(self, exc_type, exc_val, exc_tb): - self._dump_stage_logs_for_debug() - for stage_id in sorted(self.stage_procs, reverse=True): - proc = self.stage_procs[stage_id] - if proc.poll() is None: - self._kill_process_tree(proc.pid) - run_forced_gpu_cleanup_round() - cleanup_dist_env_and_memory() - - -@dataclass -class OmniResponse: - text_content: str | None = None - audio_data: list[str] | None = None - audio_content: str | None = None - audio_format: str | None = None - audio_bytes: bytes | None = None - e2e_latency: float | None = None - success: bool = False - error_message: str | None = None - cached_tokens: int | None = None - - -@dataclass -class DiffusionResponse: - text_content: str | None = None - images: list[Image.Image] | None = None - audios: list[Any] | None = None - videos: list[Any] | None = None - e2e_latency: float | None = None - success: bool = False - error_message: str | None = None - - -class OpenAIClientHandler: - def __init__(self, host: str = "127.0.0.1", port: int = None, api_key: str = "EMPTY", run_level: str = None): - if port is None: - port = get_open_port() - self.base_url = f"http://{host}:{port}" - self.client = OpenAI(base_url=f"http://{host}:{port}/v1", api_key=api_key) - self.run_level = run_level - - def _process_stream_omni_response(self, chat_completion) -> OmniResponse: - result = OmniResponse() - start_time = time.perf_counter() - try: - text_content = "" - audio_data = [] - for chunk in chat_completion: - for choice in chunk.choices: - content = getattr(getattr(choice, "delta", None), "content", None) - modality = getattr(chunk, "modality", None) - if modality == "audio" and content: - audio_data.append(content) - elif modality == "text" and content: - text_content += content - 
result.e2e_latency = time.perf_counter() - start_time - audio_content = None - if audio_data: - merged_seg = _merge_base64_audio_to_segment(audio_data) - wav_buf = BytesIO() - merged_seg.export(wav_buf, format="wav") - result.audio_bytes = wav_buf.getvalue() - audio_content = convert_audio_bytes_to_text(result.audio_bytes) - result.text_content = text_content - result.audio_data = audio_data - result.audio_content = audio_content - result.success = True - except Exception as e: - result.error_message = f"Stream processing error: {str(e)}" - print(f"Error: {result.error_message}") - return result - - def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse: - result = OmniResponse() - start_time = time.perf_counter() - try: - audio_data = None - text_content = None - for choice in chat_completion.choices: - if hasattr(choice.message, "audio") and choice.message.audio is not None: - audio_data = choice.message.audio.data - if hasattr(choice.message, "content") and choice.message.content is not None: - text_content = choice.message.content - # Extract cached_tokens for prefix caching tests - usage = getattr(chat_completion, "usage", None) - if usage and (details := getattr(usage, "prompt_tokens_details", None)): - result.cached_tokens = details.cached_tokens - result.e2e_latency = time.perf_counter() - start_time - audio_content = None - if audio_data: - result.audio_bytes = base64.b64decode(audio_data) - audio_content = convert_audio_bytes_to_text(result.audio_bytes) - result.text_content = text_content - result.audio_content = audio_content - result.success = True - except Exception as e: - result.error_message = f"Non-stream processing error: {str(e)}" - print(f"Error: {result.error_message}") - return result - - def _process_diffusion_response(self, chat_completion) -> DiffusionResponse: - result = DiffusionResponse() - start_time = time.perf_counter() - try: - images = [] - for choice in chat_completion.choices: - content = 
getattr(choice.message, "content", None) - if isinstance(content, list): - for item in content: - image_url = None - if isinstance(item, dict): - image_url = item.get("image_url", {}).get("url") - else: - image_url_obj = getattr(item, "image_url", None) - image_url = getattr(image_url_obj, "url", None) if image_url_obj else None - if image_url and image_url.startswith("data:image"): - b64_data = image_url.split(",", 1)[1] - images.append(decode_b64_image(b64_data)) - result.e2e_latency = time.perf_counter() - start_time - result.images = images if images else None - result.success = True - except Exception as e: - result.error_message = f"Diffusion response processing error: {str(e)}" - print(f"Error: {result.error_message}") - return result - - def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: - responses: list[OmniResponse] = [] - stream = request_config.get("stream", False) - modalities = request_config.get("modalities", ["text", "audio"]) - extra_body: dict[str, Any] = {} - if "speaker" in request_config: - extra_body["speaker"] = request_config["speaker"] - if request_config.get("use_audio_in_video"): - mm = dict(extra_body.get("mm_processor_kwargs") or {}) - mm["use_audio_in_video"] = True - extra_body["mm_processor_kwargs"] = mm - if "sampling_params_list" in request_config: - extra_body["sampling_params_list"] = request_config["sampling_params_list"] - - create_kwargs: dict[str, Any] = { - "model": request_config.get("model"), - "messages": request_config.get("messages"), - "stream": stream, - "modalities": modalities, - } - if extra_body: - create_kwargs["extra_body"] = extra_body - - if request_num == 1: - chat_completion = self.client.chat.completions.create(**create_kwargs) - resp = ( - self._process_stream_omni_response(chat_completion) - if stream - else self._process_non_stream_omni_response(chat_completion) - ) - assert_omni_response(resp, request_config, run_level=self.run_level) - 
responses.append(resp) - return responses - - def _one(): - chat_completion = self.client.chat.completions.create(**create_kwargs) - return ( - self._process_stream_omni_response(chat_completion) - if stream - else self._process_non_stream_omni_response(chat_completion) - ) - - with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: - futures = [executor.submit(_one) for _ in range(request_num)] - for future in concurrent.futures.as_completed(futures): - resp = future.result() - assert_omni_response(resp, request_config, run_level=self.run_level) - responses.append(resp) - return responses - - def _process_stream_audio_speech_response(self, response, *, response_format: str | None = None) -> OmniResponse: - """ - Process streaming /v1/audio/speech responses into an OmniResponse. - - This mirrors _process_stream_omni_response but operates on low-level - audio bytes and produces an OmniResponse with audio_content filled - from Whisper transcription. - """ - result = OmniResponse() - start_time = time.perf_counter() - - try: - # Aggregate all audio bytes from the streaming response. - data = bytearray() - - # Preferred OpenAI helper. - if hasattr(response, "iter_bytes") and callable(getattr(response, "iter_bytes")): - for chunk in response.iter_bytes(): - if chunk: - data.extend(chunk) - else: - # Generic iterable-of-bytes fallback (e.g., generator or list of chunks). 
- try: - iterator = iter(response) - except TypeError: - iterator = None - - if iterator is not None: - for chunk in iterator: - if not chunk: - continue - if isinstance(chunk, (bytes, bytearray)): - data.extend(chunk) - elif hasattr(chunk, "data"): - data.extend(chunk.data) # type: ignore[arg-type] - elif hasattr(chunk, "content"): - data.extend(chunk.content) # type: ignore[arg-type] - else: - raise TypeError(f"Unsupported stream chunk type: {type(chunk)}") - else: - raise TypeError(f"Unsupported audio speech streaming response type: {type(response)}") - - raw_bytes = bytes(data) - if response_format == "pcm": - transcript = None - else: - transcript = convert_audio_bytes_to_text(raw_bytes) - - # Populate OmniResponse. - result.audio_bytes = raw_bytes - result.audio_content = transcript - result.e2e_latency = time.perf_counter() - start_time - result.success = True - result.audio_format = getattr(response, "response", None) - if result.audio_format is not None: - result.audio_format = result.audio_format.headers.get("content-type", "") - - except Exception as e: - result.error_message = f"Audio speech stream processing error: {str(e)}" - print(f"Error: {result.error_message}") - - return result - - def _process_non_stream_audio_speech_response( - self, response, *, response_format: str | None = None - ) -> OmniResponse: - """ - Process non-streaming /v1/audio/speech responses into an OmniResponse. - - This mirrors _process_non_stream_omni_response but for the binary - audio payload returned by audio.speech.create. 
- """ - result = OmniResponse() - start_time = time.perf_counter() - - try: - # OpenAI non-streaming audio.speech.create returns HttpxBinaryResponseContent (.read() or .content) - if hasattr(response, "read") and callable(getattr(response, "read")): - raw_bytes = response.read() - elif hasattr(response, "content"): - raw_bytes = response.content # type: ignore[assignment] - else: - raise TypeError(f"Unsupported audio speech response type: {type(response)}") - - if response_format == "pcm": - transcript = None - else: - transcript = convert_audio_bytes_to_text(raw_bytes) - - result.audio_bytes = raw_bytes - result.audio_content = transcript - result.e2e_latency = time.perf_counter() - start_time - result.success = True - result.audio_format = getattr(response, "response", None) - if result.audio_format is not None: - result.audio_format = result.audio_format.headers.get("content-type", "") - - except Exception as e: - result.error_message = f"Audio speech non-stream processing error: {str(e)}" - print(f"Error: {result.error_message}") - - return result - - def send_audio_speech_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: - """ - Call the /v1/audio/speech endpoint using the same configuration-dict - style as send_omni_request, but via the OpenAI Python client's - audio.speech APIs. 
- - Expected keys in request_config: - - model: model name/path (required) - - input: text to synthesize (required) - - response_format: audio format such as "wav" or "pcm" (optional) - - task_type, ref_text, ref_audio: TTS-specific extras (optional, passed via extra_body) - - timeout: request timeout in seconds (float, optional, default 120.0) - - stream: whether to use streaming API (bool, optional, default False) - """ - timeout = float(request_config.get("timeout", 120.0)) - - model = request_config["model"] - text_input = request_config["input"] - stream = bool(request_config.get("stream", False)) - voice = request_config.get("voice", None) - - # Standard OpenAI param: use omit when not provided to keep default behavior. - response_format = request_config.get("response_format", omit) - - # Qwen3-TTS custom fields, forwarded via extra_body. - extra_body: dict[str, Any] = {} - # Keep this list aligned with vllm_omni.entrypoints.openai.protocol.audio params. - for key in ("task_type", "ref_text", "ref_audio", "language", "max_new_tokens"): - if key in request_config: - extra_body[key] = request_config[key] - - responses: list[OmniResponse] = [] - - speech_fmt: str | None = None if response_format is omit else str(response_format).lower() - - if request_num == 1: - if stream: - # Use streaming response helper. - with self.client.audio.speech.with_streaming_response.create( - model=model, - input=text_input, - response_format=response_format, - extra_body=extra_body or None, - timeout=timeout, - voice=voice, - ) as resp: - omni_resp = self._process_stream_audio_speech_response(resp, response_format=speech_fmt) - else: - # Non-streaming response. 
- resp = self.client.audio.speech.create( - model=model, - input=text_input, - response_format=response_format, - extra_body=extra_body or None, - timeout=timeout, - voice=voice, - ) - omni_resp = self._process_non_stream_audio_speech_response(resp, response_format=speech_fmt) - - assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) - responses.append(omni_resp) - return responses - else: - # request_num > 1: concurrent requests (use same params as single-request path) - - if stream: - - def _stream_task(): - with self.client.audio.speech.with_streaming_response.create( - model=model, - input=text_input, - response_format=response_format, - extra_body=extra_body or None, - timeout=timeout, - voice=voice, - ) as resp: - return self._process_stream_audio_speech_response(resp, response_format=speech_fmt) - - with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: - futures = [executor.submit(_stream_task) for _ in range(request_num)] - for future in concurrent.futures.as_completed(futures): - omni_resp = future.result() - assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) - responses.append(omni_resp) - else: - with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: - futures = [] - for _ in range(request_num): - future = executor.submit( - self.client.audio.speech.create, - model=model, - input=text_input, - response_format=response_format, - extra_body=extra_body or None, - timeout=timeout, - voice=voice, - ) - futures.append(future) - - for future in concurrent.futures.as_completed(futures): - resp = future.result() - omni_resp = self._process_non_stream_audio_speech_response(resp, response_format=speech_fmt) - assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) - responses.append(omni_resp) - - return responses - - def send_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> 
list[DiffusionResponse]: - """ - Send OpenAI requests for diffusion models. - Args: - request_config: Request configuration dictionary containing parameters like model, messages - request_num: Number of requests to send concurrently, defaults to 1 (single request) - Returns: - list[DiffusionResponse]: List of DiffusionResponse objects containing the response data - """ - responses: list[DiffusionResponse] = [] - stream = request_config.get("stream", False) - modalities = request_config.get("modalities", omit) # Most diffusion models don't require modalities param - extra_body = request_config.get("extra_body", None) - if stream: - raise NotImplementedError("Streaming is not currently implemented for diffusion model e2e test") - if request_num == 1: - # Send single request - chat_completion = self.client.chat.completions.create( - model=request_config.get("model"), - messages=request_config.get("messages"), - extra_body=extra_body, - modalities=modalities, - ) - response = self._process_diffusion_response(chat_completion) - assert_diffusion_response(response, request_config, run_level=self.run_level) - responses.append(response) - else: - # Send concurrent requests - with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: - futures = [] - # Submit all request tasks - for _ in range(request_num): - future = executor.submit( - self.client.chat.completions.create, - model=request_config.get("model"), - messages=request_config.get("messages"), - modalities=modalities, - extra_body=extra_body, - ) - futures.append(future) - # Process completed tasks - for future in concurrent.futures.as_completed(futures): - chat_completion = future.result() - response = self._process_diffusion_response(chat_completion) - assert_diffusion_response(response, request_config, run_level=self.run_level) - responses.append(response) - return responses - - def send_video_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: 
- """ - Send native /v1/videos requests. - """ - if request_num != 1: - raise NotImplementedError("Concurrent video diffusion requests are not currently implemented") - form_data = request_config.get("form_data") - if not isinstance(form_data, dict): - raise ValueError("Video request_config must contain 'form_data'") - normalized_form_data = {key: str(value) for key, value in form_data.items() if value is not None} - files: dict[str, tuple[str, BytesIO, str]] = {} - image_reference = request_config.get("image_reference") - if image_reference: - if image_reference.startswith("data:image"): - header, encoded = image_reference.split(",", 1) - content_type = header.split(";")[0].removeprefix("data:") - extension = content_type.split("/")[-1] - file_data = base64.b64decode(encoded) - files["input_reference"] = (f"reference.{extension}", BytesIO(file_data), content_type) - else: - normalized_form_data["image_reference"] = json.dumps({"image_url": image_reference}) - - result = DiffusionResponse() - start_time = time.perf_counter() - create_url = self._build_url("/v1/videos") - response = requests.post( - create_url, - data=normalized_form_data, - files=files, - headers={"Accept": "application/json"}, - timeout=60, - ) - response.raise_for_status() - job_data = response.json() - video_id = job_data["id"] - self._wait_until_video_completed(video_id) - video_content = self._download_video_content(video_id) - result.success = True - result.videos = [video_content] - result.e2e_latency = time.perf_counter() - start_time - assert_diffusion_response(result, request_config, run_level=self.run_level) - return [result] - - def _wait_until_video_completed( - self, video_id: str, poll_interval_seconds: int = 2, timeout_seconds: int = 300 - ) -> None: - status_url = self._build_url(f"/v1/videos/{video_id}") - deadline = time.monotonic() + timeout_seconds - while time.monotonic() < deadline: - status_resp = requests.get(status_url, headers={"Accept": "application/json"}, timeout=30) - 
status_resp.raise_for_status() - status_data = status_resp.json() - current_status = status_data["status"] - if current_status == "completed": - return - if current_status == "failed": - error_msg = status_data.get("last_error", "Unknown error") - raise RuntimeError(f"Job failed: {error_msg}") - time.sleep(poll_interval_seconds) - raise TimeoutError(f"Video job {video_id} did not complete within {timeout_seconds}s") - - def _download_video_content(self, video_id: str) -> bytes: - download_url = self._build_url(f"/v1/videos/{video_id}/content") - video_resp = requests.get(download_url, stream=True, timeout=60) - video_resp.raise_for_status() - video_bytes = BytesIO() - for chunk in video_resp.iter_content(chunk_size=8192): - if chunk: - video_bytes.write(chunk) - return video_bytes.getvalue() - - def _build_url(self, path: str) -> str: - return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" - - -class OmniRunner: - def __init__( - self, - model_name: str, - seed: int = 42, - stage_init_timeout: int = 600, - batch_timeout: int = 10, - init_timeout: int = 900, - shm_threshold_bytes: int = 65536, - log_stats: bool = False, - stage_configs_path: str | None = None, - **kwargs, - ) -> None: - cleanup_dist_env_and_memory() - run_forced_gpu_cleanup_round() - self.model_name = model_name - self.seed = seed - self._prompt_len_estimate_cache: dict[str, Any] = {} - from vllm_omni.entrypoints.omni import Omni - - self.omni = Omni( - model=model_name, - log_stats=log_stats, - stage_init_timeout=stage_init_timeout, - batch_timeout=batch_timeout, - init_timeout=init_timeout, - shm_threshold_bytes=shm_threshold_bytes, - stage_configs_path=stage_configs_path, - **kwargs, - ) - - def get_default_sampling_params_list(self) -> list[Any]: - if not hasattr(self.omni, "default_sampling_params_list"): - raise AttributeError("Omni.default_sampling_params_list is not available") - return list(self.omni.default_sampling_params_list) - - def _estimate_prompt_len( - self, - 
additional_information: dict[str, Any], - model_name: str, - ) -> int: - """Estimate prompt_token_ids placeholder length for the Talker stage. - - The AR Talker replaces all input embeddings via ``preprocess``, so the - placeholder values are irrelevant but the **length** must match the - embeddings that ``preprocess`` will produce. - """ - _cache = self._prompt_len_estimate_cache - try: - from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import Qwen3TTSConfig - from vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_talker import ( - Qwen3TTSTalkerForConditionalGeneration, - ) - - if model_name not in _cache: - from transformers import AutoTokenizer - - tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left") - cfg = Qwen3TTSConfig.from_pretrained(model_name, trust_remote_code=True) - _cache[model_name] = (tok, getattr(cfg, "talker_config", None)) - - tok, tcfg = _cache[model_name] - task_type = (additional_information.get("task_type") or ["CustomVoice"])[0] - return Qwen3TTSTalkerForConditionalGeneration.estimate_prompt_len_from_additional_information( - additional_information=additional_information, - task_type=task_type, - tokenize_prompt=lambda t: tok(t, padding=False)["input_ids"], - codec_language_id=getattr(tcfg, "codec_language_id", None), - spk_is_dialect=getattr(tcfg, "spk_is_dialect", None), - ) - except Exception as exc: - logger.warning("Failed to estimate prompt length, using fallback 2048: %s", exc) - return 2048 - - def get_omni_inputs( - self, - prompts: list[str] | str, - system_prompt: str | None = None, - audios: PromptAudioInput = None, - images: PromptImageInput = None, - videos: PromptVideoInput = None, - mm_processor_kwargs: dict[str, Any] | None = None, - modalities: list[str] | None = None, - ) -> list[TextPrompt]: - if system_prompt is None: - system_prompt = ( - "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " - "Group, capable of perceiving auditory and 
visual inputs, as well as " - "generating text and speech." - ) - video_padding_token = "<|VIDEO|>" - image_padding_token = "<|IMAGE|>" - audio_padding_token = "<|AUDIO|>" - if "Qwen3-Omni-30B-A3B-Instruct" in self.model_name: - video_padding_token = "<|video_pad|>" - image_padding_token = "<|image_pad|>" - audio_padding_token = "<|audio_pad|>" - elif "Ming-flash-omni" in self.model_name: - video_padding_token = "