diff --git a/.gitignore b/.gitignore index b5e002235e..35dc7571ee 100644 --- a/.gitignore +++ b/.gitignore @@ -263,3 +263,5 @@ tmp_test vllm_omni/_version.py # output files *.wav +# CI overlay yamls materialized from tests/utils.py:_CI_OVERLAYS at test time +tests/.ci_generated/ diff --git a/benchmarks/qwen3-tts/README.md b/benchmarks/qwen3-tts/README.md index 9c01f29aa9..a1c2ebe12f 100644 --- a/benchmarks/qwen3-tts/README.md +++ b/benchmarks/qwen3-tts/README.md @@ -35,8 +35,8 @@ MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only # Use a Voice Clone model MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only -# Use bs16 config for higher throughput -STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs16.yaml bash run_benchmark.sh --async-only +# Use batch size 16 for higher throughput +BATCH_SIZE=16 bash run_benchmark.sh --async-only # Custom GPU, prompt count, concurrency levels GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh @@ -50,7 +50,8 @@ GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh CUDA_VISIBLE_DEVICES=0 python -m vllm_omni.entrypoints.cli.main serve \ "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ --omni --host 127.0.0.1 --port 8000 \ - --stage-configs-path benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-overrides '{"0":{"max_num_seqs":1,"gpu_memory_utilization":0.3,"max_num_batched_tokens":512},"1":{"max_num_seqs":1,"gpu_memory_utilization":0.3,"max_num_batched_tokens":8192}}' \ --trust-remote-code ``` @@ -84,16 +85,19 @@ python benchmarks/qwen3-tts/plot_results.py \ --output results/comparison.png ``` -## Stage Configs +## Batch-size presets -| Config | max_num_seqs | Description | -|--------|:------------:|-------------| -| `vllm_omni/configs/qwen3_tts_bs1.yaml` | 1 | Single-request processing (lowest latency) | -| `vllm_omni/configs/qwen3_tts_bs16.yaml` | 16 | High-throughput concurrent processing | +The bench script loads the bundled production deploy (`vllm_omni/deploy/qwen3_tts.yaml`) and layers per-stage budgets on top via `--stage-overrides`, driven by the `BATCH_SIZE` env var. Each batch size picks compatible per-stage `max_num_seqs`, `max_num_batched_tokens`, and `gpu_memory_utilization` defaults: -All configs use a 2-stage pipeline (Talker -> Code2Wav) with `async_chunk` streaming enabled. The `SharedMemoryConnector` streams codec frames (25-frame chunks with 25-frame context overlap) between stages. +| `BATCH_SIZE` | Description | +|:--:|-------------| +| `1` (default) | Single-request processing (lowest latency) | +| `4` | Moderate-throughput concurrent processing | +| `16` | High-throughput concurrent processing | -The model is specified via the CLI `--model` flag (or `MODEL` env var), so the same configs work for both the 0.6B and 1.7B model variants. +The 2-stage pipeline (Talker -> Code2Wav) runs with `async_chunk` streaming enabled via the prod deploy; the `SharedMemoryConnector` streams codec frames (25-frame chunks with 25-frame context overlap) between stages. + +The model is specified via the CLI `--model` flag (or `MODEL` env var), so the same bench script works for both the 0.6B and 1.7B model variants. 
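+
+For reference, `BATCH_SIZE=16` expands to roughly the following `--stage-overrides` payload (a sketch of the current `run_benchmark.sh` defaults; stage 0 is the Talker, stage 1 is Code2Wav, and exact budgets may evolve):
+
+```json
+{
+  "0": {"max_num_seqs": 16, "gpu_memory_utilization": 0.2, "max_num_batched_tokens": 4096},
+  "1": {"max_num_seqs": 16, "gpu_memory_utilization": 0.2, "max_num_batched_tokens": 32768}
+}
+```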
## Metrics diff --git a/benchmarks/qwen3-tts/run_benchmark.sh b/benchmarks/qwen3-tts/run_benchmark.sh index 283b6b844c..8c3e46903c 100755 --- a/benchmarks/qwen3-tts/run_benchmark.sh +++ b/benchmarks/qwen3-tts/run_benchmark.sh @@ -26,8 +26,8 @@ # # Use Voice Clone model # MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only # -# # Use batch_size=4 config: -# STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs4.yaml bash run_benchmark.sh --async-only +# # Use batch_size=4: +# BATCH_SIZE=4 bash run_benchmark.sh --async-only # # Environment variables: # GPU_DEVICE - GPU index to use (default: 0) @@ -35,9 +35,9 @@ # CONCURRENCY - Space-separated concurrency levels (default: "1 4 10") # MODEL - Model name (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice) # PORT - Server port (default: 8000) -# GPU_MEM_TALKER - gpu_memory_utilization for talker stage (default: 0.3) -# GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.2) -# STAGE_CONFIG - Path to stage config YAML (default: configs/qwen3_tts_bs1.yaml) +# BATCH_SIZE - Per-stage ``max_num_seqs`` for both talker and code2wav (default: 1) +# GPU_MEM_TALKER - gpu_memory_utilization for talker stage (default: 0.3 at bs=1, else 0.2) +# GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.3 at bs=1, else 0.2) # TASK_TYPE - Task type: CustomVoice, VoiceDesign, Base (default: CustomVoice) set -euo pipefail @@ -51,14 +51,36 @@ NUM_PROMPTS="${NUM_PROMPTS:-50}" CONCURRENCY="${CONCURRENCY:-1 4 10}" MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice}" PORT="${PORT:-8000}" -GPU_MEM_TALKER="${GPU_MEM_TALKER:-0.3}" -GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV:-0.2}" +BATCH_SIZE="${BATCH_SIZE:-1}" +DEFAULT_MEM=$([ "${BATCH_SIZE}" = "1" ] && echo "0.3" || echo "0.2") +GPU_MEM_TALKER="${GPU_MEM_TALKER:-${DEFAULT_MEM}}" +GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV:-${DEFAULT_MEM}}" NUM_WARMUPS="${NUM_WARMUPS:-3}" -STAGE_CONFIG="${STAGE_CONFIG:-vllm_omni/configs/qwen3_tts_bs1.yaml}" +DEPLOY_CONFIG="vllm_omni/deploy/qwen3_tts.yaml" RESULT_DIR="${SCRIPT_DIR}/results" TIMESTAMP="$(date +%Y%m%d_%H%M%S)" TASK_TYPE="${TASK_TYPE:-CustomVoice}" +# Build --stage-overrides JSON from BATCH_SIZE + GPU_MEM_*. +STAGE_OVERRIDES=$( + BATCH_SIZE="${BATCH_SIZE}" \ + GPU_MEM_TALKER="${GPU_MEM_TALKER}" \ + GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV}" \ + python - <<'PYEOF' +import json, os +bs = int(os.environ["BATCH_SIZE"]) +mem_t = float(os.environ["GPU_MEM_TALKER"]) +mem_c = float(os.environ["GPU_MEM_CODE2WAV"]) +# Prefill budget grows with batch size on both stages. 
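+# (talker: 512 tokens through bs=4, then 4096; code2wav: 8192 through bs=4, then 32768)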
+talker_batched = 512 if bs <= 4 else 4096 +code2wav_batched = 8192 if bs <= 4 else 32768 +print(json.dumps({ + "0": {"max_num_seqs": bs, "gpu_memory_utilization": mem_t, "max_num_batched_tokens": talker_batched}, + "1": {"max_num_seqs": bs, "gpu_memory_utilization": mem_c, "max_num_batched_tokens": code2wav_batched}, +})) +PYEOF +) + # Parse args RUN_ASYNC=true RUN_HF=true @@ -75,41 +97,27 @@ mkdir -p "${RESULT_DIR}" echo "============================================================" echo " Qwen3-TTS Benchmark" echo "============================================================" -echo " GPU: ${GPU_DEVICE}" -echo " Model: ${MODEL}" -echo " Prompts: ${NUM_PROMPTS}" -echo " Concurrency: ${CONCURRENCY}" -echo " Port: ${PORT}" -echo " Stage config: ${STAGE_CONFIG}" -echo " Results: ${RESULT_DIR}" -echo " Task type: ${TASK_TYPE}" +echo " GPU: ${GPU_DEVICE}" +echo " Model: ${MODEL}" +echo " Prompts: ${NUM_PROMPTS}" +echo " Concurrency: ${CONCURRENCY}" +echo " Port: ${PORT}" +echo " Deploy config: ${DEPLOY_CONFIG}" +echo " Batch size: ${BATCH_SIZE}" +echo " GPU mem T/C: ${GPU_MEM_TALKER} / ${GPU_MEM_CODE2WAV}" +echo " Results: ${RESULT_DIR}" +echo " Task type: ${TASK_TYPE}" echo "============================================================" -# Prepare stage config with correct GPU device and memory settings -prepare_config() { - local config_template="$1" - local config_name="$2" - local output_path="${RESULT_DIR}/${config_name}_stage_config.yaml" - - # Use sed to patch GPU device and memory utilization - sed \ - -e "s/devices: \"0\"/devices: \"${GPU_DEVICE}\"/g" \ - -e "s/gpu_memory_utilization: 0.3/gpu_memory_utilization: ${GPU_MEM_TALKER}/g" \ - -e "s/gpu_memory_utilization: 0.2/gpu_memory_utilization: ${GPU_MEM_CODE2WAV}/g" \ - "${config_template}" > "${output_path}" - - echo "${output_path}" -} - # Start server and wait for it to be ready start_server() { - local stage_config="$1" - local config_name="$2" + local config_name="$1" local log_file="${RESULT_DIR}/server_${config_name}_${TIMESTAMP}.log" echo "" echo "Starting server with config: ${config_name}" - echo " Stage config: ${stage_config}" + echo " Deploy config: ${DEPLOY_CONFIG}" + echo " Stage overrides: ${STAGE_OVERRIDES}" echo " Log file: ${log_file}" VLLM_WORKER_MULTIPROC_METHOD=spawn \ @@ -118,7 +126,8 @@ start_server() { --omni \ --host 127.0.0.1 \ --port "${PORT}" \ - --stage-configs-path "${stage_config}" \ + --deploy-config "${DEPLOY_CONFIG}" \ + --stage-overrides "${STAGE_OVERRIDES}" \ --stage-init-timeout 120 \ --trust-remote-code \ --disable-log-stats \ @@ -175,17 +184,13 @@ trap 'stop_server' EXIT # Run benchmark for a given config run_bench() { local config_name="$1" - local config_template="$2" echo "" echo "============================================================" echo " Benchmarking: ${config_name}" echo "============================================================" - local stage_config - stage_config=$(prepare_config "${config_template}" "${config_name}") - - start_server "${stage_config}" "${config_name}" + start_server "${config_name}" # Convert concurrency string to args local conc_args="" @@ -212,7 +217,7 @@ run_bench() { # Run vllm-omni benchmark if [ "${RUN_ASYNC}" = true ]; then - run_bench "async_chunk" "${SCRIPT_DIR}/${STAGE_CONFIG}" + run_bench "async_chunk" fi # Run HuggingFace baseline benchmark diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml deleted file mode 100644 index c348e6714d..0000000000 --- 
a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Qwen3-TTS batch_size=1 config (streaming with async_chunk) -# 2-stage pipeline: Talker -> Code2Wav -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - max_num_seqs: 1 - model_stage: qwen3_tts - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - max_num_seqs: 1 - model_stage: code2wav - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 8192 - max_model_len: 32768 - engine_input_source: [0] - final_output: true - final_output_type: audio - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - codec_streaming: true - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - codec_left_context_frames: 25 - - edges: - - from: 0 - to: 1 diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml deleted file mode 100644 index 6c3fbdce66..0000000000 --- a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# Qwen3-TTS max_num_seqs=16 config (streaming with async_chunk) -# High-throughput concurrent request processing -# 2-stage pipeline: Talker -> Code2Wav -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - max_num_seqs: 16 - model_stage: qwen3_tts - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 4096 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 
0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - max_num_seqs: 16 - model_stage: code2wav - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - max_num_batched_tokens: 16384 - max_model_len: 32768 - engine_input_source: [0] - final_output: true - final_output_type: audio - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - codec_streaming: true - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - codec_left_context_frames: 25 - - edges: - - from: 0 - to: 1 diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml deleted file mode 100644 index 32f3f4dbac..0000000000 --- a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# Qwen3-TTS batch_size=4 config (streaming with async_chunk) -# Enables concurrent request processing -# 2-stage pipeline: Talker -> Code2Wav -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - max_num_seqs: 4 - model_stage: qwen3_tts - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - max_num_seqs: 4 - model_stage: code2wav - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - max_num_batched_tokens: 8192 - max_model_len: 32768 - engine_input_source: [0] - final_output: true - final_output_type: audio - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - connectors: - 
connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - codec_streaming: true - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - codec_left_context_frames: 25 - - edges: - - from: 0 - to: 1 diff --git a/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh b/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh index 61cf7757a9..0ede359ea3 100755 --- a/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh +++ b/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh @@ -31,8 +31,11 @@ PORT_OFF="${PORT_OFF:-8001}" RESULT_DIR="${SCRIPT_DIR}/results" TIMESTAMP="$(date +%Y%m%d_%H%M%S)" -STAGE_CONFIG_ON="vllm_omni/model_executor/stage_configs/qwen3_tts.yaml" -STAGE_CONFIG_OFF="vllm_omni/model_executor/stage_configs/qwen3_tts_no_async_chunk.yaml" +# The bundled ``vllm_omni/deploy/qwen3_tts.yaml`` is auto-loaded by the model +# registry; no ``--deploy-config`` flag needed on the default (ON) path. +# async_chunk OFF is selected by the ``--no-async-chunk`` CLI flag — +# the single ``qwen3_tts`` pipeline dispatches to the end-to-end codec +# processor when ``deploy.async_chunk`` is false. mkdir -p "${RESULT_DIR}" @@ -77,7 +80,6 @@ wait_for_server() { echo "" echo "[Phase 1] Starting async_chunk ON server on port ${PORT_ON}..." CUDA_VISIBLE_DEVICES=${GPU_DEVICE} vllm-omni serve "${MODEL}" \ - --stage-configs-path "${STAGE_CONFIG_ON}" \ --host 0.0.0.0 --port "${PORT_ON}" \ --trust-remote-code --enforce-eager --omni \ > "${RESULT_DIR}/server_on_${TIMESTAMP}.log" 2>&1 & @@ -104,7 +106,7 @@ sleep 5 echo "" echo "[Phase 2] Starting async_chunk OFF server on port ${PORT_OFF}..." CUDA_VISIBLE_DEVICES=${GPU_DEVICE} vllm-omni serve "${MODEL}" \ - --stage-configs-path "${STAGE_CONFIG_OFF}" \ + --no-async-chunk \ --host 0.0.0.0 --port "${PORT_OFF}" \ --trust-remote-code --enforce-eager --omni \ > "${RESULT_DIR}/server_off_${TIMESTAMP}.log" 2>&1 & diff --git a/docs/configuration/README.md b/docs/configuration/README.md index b5761a7f1b..390176e9ce 100644 --- a/docs/configuration/README.md +++ b/docs/configuration/README.md @@ -6,7 +6,7 @@ For options within a vLLM Engine. Please refer to [vLLM Configuration](https://d Currently, the main options are maintained by stage configs for each model. -For specific example, please refer to [Qwen2.5-omni stage config](stage_configs/qwen2_5_omni.yaml) +For a specific example, see the [Qwen2.5-Omni deploy config](gh-file:vllm_omni/deploy/qwen2_5_omni.yaml). The matching frozen pipeline topology lives at [vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py](gh-file:vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py). For introduction, please check [Introduction for stage config](./stage_configs.md) diff --git a/docs/configuration/pd_disaggregation.md b/docs/configuration/pd_disaggregation.md index 9d07ee18f2..9196bdb024 100644 --- a/docs/configuration/pd_disaggregation.md +++ b/docs/configuration/pd_disaggregation.md @@ -11,7 +11,7 @@ deployment-specific values usually change per environment: - connector backend and connector ports - connector IPs or bootstrap addresses -Start from the [default Qwen3-Omni stage config](gh-file:vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml) +Start from the [default Qwen3-Omni stage config](gh-file:vllm_omni/deploy/qwen3_omni_moe.yaml) and copy it to your own file, for example `qwen3_omni_pd.yaml`. Then apply the changes below. 
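+
+For example (a sketch; the paths assume a source checkout, and the connector edits are the ones described below):
+
+```bash
+cp vllm_omni/deploy/qwen3_omni_moe.yaml qwen3_omni_pd.yaml
+# ...edit connector backends, ports, and IPs per the sections below...
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --deploy-config qwen3_omni_pd.yaml
+```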
diff --git a/docs/configuration/stage_configs.md b/docs/configuration/stage_configs.md
index e3b6de8487..55b4053cc7 100644
--- a/docs/configuration/stage_configs.md
+++ b/docs/configuration/stage_configs.md
@@ -3,7 +3,147 @@
 In vLLM-Omni, the target model is separated into multiple stages, which are processed by different LLMEngines, DiffusionEngines or other types of engines. Depending on different types of stages, such as Autoregressive (AR) stage or Diffusion transformer (DiT) stage, each can choose corresponding schedulers, model workers to load with the Engines in a plug-in fashion.
 
 !!! note
-    Default stage config YAMLs (for example, `vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml` and `vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml`) are bundled and loaded automatically when `stage_configs_path` is not provided. They have been verified to work on 1xH100 for Qwen2.5-Omni and 2xH100 for Qwen3-Omni.
+    Default deploy config YAMLs (for example, `vllm_omni/deploy/qwen2_5_omni.yaml`, `vllm_omni/deploy/qwen3_omni_moe.yaml`, and `vllm_omni/deploy/qwen3_tts.yaml`) are bundled and loaded automatically when neither `--stage-configs-path` nor `--deploy-config` is provided — the model registry resolves the right pipeline + deploy YAML by `model_type`. The bundled defaults have been verified on 1xH100 for Qwen2.5-Omni and 2xH100 for Qwen3-Omni. Models that have not yet migrated to the new schema continue to use the legacy `vllm_omni/model_executor/stage_configs/<model_type>.yaml` files via `--stage-configs-path`.
+
+## New deploy schema reference
+
+The new deploy schema lives under `vllm_omni/deploy/` and is paired with a frozen `PipelineConfig` registered by the model's `pipeline.py`. Each deploy YAML has these top-level fields:
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `base_config` | str (path) | optional | — | Overlay parent (relative or absolute). `stages:` / `platforms:` deep-merged by stage_id; other scalars overlay-wins. Intended for user-authored overlays; prod yamls stay flat. |
+| `async_chunk` | bool | optional | `true` | Enable chunked streaming between stages. Pin to `false` if the pipeline runs end-to-end. |
+| `connectors` | dict | optional | `null` | Named connector specs (`{name, extra}`). Referenced by each stage's `input_connectors` / `output_connectors`. See [Connector schema](#connector-schema). |
+| `edges` | list | optional | `null` | Explicit edge list for the KV transfer graph. Auto-derived from stage inputs if omitted. |
+| `stages` | list | required | — | Per-stage engine args + wiring (see [Stage fields](#stage-fields)). |
+| `platforms` | dict | optional | `null` | Keyed by `npu` / `rocm` / `xpu`, each contains a `stages:` list with per-platform overrides applied on top of the CUDA defaults. |
+| `pipeline` | str | optional | `null` | Override the auto-detected pipeline registry key (used for structural variants like `qwen2_5_omni_thinker_only`). |
+| `trust_remote_code` | bool | optional | `true` | **Pipeline-wide.** Trust HF remote code on model load; applies to every stage. |
+| `distributed_executor_backend` | str | optional | `"mp"` | **Pipeline-wide.** Executor backend (`"mp"` or `"ray"`). |
+| `dtype` | str \| null | optional | `null` | **Pipeline-wide.** Model dtype for every stage. |
+| `quantization` | str \| null | optional | `null` | **Pipeline-wide.** Quantization method for every stage. |
+| `enable_prefix_caching` | bool | optional | `false` | **Pipeline-wide.** Prefix cache toggle applied to every stage. |
+| `enable_chunked_prefill` | bool \| null | optional | `null` | **Pipeline-wide.** Chunked prefill toggle applied to every stage. |
+| `data_parallel_size` | int | optional | `1` | **Pipeline-wide.** DP degree for every stage. |
+| `pipeline_parallel_size` | int | optional | `1` | **Pipeline-wide.** PP degree for every stage. |
+
+### Stage fields
+
+Each entry under `stages:` accepts any `StageDeployConfig` field directly (no nested `engine_args:`). Only fields whose value legitimately varies across stages live here; pipeline-wide settings (trust_remote_code, distributed_executor_backend, dtype, quantization, prefix/chunked prefill, DP/PP sizes) are declared at the top level and applied to every stage. Unknown keys fall through to `engine_extras:` and are forwarded to the engine.
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `stage_id` | int | required | — | Stage identity; matched against `PipelineConfig.stages[*].stage_id`. |
+| `max_num_seqs` | int | optional | `64` | Max concurrent sequences per stage. |
+| `gpu_memory_utilization` | float | optional | `0.9` | Per-stage memory budget. |
+| `tensor_parallel_size` | int | optional | `1` | TP degree for this stage. |
+| `enforce_eager` | bool | optional | `false` | Disable CUDA graphs. |
+| `max_num_batched_tokens` | int | optional | `32768` | Prefill budget. |
+| `max_model_len` | int \| null | optional | `null` | Per-stage context length (auto-sets `VLLM_ALLOW_LONG_MAX_MODEL_LEN=1` when larger than HF default). |
+| `async_scheduling` | bool \| null | optional | `null` | Per-stage async scheduling toggle. |
+| `devices` | str | optional | `"0"` | `CUDA_VISIBLE_DEVICES`-style device list. |
+| `output_connectors` | dict \| null | optional | `null` | Keyed by `to_stage_<id>`; values are names registered under top-level `connectors:`. |
+| `input_connectors` | dict \| null | optional | `null` | Keyed by `from_stage_<id>`; values are names registered under top-level `connectors:`. |
+| `default_sampling_params` | dict \| null | optional | `null` | Baseline sampling params. Deep-merged with pipeline `sampling_constraints` (pipeline wins). |
+| `engine_extras` | dict | optional | `{}` | Catch-all for keys not listed above; deep-merged across overlays. Also carries per-stage overrides of pipeline-wide settings (e.g. stage-specific `dtype`). |
+
+### Connector schema
+
+Each entry under top-level `connectors:` follows this shape:
+
+```yaml
+connectors:
+  <connector_name>:
+    name: <ConnectorClass>  # required — class registered in vllm_omni.distributed
+    extra:                  # optional — forwarded to the connector's __init__
+      <key>: <value>
+      ...
+```
+
+| Connector class | Use case | `extra` keys |
+|-----------------|----------|--------------|
+| `SharedMemoryConnector` | Same-host KV transfer between stages (default for bundled YAMLs). | `shm_threshold_bytes` (int, default `65536`). |
+| `MooncakeStoreConnector` | Cross-host KV transfer over TCP. Required for multi-node deployments. | `host`, `metadata_server`, `master`, `segment` (int bytes), `localbuf` (int bytes), `proto` (`"tcp"` / `"rdma"`). |
+
+A stage references a connector by name in its `input_connectors` / `output_connectors`:
+
+```yaml
+connectors:
+  shm:
+    name: SharedMemoryConnector
+
+stages:
+  - stage_id: 0
+    output_connectors: {to_stage_1: shm}
+  - stage_id: 1
+    input_connectors: {from_stage_0: shm}
+```
+
+### CLI flags introduced in this refactor
+
+| Flag | Description |
+|------|-------------|
+| `--deploy-config PATH` | Load a new-schema deploy YAML. Takes precedence over `--stage-configs-path`. **Optional** — when omitted, the bundled `vllm_omni/deploy/<model_type>.yaml` is auto-loaded by the model registry. |
+| `--stage-overrides JSON` | Per-stage JSON overrides, e.g. `'{"0":{"gpu_memory_utilization":0.5}}'`. Per-stage values always win over global flags. |
+| `--async-chunk` / `--no-async-chunk` | Flip the deploy YAML's `async_chunk:` bool. Unset (default) leaves the YAML value in force. |
+| `--stage-configs-path` | **Deprecated.** Accepts legacy `stage_args` yamls and (auto-detected) new deploy yamls; emits a deprecation warning. Migrate to `--deploy-config`. To be removed in a follow-up PR. |
+
+### Precedence
+
+From highest to lowest:
+
+1. Per-stage flags (`--stage-overrides` JSON, `--stage-<id>-<field>` if registered)
+2. Explicit global CLI flags (`--gpu-memory-utilization 0.85`, etc.)
+3. Platform section (`platforms.npu.stages`, etc.) on top of the base `stages:`
+4. Overlay YAML (via `base_config:`) on top of the base YAML
+5. Parser defaults
+
+### Worked override example
+
+Starting from the bundled `vllm_omni/deploy/qwen3_omni_moe.yaml`:
+
+```yaml
+# vllm_omni/deploy/qwen3_omni_moe.yaml (excerpt)
+async_chunk: true
+stages:
+  - stage_id: 0
+    gpu_memory_utilization: 0.9
+    max_num_seqs: 32
+  - stage_id: 1
+    gpu_memory_utilization: 0.7
+    max_num_seqs: 16
+```
+
+A user-authored overlay that inherits the base and overrides only stage 1:
+
+```yaml
+# my_overrides.yaml
+base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml
+stages:
+  - stage_id: 1
+    gpu_memory_utilization: 0.5  # smaller GPU
+```
+
+Launched with both an explicit global flag and a per-stage override:
+
+```bash
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \
+    --deploy-config my_overrides.yaml \
+    --max-model-len 16384 \
+    --stage-overrides '{"0": {"max_num_seqs": 8}}'
+```
+
+Effective config per stage after the merge:
+
+| Stage | Field | Final value | Source |
+|-------|-------|-------------|--------|
+| 0 | `gpu_memory_utilization` | `0.9` | base YAML (overlay didn't touch stage 0) |
+| 0 | `max_num_seqs` | `8` | per-stage CLI (`--stage-overrides`) — wins over base `32` |
+| 0 | `max_model_len` | `16384` | global CLI |
+| 1 | `gpu_memory_utilization` | `0.5` | overlay YAML — wins over base `0.7` |
+| 1 | `max_num_seqs` | `16` | base YAML (overlay didn't touch this field) |
+| 1 | `max_model_len` | `16384` | global CLI |
+| 2 | (all defaults) | — | base YAML (no overrides apply) |
 
 Therefore, as a core part of vLLM-Omni, the stage configs for a model have several main functions:
 
@@ -35,7 +35,7 @@ stage_args:
   - stage_id: 0 # mark the unique id for each stage
     runtime: # The disaggregated configuration
       process: true # Run this stage in a separate process
-      devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0" # Logical device index for this stage (mapped through CUDA_VISIBLE_DEVICES / ASCEND_RT_VISIBLE_DEVICES if set)
     engine_args: # Engine arguments for a certain engine
       model_stage: thinker
       max_num_seqs: 1
@@ -151,7 +291,9 @@
 Default: `true`
 
 #### `runtime.devices`
 
-Visible
devices for this stage, specified as a string. This controls which GPU devices are available to the stage process, similar to setting `CUDA_VISIBLE_DEVICES` or using `torch.cuda.set_device()`. For example, `"0"` uses GPU 0, `"1"` uses GPU 1, and `"0,1"` makes both GPUs 0 and 1 visible. +Logical device indices for this stage, specified as a string. Values are **logical indices** (`0`, `1`, `2`, ...) — not physical GPU IDs — and are mapped through the platform's visibility env var (`CUDA_VISIBLE_DEVICES` on CUDA, `ASCEND_RT_VISIBLE_DEVICES` on NPU) before being applied via `torch.cuda.set_device()` (or the equivalent). + +Example: if `CUDA_VISIBLE_DEVICES=0,2,4` is set in the environment, then `devices: "0"` selects physical GPU 0 (the first visible), `devices: "1"` selects physical GPU 2, and `devices: "0,1"` makes physical GPUs 0 and 2 available to the stage. If no visibility env var is set, logical and physical IDs coincide. Default: `"0"` diff --git a/docs/configuration/stage_configs/qwen2_5_omni.yaml b/docs/configuration/stage_configs/qwen2_5_omni.yaml deleted file mode 100644 index 2b3bd8e1c0..0000000000 --- a/docs/configuration/stage_configs/qwen2_5_omni.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# stage config for running qwen2.5-omni with AsyncOmniEngine + Orchestrator runtime. -stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - stage_id: 2 - runtime: - process: true - devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage 
edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index b0428ddd7d..2452ef5d4a 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -231,8 +231,7 @@ vllm_omni/ tests/ │ ├── test_qwen3_omni_expansion.py │ ├── test_mimo_audio.py │ ├── test_image_gen_edit.py - │ ├── test_images_generations_lora.py - │ └── stage_configs/ + │ └── test_images_generations_lora.py └── offline_inference/ ✅ ├── test_qwen2_5_omni.py ├── test_qwen3_omni.py @@ -248,11 +247,12 @@ vllm_omni/ tests/ ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py ├── test_sequence_parallel.py - └── stage_configs/ - ├── qwen2_5_omni_ci.yaml - ├── qwen3_omni_ci.yaml - ├── bagel_*.yaml - └── npu/, rocm/, etc. + └── stage_configs/ (legacy schema, still + ├── bagel_*.yaml present for unmigrated + └── npu/, rocm/, etc. models) + +# Migrated models (qwen3_omni_moe, qwen2_5_omni, qwen3_tts) live under +# vllm_omni/deploy/ instead — see docs/configuration/stage_configs.md. ``` diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 69d5b16d7a..392f004721 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -135,8 +135,7 @@ vllm_omni/ tests/ │ ├── test_qwen3_omni_expansion.py │ ├── test_mimo_audio.py │ ├── test_image_gen_edit.py - │ ├── test_images_generations_lora.py - │ └── stage_configs/ + │ └── test_images_generations_lora.py └── offline_inference/ ✅ ├── test_qwen2_5_omni.py ├── test_qwen3_omni.py @@ -153,11 +152,12 @@ vllm_omni/ tests/ ├── test_diffusion_lora.py ├── test_sequence_parallel.py ├── test_qwen_image_edit_expansion.py - └── stage_configs/ - ├── qwen2_5_omni_ci.yaml - ├── qwen3_omni_ci.yaml - ├── bagel_*.yaml + └── stage_configs/ (legacy schema, still present + ├── bagel_*.yaml for unmigrated models) └── npu/, rocm/, etc. + +# Migrated models (qwen3_omni_moe, qwen2_5_omni, qwen3_tts) live under +# vllm_omni/deploy/ instead — see docs/configuration/stage_configs.md. examples/ tests │ └── examples ├── online_serving/ → ├── online_serving/ @@ -229,6 +229,7 @@ from tests.conftest import ( generate_synthetic_video, merge_base64_and_convert_to_text, ) +from tests.utils import get_deploy_config_path from vllm_omni.platforms import current_omni_platform # Edit: model name and stage config path @@ -236,7 +237,7 @@ models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] #If you use the default configuration file, you can directly use the following address. def get_default_config(): - return str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml") + return get_deploy_config_path("ci/qwen3_omni_moe.yaml") #If you need to modify the configuration file, you can use modify_stage_config. def get_chunk_config(): diff --git a/docs/contributing/model/adding_omni_model.md b/docs/contributing/model/adding_omni_model.md index a0619e3381..478e77c7d5 100644 --- a/docs/contributing/model/adding_omni_model.md +++ b/docs/contributing/model/adding_omni_model.md @@ -313,7 +313,7 @@ The registry uses lazy loading, so the model class is imported only when needed. ## Stage Configuration -Create a YAML configuration file in `vllm_omni/model_executor/stage_configs/`. 
For a complete example, see the [Qwen3-Omni configuration file](gh-file:vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml). +Create a YAML configuration file in `vllm_omni/deploy/`. For a complete example, see the [Qwen3-Omni configuration file](gh-file:vllm_omni/deploy/qwen3_omni_moe.yaml). ### Key Configuration Fields @@ -614,7 +614,7 @@ For a complete reference implementation, see: - **Thinker**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py` - **Talker**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py` - **Code2Wav**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_code2wav.py` -- **Stage config**: `vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml` +- **Stage config**: `vllm_omni/deploy/qwen3_omni_moe.yaml` - **Input processors**: `vllm_omni/model_executor/stage_input_processors/qwen3_omni.py` - **Registry**: `vllm_omni/model_executor/models/registry.py` - **Testing**: `vllm_omni/tests/e2e/offline_inference/test_qwen3_omni.py` diff --git a/docs/contributing/model/adding_tts_model.md b/docs/contributing/model/adding_tts_model.md index e48ae5049f..66da1749ce 100644 --- a/docs/contributing/model/adding_tts_model.md +++ b/docs/contributing/model/adding_tts_model.md @@ -120,8 +120,18 @@ vllm_omni/model_executor/stage_configs/ | `models/qwen3_tts/qwen3_tts.py` | Unified model class | | `models/qwen3_tts/qwen3_tts_code_predictor_vllm.py` | Stage 0 - optimized AR | | `models/qwen3_tts/qwen3_tts_code2wav.py` | Stage 1 - decoder | -| `stage_configs/qwen3_tts.yaml` | Stage config (async_chunk enabled) | -| `stage_configs/qwen3_tts_batch.yaml` | Batch mode config | +| `deploy/qwen3_tts.yaml` (new schema) | Deploy config (async_chunk enabled) — paired with `models/qwen3_tts/pipeline.py` for the frozen topology | + +> **Chunked vs end-to-end modes**: `qwen3_tts` registers a single +> pipeline whose stage 1 declares alternate processor functions — an +> `async_chunk_process_next_stage_input_func` (per-chunk streaming, used +> when `deploy.async_chunk=True`) and a `sync_process_input_func` +> (batch-end, used when `deploy.async_chunk=False`). The loader selects +> one at merge time based on the bool, so `--no-async-chunk` alone +> switches modes — no variant yaml or variant pipeline registration is +> needed. Pipelines that only make sense in one mode (e.g. +> `qwen3_omni_moe` is always chunked) can keep using the unconditional +> `custom_process_*` fields. 
| `stage_input_processors/qwen3_tts.py` | Stage transition processors | ## Step-by-Step Implementation @@ -574,7 +584,8 @@ Adding a TTS model to vLLM-Omni involves: | `models/qwen3_tts/qwen3_tts.py` | Unified model class | | `models/qwen3_tts/qwen3_tts_code_predictor_vllm.py` | AR stage with vLLM fused ops | | `models/qwen3_tts/qwen3_tts_code2wav.py` | Decoder stage with `chunked_decode_streaming()` | -| `stage_configs/qwen3_tts.yaml` | Stage configuration | +| `models/qwen3_tts/pipeline.py` | Frozen pipeline topology (registered at import time) | +| `deploy/qwen3_tts.yaml` | Deploy config (user-editable, async_chunk + SharedMemoryConnector) | | `stage_input_processors/qwen3_tts.py` | Stage transition processors | For more information, see: diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 418fb707ae..6c209e5659 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -127,10 +127,11 @@ Multi-stage omni serving: ```bash vllm serve Qwen/Qwen2.5-Omni-7B \ --omni \ - --stage-configs-path qwen2_5_omni.yaml \ --port 8091 ``` +(The default deploy config at `vllm_omni/deploy/qwen2_5_omni.yaml` is loaded automatically. Pass `--deploy-config /path/to/custom.yaml` to override.) + Single-stage diffusion serving with torch profiler: ```bash diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md index ecbe8d9ac9..733811081a 100644 --- a/docs/serving/speech_api.md +++ b/docs/serving/speech_api.md @@ -15,7 +15,7 @@ Each server instance runs a single model (specified at startup via `vllm serve < ```bash # Qwen3-TTS: CustomVoice model (predefined speakers) vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -300,7 +300,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ ```bash # Start server with VoiceDesign model first vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -322,7 +322,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ ```bash # Start server with Base model first vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -517,15 +517,16 @@ for result in response.json()["results"]: All items are fanned out to `generate()` concurrently. The engine's stage worker automatically batches them up to the configured `max_batch_size` and queues the rest — no client-side throttling needed. -For best throughput, use a batch-optimized stage config with `max_batch_size > 1`: +For best throughput, set both stages' `max_num_seqs` to ≥4 via `--stage-overrides`: ```bash vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml \ - --omni --port 8091 --trust-remote-code --enforce-eager + --omni --port 8091 --trust-remote-code --enforce-eager \ + --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2}, + "1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' ``` -The default `qwen3_tts.yaml` uses `max_batch_size: 1` (single request). The `qwen3_tts_batch.yaml` config sets `max_batch_size: 4` for ~4x throughput. 
+The bundled `qwen3_tts.yaml` uses `max_num_seqs: 1` (single request) on both stages. Bumping to 4 yields roughly 4× throughput on the talker and lets stage 1 batch chunks across in-flight requests. ## Supported Models @@ -617,7 +618,7 @@ Enable debug logging: ```bash vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ diff --git a/docs/user_guide/examples/offline_inference/qwen3_tts.md b/docs/user_guide/examples/offline_inference/qwen3_tts.md index 4ece5219d7..7226ac1fe4 100644 --- a/docs/user_guide/examples/offline_inference/qwen3_tts.md +++ b/docs/user_guide/examples/offline_inference/qwen3_tts.md @@ -144,13 +144,13 @@ completes. This demonstrates that audio data is available progressively rather t ## Batched Decoding -The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, provide a stage config with `max_num_seqs > 1` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. +The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, set `max_num_seqs > 1` on both stages via `--stage-overrides` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. ``` python end2end.py --query-type CustomVoice \ --txt-prompts benchmark_prompts.txt \ --batch-size 4 \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml + --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2},"1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' ``` **Important:** `--batch-size` must match a CUDA graph capture size (1, 2, 4, 8, 16...) because the Talker's code predictor KV cache is sized to `max_num_seqs`, and CUDA graphs pad the batch to the next capture size. Both stages need `max_num_seqs >= batch_size` in the stage config for batching to take effect. If only stage 1 has a higher `max_num_seqs`, it won't help — stage 1 can only batch chunks from requests that are in-flight simultaneously, which requires stage 0 to also process multiple requests concurrently. 
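+
+A larger-batch run follows the same pattern (a sketch; assumes both stages have memory headroom for `max_num_seqs: 8`, and 8 is a valid capture size so no padding is wasted):
+
+```
+python end2end.py --query-type CustomVoice \
+    --txt-prompts benchmark_prompts.txt \
+    --batch-size 8 \
+    --stage-overrides '{"0":{"max_num_seqs":8,"gpu_memory_utilization":0.2},"1":{"max_num_seqs":8,"gpu_memory_utilization":0.2}}'
+```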
diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md
index 6f6d9ae4a9..611eb6fd3f 100644
--- a/docs/user_guide/examples/online_serving/qwen3_omni.md
+++ b/docs/user_guide/examples/online_serving/qwen3_omni.md
@@ -18,12 +18,12 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
 ```
 
 If you want to open async chunking for qwen3-omni, launch the server with command below
 
 ```bash
-vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config vllm_omni/deploy/qwen3_omni_moe.yaml
 ```
 
 If you have custom stage configs file, launch the server with command below
 
 ```bash
-vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /path/to/deploy_config_file
 ```
 
 ### Send Multi-modal Request
@@ -187,7 +187,7 @@ The script supports the following arguments:
 - `--model`: Model name/path (default: Qwen/Qwen3-Omni-30B-A3B-Instruct)
 - `--server-port`: Port for vLLM server (default: 8091)
 - `--gradio-port`: Port for Gradio demo (default: 7861)
-- `--stage-configs-path`: Path to custom stage configs YAML file (optional)
+- `--deploy-config`: Path to custom deploy config YAML file (optional)
 - `--server-host`: Host for vLLM server (default: 0.0.0.0)
 - `--gradio-ip`: IP for Gradio demo (default: 127.0.0.1)
 - `--share`: Share Gradio demo publicly (creates a public link)
@@ -202,7 +202,7 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
 
 If you have custom stage configs file:
 
 ```bash
-vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /path/to/deploy_config_file
 ```
 
 **Step 2: Run the Gradio demo**
diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md
index 4e632d4c28..95f234f02d 100644
--- a/docs/user_guide/examples/online_serving/qwen3_tts.md
+++ b/docs/user_guide/examples/online_serving/qwen3_tts.md
@@ -58,7 +58,7 @@ Then open http://localhost:7860 in your browser.
```bash # CustomVoice model (predefined speakers) vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -66,7 +66,7 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ # VoiceDesign model vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -74,7 +74,7 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ # Base model (voice cloning) vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index d8f1898ec9..dfe124700d 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -320,14 +320,7 @@ def main(args): query_result = query_func(audio_path=audio_path, sampling_rate=sampling_rate) else: query_result = query_func() - omni = Omni( - model=model_name, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - batch_timeout=args.batch_timeout, - init_timeout=args.init_timeout, - shm_threshold_bytes=args.shm_threshold_bytes, - ) + omni = Omni.from_cli_args(args, model=model_name) thinker_sampling_params = SamplingParams( temperature=0.0, # Deterministic - no randomness top_p=1.0, # Disable nucleus sampling diff --git a/examples/offline_inference/qwen3_omni/README.md b/examples/offline_inference/qwen3_omni/README.md index d69ad6abfc..0710faa133 100644 --- a/examples/offline_inference/qwen3_omni/README.md +++ b/examples/offline_inference/qwen3_omni/README.md @@ -70,8 +70,8 @@ For true stage-level concurrency -- where downstream stages (Talker, Code2Wav) start **before** the upstream stage (Thinker) finishes -- use the async_chunk example. This requires: -1. A stage config YAML with ``async_chunk: true`` (e.g. - ``qwen3_omni_moe_async_chunk.yaml``). +1. A deploy config YAML with ``async_chunk: true`` (e.g. + ``qwen3_omni_moe.yaml``). 2. Hardware that matches the config (e.g. 2x H100 for the default 3-stage config). 
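+
+To confirm requirement 1 against the bundled config (a quick sketch; the path assumes a source checkout):
+
+```bash
+grep async_chunk vllm_omni/deploy/qwen3_omni_moe.yaml
+# expected output: async_chunk: true
+```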
@@ -101,7 +101,7 @@ python end2end_async_chunk.py --query-type text --modalities text ```bash python end2end_async_chunk.py \ --query-type use_audio \ - --stage-configs-path /path/to/your_async_chunk.yaml + --deploy-config /path/to/your_deploy_config.yaml ``` > **Note**: The synchronous ``end2end.py`` (using ``Omni``) is still the diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 056f820ff0..f028c32aa1 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -294,14 +294,7 @@ def main(args): else: query_result = query_func() - omni = Omni( - model=model_name, - dtype=args.dtype, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - init_timeout=args.init_timeout, - ) + omni = Omni.from_cli_args(args, model=model_name) thinker_sampling_params = SamplingParams( temperature=0.9, diff --git a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py index 0744263130..f38922e943 100644 --- a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py +++ b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py @@ -14,7 +14,7 @@ Usage ----- python end2end_async_chunk.py --query-type use_audio \ - --stage-configs-path + --deploy-config See ``--help`` for all options. """ @@ -179,20 +179,26 @@ def clone_prompt_for_request(template: dict) -> dict: return cloned -def _default_async_chunk_stage_configs_path() -> str | None: - """Best-effort default stage config for running Qwen3-Omni with async_chunk. +def _default_deploy_config_path() -> str | None: + """Best-effort default deploy config for running Qwen3-Omni with async_chunk. - When this example is executed from within the repository, we resolve the - default YAML path relative to this file. When installed elsewhere, the - file may not exist and callers should pass --stage-configs-path explicitly. + The default ``vllm_omni/deploy/qwen3_omni_moe.yaml`` ships with + ``async_chunk: true`` at the top level, so loading it is enough to + enable async-chunk semantics. To disable it, copy the YAML and set + ``async_chunk: false`` (or pass ``--deploy-config`` to a YAML that + overrides the flag). + + When this example is executed from within the repository, we resolve + the default YAML path relative to this file. When installed elsewhere, + the file may not exist and callers should pass ``--deploy-config`` + explicitly. """ repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) candidate = os.path.join( repo_root, "vllm_omni", - "model_executor", - "stage_configs", - "qwen3_omni_moe_async_chunk.yaml", + "deploy", + "qwen3_omni_moe.yaml", ) return candidate if os.path.exists(candidate) else None @@ -374,15 +380,16 @@ async def run_all(args): prompt["modalities"] = output_modalities # Create AsyncOmni - print(f"[Info] Creating AsyncOmni with stage_configs_path={args.stage_configs_path}") + print(f"[Info] Creating AsyncOmni with deploy_config={args.deploy_config}") async_omni = None try: - async_omni = AsyncOmni( - model=args.model, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - ) + # ``from_cli_args`` expands vars(args) into kwargs and auto-captures + # ``_cli_explicit_keys`` from ``sys.argv[1:]`` so argparse defaults + # do not silently override deploy YAML values. 
Mirrors the + # ``EngineArgs.from_cli_args`` pattern used throughout vllm / + # vllm-omni. ``deploy_config=None`` (the default) falls through to + # the bundled ``vllm_omni/deploy/qwen3_omni_moe.yaml``. + async_omni = AsyncOmni.from_cli_args(args) # Use default sampling params from stage config (they are pre-configured # in the YAML for each stage). @@ -470,11 +477,11 @@ def parse_args(): help="Query type.", ) parser.add_argument( - "--stage-configs-path", + "--deploy-config", type=str, - default=_default_async_chunk_stage_configs_path(), + default=_default_deploy_config_path(), help=( - "Path to an async_chunk stage config YAML. " + "Path to a deploy config YAML. " "If not set, uses the model's default config " "(make sure it has async_chunk: true)." ), diff --git a/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh b/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh index 809054867c..2f2be20915 100755 --- a/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh +++ b/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh @@ -17,7 +17,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" python "${SCRIPT_DIR}/end2end_async_chunk.py" \ --query-type text \ --txt-prompts "${SCRIPT_DIR}/text_prompts_10.txt" \ - --stage-configs-path "${REPO_ROOT}/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml" \ + --deploy-config "${REPO_ROOT}/vllm_omni/deploy/qwen3_omni_moe.yaml" \ --output-dir output_audio_async_chunk \ --max-in-flight 2 \ "$@" diff --git a/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh b/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh index 918c7ee4fd..9ef69293cb 100755 --- a/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh +++ b/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh @@ -6,13 +6,13 @@ # achieving true stage-level concurrency via chunk-level streaming. # # Prerequisites: -# - An async_chunk stage config YAML (e.g. qwen3_omni_moe_async_chunk.yaml) +# - A deploy config YAML (e.g. qwen3_omni_moe.yaml) # - Hardware matching the config (e.g. 2x H100 for the default 3-stage config) # # Usage: # bash run_single_prompt_async_chunk.sh # bash run_single_prompt_async_chunk.sh --query-type text --modalities text -# bash run_single_prompt_async_chunk.sh --stage-configs-path /path/to/custom.yaml +# bash run_single_prompt_async_chunk.sh --deploy-config /path/to/custom.yaml set -euo pipefail @@ -21,6 +21,6 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" python "${SCRIPT_DIR}/end2end_async_chunk.py" \ --query-type use_audio \ - --stage-configs-path "${REPO_ROOT}/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml" \ + --deploy-config "${REPO_ROOT}/vllm_omni/deploy/qwen3_omni_moe.yaml" \ --output-dir output_audio_async_chunk \ "$@" diff --git a/examples/offline_inference/qwen3_tts/README.md b/examples/offline_inference/qwen3_tts/README.md index c38a2b462d..2971ad716a 100644 --- a/examples/offline_inference/qwen3_tts/README.md +++ b/examples/offline_inference/qwen3_tts/README.md @@ -104,13 +104,13 @@ completes. This demonstrates that audio data is available progressively rather t ## Batched Decoding -The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, provide a stage config with `max_num_seqs > 1` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. 
+The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, set `max_num_seqs > 1` on both stages via `--stage-overrides` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`.
 
 ```
 python end2end.py --query-type CustomVoice \
     --txt-prompts benchmark_prompts.txt \
     --batch-size 4 \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml
+    --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2},"1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}'
 ```
 
 **Important:** `--batch-size` must match a CUDA graph capture size (1, 2, 4, 8, 16...) because the Talker's code predictor KV cache is sized to `max_num_seqs`, and CUDA graphs pad the batch to the next capture size. Both stages need `max_num_seqs >= batch_size` in the stage config for batching to take effect. If only stage 1 has a higher `max_num_seqs`, it won't help — stage 1 can only batch chunks from requests that are in-flight simultaneously, which requires stage 0 to also process multiple requests concurrently.
diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py
index 901418c39b..77da356b4f 100644
--- a/examples/offline_inference/qwen3_tts/end2end.py
+++ b/examples/offline_inference/qwen3_tts/end2end.py
@@ -366,12 +366,7 @@ def main(args):
     output_dir = args.output_dir
     os.makedirs(output_dir, exist_ok=True)
 
-    omni = Omni(
-        model=model_name,
-        stage_configs_path=args.stage_configs_path,
-        log_stats=args.log_stats,
-        stage_init_timeout=args.stage_init_timeout,
-    )
+    omni = Omni.from_cli_args(args, model=model_name)
 
     batch_size = args.batch_size
     for batch_start in range(0, len(inputs), batch_size):
@@ -387,12 +382,7 @@ async def main_streaming(args):
     output_dir = args.output_dir
     os.makedirs(output_dir, exist_ok=True)
 
-    omni = AsyncOmni(
-        model=model_name,
-        stage_configs_path=args.stage_configs_path,
-        log_stats=args.log_stats,
-        stage_init_timeout=args.stage_init_timeout,
-    )
+    omni = AsyncOmni.from_cli_args(args, model=model_name)
 
     for i, prompt in enumerate(inputs):
         request_id = str(i)
diff --git a/examples/online_serving/qwen3_omni/README.md b/examples/online_serving/qwen3_omni/README.md
index 4c3f9595fa..32722b3db4 100644
--- a/examples/online_serving/qwen3_omni/README.md
+++ b/examples/online_serving/qwen3_omni/README.md
@@ -12,19 +12,159 @@ Please refer to [README.md](../../../README.md)
 vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
 ```
 
-If you want to open async chunking for qwen3-omni, launch the server with command below
+The default deploy config at `vllm_omni/deploy/qwen3_omni_moe.yaml` is loaded
+automatically by the model registry — no `--deploy-config` flag needed for the
+common case. Async-chunk streaming is **enabled by default** in the bundled config.
+NPU / ROCm / XPU per-platform deltas are merged in automatically from the
+`platforms:` section of the same YAML.
+
+**Note:** The OpenAI-style **`/v1/realtime`** WebSocket (streaming PCM audio in, audio + transcription out) is **not supported** when `async_chunk` is enabled. Since the bundled config enables `async_chunk` by default, launch realtime sessions with `--no-async-chunk`, or point `--deploy-config` at a YAML that sets `async_chunk: false`.
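+
+For example, a realtime-friendly launch keeps the bundled deploy config and
+just flips the flag off (the `--no-async-chunk` toggle is covered in the
+tuning section below):
+
+```bash
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \
+    --no-async-chunk
+```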
+ +If you have a custom deploy YAML, point at it explicitly: ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --deploy-config /path/to/your_deploy_config.yaml ``` -**Note:** The OpenAI-style **`/v1/realtime`** WebSocket (streaming PCM audio in, audio + transcription out) is **not supported** when `async_chunk` is enabled. Use the default omni layout or a stage config with `async_chunk: false` for realtime sessions. +### Tuning deployment parameters + +Most engine knobs (`max_num_batched_tokens`, `max_model_len`, `enforce_eager`, +`gpu_memory_utilization`, `tensor_parallel_size`, …) can be tuned without +editing the YAML. There are three layers, in increasing specificity: + +#### 1. Global CLI flags (apply to every stage) + +```bash +# Tighter memory budget on a smaller GPU +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --gpu-memory-utilization 0.85 + +# Disable cudagraphs (e.g. for debugging) +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --enforce-eager + +# Reduce context length +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --max-model-len 32768 + +# Toggle prefix caching on every stage (yaml default: off) +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --enable-prefix-caching +# ...or force it off if the yaml turned it on +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --no-enable-prefix-caching + +# Toggle pipeline-wide async chunked streaming between stages +# (yaml default for qwen3_omni_moe: on) +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --no-async-chunk +``` + +For the TTS counterpart (synchronous codec variant), see +[qwen3_tts README](../qwen3_tts/README.md#sync-vs-async-chunk-mode). + +Explicit CLI flags **override** the deploy YAML (which itself overrides the +parser defaults). If you don't pass a flag, the YAML value wins. + +> **Note on `--no-async-chunk`**: Flips the deploy yaml's `async_chunk:` +> bool. Pipelines that implement alternate processor functions for +> chunked vs end-to-end modes (e.g. qwen3_tts code2wav) dispatch +> automatically based on that bool — no extra flag or variant yaml is +> needed. + +> ⚠️ **For multi-stage models that share GPUs (qwen3_omni_moe by default +> shares cuda:1 between stages 1 and 2), avoid using global memory flags.** +> A global `--gpu-memory-utilization 0.85` would apply to every stage and +> oversubscribe the shared device. Use per-stage overrides instead — see +> below. + +#### 2. Per-stage overrides via `--stage-overrides` (recommended for memory) -If you have custom stage configs file, launch the server with command below ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +# Lower stage 1's memory budget; leave others at the YAML default +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --stage-overrides '{ + "1": {"gpu_memory_utilization": 0.5}, + "2": {"max_num_batched_tokens": 65536} + }' +``` + +Per-stage values are always treated as explicit and beat YAML defaults for +the named stage. Other stages keep their YAML values. + +#### 3. 
Custom deploy YAML + +When per-stage overrides get long, write a small overlay YAML that inherits +from the bundled default: + +```yaml +# my_qwen3_omni_overrides.yaml +base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml + +stages: + - stage_id: 0 + max_num_batched_tokens: 65536 + enforce_eager: true + - stage_id: 1 + gpu_memory_utilization: 0.5 + - stage_id: 2 + max_model_len: 8192 +``` + +Then start the server with `--deploy-config my_qwen3_omni_overrides.yaml`. +The `base_config:` line tells the loader to inherit everything else (stages, +connectors, edges, platforms section) from the bundled production YAML, so +you only need to spell out the deltas. + +#### 4. Multi-node deployment (cross-host transfer connector) + +The bundled `qwen3_omni_moe.yaml` uses `SharedMemoryConnector` between stages, +which only works when all stages run on the same physical host. For +**cross-node** deployments, write a small overlay YAML that swaps in a +network-capable connector (e.g. `MooncakeStoreConnector`) and re-points each +stage's connector wiring at it. The connector spec carries your own server +addresses — there is no checked-in default because every cluster is +different. + +```yaml +# my_qwen3_omni_multinode.yaml +base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml + +connectors: + mooncake_connector: + name: MooncakeStoreConnector + extra: + host: "127.0.0.1" + metadata_server: "http://YOUR_METADATA_HOST:8080/metadata" + master: "YOUR_MASTER_HOST:50051" + segment: 512000000 # 512 MB transfer segment + localbuf: 64000000 # 64 MB local buffer + proto: "tcp" + +stages: + - stage_id: 0 + output_connectors: + to_stage_1: mooncake_connector + - stage_id: 1 + input_connectors: + from_stage_0: mooncake_connector + output_connectors: + to_stage_2: mooncake_connector + - stage_id: 2 + input_connectors: + from_stage_1: mooncake_connector ``` +Then launch with `--deploy-config my_qwen3_omni_multinode.yaml`. Same +pattern works for Qwen2.5-Omni — replace `base_config:` with the path to +`vllm_omni/deploy/qwen2_5_omni.yaml`. + +> ⚠️ Replace `YOUR_METADATA_HOST` / `YOUR_MASTER_HOST` with the actual +> mooncake server addresses for your cluster. The `base_config:` overlay +> inherits all stage budgets, devices, and edges from the bundled prod +> YAML — you only need to spell out the connector swap. 
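+
+For intuition, overlay resolution amounts to "load the base, then merge the
+overlay on top, matching stage entries by `stage_id`". The sketch below is a
+simplified illustration only — the real resolver is `resolve_deploy_yaml` in
+`vllm_omni.config.stage_config`, and the exact merge rules here (shallow dict
+merge, recursive `base_config:` chains) are assumptions for exposition:
+
+```python
+# Illustrative-only sketch of base_config overlay resolution; not the
+# shipped loader (see vllm_omni.config.stage_config.resolve_deploy_yaml).
+import yaml
+
+
+def resolve_overlay(path: str) -> dict:
+    with open(path) as f:
+        overlay = yaml.safe_load(f) or {}
+    base_path = overlay.pop("base_config", None)
+    if base_path is None:
+        return overlay  # already a full deploy config, nothing to inherit
+    base = resolve_overlay(base_path)  # a base may itself be an overlay
+    # Top-level keys: dicts (e.g. connectors:) merge shallowly, scalars replace.
+    for key, value in overlay.items():
+        if key == "stages":
+            continue
+        if isinstance(value, dict) and isinstance(base.get(key), dict):
+            base[key] = {**base[key], **value}
+        else:
+            base[key] = value
+    # Stage entries merge field-by-field, matched on stage_id; stages the
+    # overlay doesn't mention keep their base values untouched.
+    merged = {s["stage_id"]: dict(s) for s in base.get("stages", [])}
+    for stage in overlay.get("stages", []):
+        merged.setdefault(stage["stage_id"], {}).update(stage)
+    base["stages"] = [merged[sid] for sid in sorted(merged)]
+    return base
+```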
+
 ### Send Multi-modal Request
 
 Get into the example folder
@@ -285,7 +425,7 @@ The script supports the following arguments:
 - `--model`: Model name/path (default: Qwen/Qwen3-Omni-30B-A3B-Instruct)
 - `--server-port`: Port for vLLM server (default: 8091)
 - `--gradio-port`: Port for Gradio demo (default: 7861)
-- `--stage-configs-path`: Path to custom stage configs YAML file (optional)
+- `--deploy-config`: Path to custom deploy config YAML file (optional)
 - `--server-host`: Host for vLLM server (default: 0.0.0.0)
 - `--gradio-ip`: IP for Gradio demo (default: 127.0.0.1)
 - `--share`: Share Gradio demo publicly (creates a public link)
@@ -300,7 +440,7 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
 
-If you have custom stage configs file:
+If you have a custom deploy config YAML:
 ```bash
-vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /path/to/deploy_config_file
 ```
 
 **Step 2: Run the Gradio demo**
diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md
index b48db9cf45..350fcb71ca 100644
--- a/examples/online_serving/qwen3_tts/README.md
+++ b/examples/online_serving/qwen3_tts/README.md
@@ -43,7 +43,7 @@ Then open http://localhost:7860 in your browser.
 
 ### Launch the Server
 
-The default stage config is located at `vllm_omni/model_executor/stage_configs/qwen3_tts.yaml`. For other platforms (e.g., NPU), refer to `vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml`.
+The default deploy config is located at `vllm_omni/deploy/qwen3_tts.yaml` and is loaded automatically by the model registry — no `--deploy-config` flag needed for default use. Platform-specific deltas (NPU, ROCm, XPU) are merged in automatically from the `platforms:` block of the same YAML based on the detected runtime.
 
 ```bash
 # CustomVoice model (predefined speakers)
@@ -70,6 +70,22 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \
     --port 8091
 ```
 
+#### Sync vs async-chunk mode
+
+Qwen3-TTS supports both **chunked streaming** (default, lower latency) and
+**synchronous end-to-end** modes from the same deploy YAML. The bundled
+`qwen3_tts.yaml` ships with `async_chunk: true`; flip it with `--no-async-chunk`
+and the pipeline automatically dispatches to the end-to-end codec processor:
+
+```bash
+vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice --omni --port 8091 \
+    --no-async-chunk
+```
+
+Beyond that toggle, no variant YAML or extra flag is needed — the
+`StagePipelineConfig` on each stage declares both processor functions and the
+runtime picks based on the `async_chunk:` bool.
+
 Alternatively, use the convenience script:
 ```bash
 ./run_server.sh          # Default: CustomVoice model
diff --git a/examples/online_serving/qwen3_tts/batch_speech_client.py b/examples/online_serving/qwen3_tts/batch_speech_client.py
index 7d48e650f8..47fdc3691c 100644
--- a/examples/online_serving/qwen3_tts/batch_speech_client.py
+++ b/examples/online_serving/qwen3_tts/batch_speech_client.py
@@ -5,11 +5,13 @@
 batch level and generate many utterances in the cloned voice without
 repeating the reference for each item.
-Start the server (with batch-optimized config for best throughput): +Start the server (with batch-optimized stage settings for best throughput): vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml \ - --trust-remote-code + --omni \ + --trust-remote-code \ + --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2}, + "1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' Examples: # Batch with a predefined voice diff --git a/examples/online_serving/qwen3_tts/run_gradio_demo.sh b/examples/online_serving/qwen3_tts/run_gradio_demo.sh index bcc0ddb7cf..d79be3c2ab 100644 --- a/examples/online_serving/qwen3_tts/run_gradio_demo.sh +++ b/examples/online_serving/qwen3_tts/run_gradio_demo.sh @@ -127,7 +127,7 @@ echo "Starting vLLM server..." LOG_FILE="/tmp/vllm_tts_server_${SERVER_PORT}.log" vllm-omni serve "$MODEL" \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --host "$SERVER_HOST" \ --port "$SERVER_PORT" \ --gpu-memory-utilization 0.9 \ diff --git a/examples/online_serving/qwen3_tts/run_server.sh b/examples/online_serving/qwen3_tts/run_server.sh index 6f4aa83a0b..78dd2c305d 100755 --- a/examples/online_serving/qwen3_tts/run_server.sh +++ b/examples/online_serving/qwen3_tts/run_server.sh @@ -31,7 +31,7 @@ esac echo "Starting Qwen3-TTS server with model: $MODEL" vllm-omni serve "$MODEL" \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --host 0.0.0.0 \ --port 8091 \ --gpu-memory-utilization 0.9 \ diff --git a/tests/conftest.py b/tests/conftest.py index 3434eb0aed..83752521f2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,6 +47,7 @@ from vllm.logger import init_logger from vllm.utils.network_utils import get_open_port +from vllm_omni.config.stage_config import resolve_deploy_yaml from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniSamplingParams from vllm_omni.outputs import OmniRequestOutput @@ -1339,12 +1340,14 @@ def delete_by_path(config_dict: dict, path: str) -> None: else: print(f"Path {path} does not exist") + _stage_key = "stages" if "stages" in config else "stage_args" + # Apply deletions first if deletes: for key, value in deletes.items(): - if key == "stage_args": + if key in ("stage_args", "stages"): if value and isinstance(value, dict): - stage_args = config.get("stage_args", []) + stage_args = config.get(_stage_key, []) if not stage_args: raise ValueError("stage_args does not exist in config") @@ -1377,9 +1380,9 @@ def delete_by_path(config_dict: dict, path: str) -> None: # Apply updates if updates: for key, value in updates.items(): - if key == "stage_args": + if key in ("stage_args", "stages"): if value and isinstance(value, dict): - stage_args = config.get("stage_args", []) + stage_args = config.get(_stage_key, []) if not stage_args: raise ValueError("stage_args does not exist in config") @@ -1585,32 +1588,46 @@ def __init__( self.stage_config_path = stage_config_path self.master_port = get_open_port() self.visible_device_list = self._load_visible_device_list(env_dict) - self.stage_runtime_devices = self._load_stage_runtime_devices(stage_config_path) - self.stage_ids = stage_ids or self._load_stage_ids(stage_config_path) + resolved_cfg = resolve_deploy_yaml(stage_config_path) + # Dump the resolved deploy config so CI logs show each stage's + # gpu_memory_utilization / 
max_model_len / max_num_seqs after + # base_config inheritance and overlay merge — essential when + # diagnosing OOMs that depend on the merged values. + print( + f"[OmniServerStageCli] Resolved deploy config from {stage_config_path}:\n" + f"{yaml.safe_dump(resolved_cfg, sort_keys=False, default_flow_style=False)}", + flush=True, + ) + self.stage_runtime_devices = self._load_stage_runtime_devices(resolved_cfg) + self.stage_ids = stage_ids or self._load_stage_ids(resolved_cfg) if 0 not in self.stage_ids: raise ValueError(f"Stage CLI test requires stage_id=0 in config: {stage_config_path}") self.stage_procs: dict[int, subprocess.Popen] = {} self.proc = None @staticmethod - def _load_stage_ids(stage_config_path: str) -> list[int]: - with open(stage_config_path, encoding="utf-8") as f: - cfg = yaml.safe_load(f) or {} + def _stage_entries(cfg: dict) -> list[dict]: + """Return the list of stage entries from either legacy (``stage_args``) + or new-schema (``stages``) deploy YAMLs.""" + return cfg.get("stage_args") or cfg.get("stages") or [] - stage_ids = [stage["stage_id"] for stage in cfg.get("stage_args", []) if "stage_id" in stage] + @staticmethod + def _load_stage_ids(resolved_config: dict) -> list[int]: + stage_ids = [ + stage["stage_id"] for stage in OmniServerStageCli._stage_entries(resolved_config) if "stage_id" in stage + ] if not stage_ids: - raise ValueError(f"No stage IDs found in config: {stage_config_path}") + raise ValueError("No stage IDs found in resolved config") return stage_ids @staticmethod - def _load_stage_runtime_devices(stage_config_path: str) -> dict[int, str]: - with open(stage_config_path, encoding="utf-8") as f: - cfg = yaml.safe_load(f) or {} - + def _load_stage_runtime_devices(resolved_config: dict) -> dict[int, str]: runtime_devices: dict[int, str] = {} - for stage in cfg.get("stage_args", []): + for stage in OmniServerStageCli._stage_entries(resolved_config): stage_id = stage.get("stage_id") - devices = stage.get("runtime", {}).get("devices") + # New schema: stage.devices is flat at stage level. + # Legacy schema: stage.runtime.devices is nested. + devices = stage.get("devices") or stage.get("runtime", {}).get("devices") if stage_id is not None and devices: runtime_devices[int(stage_id)] = str(devices) return runtime_devices @@ -1696,10 +1713,21 @@ def _launch_stage(self, stage_id: int, *, headless: bool) -> None: cmd = self._build_stage_cmd(stage_id, headless=headless) print(f"Launching OmniServerStageCli stage {stage_id}: {' '.join(cmd)}") + # Capture each subprocess's stdout+stderr to a per-stage log file so + # debugging "Stage N exited before API server ready" doesn't rely on + # guessing; the file is surfaced in the RuntimeError message. 
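+        # The stage subprocess writes straight to this file descriptor, so
+        # the parent can tail the file while the stage is still starting up
+        # (see _ensure_stage_processes_alive below).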
+        log_path = Path(tempfile.gettempdir()) / f"omni_stage_{stage_id}_{self.master_port}.log"
+        self._stage_log_paths = getattr(self, "_stage_log_paths", {})
+        self._stage_log_paths[stage_id] = log_path
+        log_fh = open(log_path, "w", buffering=1)  # noqa: SIM115 - closed in __exit__
+        self._stage_log_files = getattr(self, "_stage_log_files", {})
+        self._stage_log_files[stage_id] = log_fh
         proc = subprocess.Popen(
             cmd,
             env=env,
             cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+            stdout=log_fh,
+            stderr=subprocess.STDOUT,
         )
         self.stage_procs[stage_id] = proc
         if stage_id == 0:
@@ -1709,7 +1737,18 @@ def _ensure_stage_processes_alive(self) -> None:
         for stage_id, proc in self.stage_procs.items():
             ret = proc.poll()
             if ret is not None:
-                raise RuntimeError(f"Stage {stage_id} exited with code {ret} before API server became ready.")
+                log_path = getattr(self, "_stage_log_paths", {}).get(stage_id)
+                tail = ""
+                if log_path and log_path.exists():
+                    try:
+                        with open(log_path, encoding="utf-8", errors="replace") as f:
+                            lines = f.readlines()
+                        tail = "\n=== Last 60 lines of stage {} log ({}) ===\n{}".format(
+                            stage_id, log_path, "".join(lines[-60:]) or ""
+                        )
+                    except Exception as exc:  # pragma: no cover - diagnostic only
+                        tail = f"\n[failed to read stage {stage_id} log {log_path}: {exc}]"
+                raise RuntimeError(f"Stage {stage_id} exited with code {ret} before API server became ready.{tail}")
 
     def _start_server(self) -> None:
         ordered_stage_ids = [0, *[stage_id for stage_id in self.stage_ids if stage_id != 0]]
@@ -1735,7 +1774,46 @@
         raise RuntimeError(f"OmniServerStageCli failed to start within {max_wait} seconds")
 
+    def _dump_stage_logs_for_debug(self, head_lines: int = 300, tail_lines: int = 500) -> None:
+        """Tail each stage's subprocess log back to stdout on teardown.
+
+        Stage subprocesses redirect stdout/stderr to ``/tmp/omni_stage_*.log``
+        so we don't spam the main CI stream while tests run; but that also
+        hides engine init (KV cache size, Available KV cache memory, vLLM
+        engine config) when things go wrong. Dump them here so buildkite
+        captures them post-run. Head covers engine init; tail covers
+        whatever state the stage was in when it was torn down.
+        """
+        log_paths = getattr(self, "_stage_log_paths", {}) or {}
+        for stage_id in sorted(log_paths):
+            log_path = log_paths[stage_id]
+            if not log_path or not log_path.exists():
+                continue
+            try:
+                with open(log_path, encoding="utf-8", errors="replace") as f:
+                    lines = f.readlines()
+            except Exception as exc:  # pragma: no cover - diagnostic only
+                print(f"[OmniServerStageCli] stage {stage_id} log read failed: {exc}", flush=True)
+                continue
+            total = len(lines)
+            if total <= head_lines + tail_lines:
+                head_chunk = lines
+                tail_chunk = []
+                elided = 0
+            else:
+                head_chunk = lines[:head_lines]
+                tail_chunk = lines[-tail_lines:]
+                elided = total - head_lines - tail_lines
+            print(f"\n=== stage {stage_id} log HEAD ({log_path}) ===", flush=True)
+            print("".join(head_chunk).rstrip("\n"), flush=True)
+            if tail_chunk:
+                print(f"\n... 
[{elided} lines elided] ...", flush=True) + print(f"\n=== stage {stage_id} log TAIL ({log_path}) ===", flush=True) + print("".join(tail_chunk).rstrip("\n"), flush=True) + print(f"=== end stage {stage_id} log ===\n", flush=True) + def __exit__(self, exc_type, exc_val, exc_tb): + self._dump_stage_logs_for_debug() for stage_id in sorted(self.stage_procs, reverse=True): proc = self.stage_procs[stage_id] if proc.poll() is None: @@ -1781,10 +1859,18 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st if run_level == "advanced_model" and stage_config_path is not None: with open(stage_config_path, encoding="utf-8") as f: cfg = yaml.safe_load(f) or {} - stage_ids = [stage["stage_id"] for stage in cfg.get("stage_args", []) if "stage_id" in stage] + # Strip ``load_format: dummy`` (CI overlay default) so advanced_model + # tests use real weights. New schema (``stages:``) writes the field + # flat at stage level; legacy schema (``stage_args:``) nests it as + # ``engine_args.load_format``. Handle both. + new_schema_stages = cfg.get("stages") + stage_key = "stages" if new_schema_stages is not None else "stage_args" + delete_path = "load_format" if new_schema_stages is not None else "engine_args.load_format" + stage_entries = cfg.get(stage_key, []) + stage_ids = [stage["stage_id"] for stage in stage_entries if "stage_id" in stage] stage_config_path = modify_stage_config( stage_config_path, - deletes={"stage_args": {stage_id: ["engine_args.load_format"] for stage_id in stage_ids}}, + deletes={stage_key: {stage_id: [delete_path] for stage_id in stage_ids}}, ) server_args = params.server_args or [] @@ -1801,6 +1887,7 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st raise ValueError("omni_server with use_stage_cli=True requires use_omni=True") if stage_config_path is None: raise ValueError("omni_server with use_stage_cli=True requires a stage_config_path") + server_args += ["--stage-configs-path", stage_config_path] with OmniServerStageCli( model, @@ -3291,7 +3378,7 @@ def omni_runner(request, model_prefix): with _omni_server_lock: model, stage_config_path = request.param model = model_prefix + model - with OmniRunner(model, seed=42, stage_configs_path=stage_config_path) as runner: + with OmniRunner(model, seed=42, stage_configs_path=stage_config_path, stage_init_timeout=300) as runner: print("OmniRunner started successfully") yield runner print("OmniRunner stopping...") diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py index 997f25e6e5..b8edeba9d5 100644 --- a/tests/dfx/conftest.py +++ b/tests/dfx/conftest.py @@ -40,22 +40,32 @@ def modify_stage(default_path, updates, deletes): def create_unique_server_params( configs: list[dict[str, Any]], stage_configs_dir: Path, -) -> list[tuple[str, str, str]]: +) -> list[tuple[str, str, str | None, str | None, tuple[str, ...]]]: unique_params = [] seen = set() for config in configs: test_name = config["test_name"] - model = config["server_params"]["model"] - stage_config_name = config["server_params"].get("stage_config_name") + server_params = config["server_params"] + model = server_params["model"] + stage_config_name = server_params.get("stage_config_name") if stage_config_name: stage_config_path = str(stage_configs_dir / stage_config_name) - delete = config["server_params"].get("delete", None) - update = config["server_params"].get("update", None) + delete = server_params.get("delete", None) + update = server_params.get("update", None) stage_config_path = modify_stage(stage_config_path, 
update, delete) else: stage_config_path = None - server_param = (test_name, model, stage_config_path) + stage_overrides = server_params.get("stage_overrides") + stage_overrides_json = json.dumps(stage_overrides) if stage_overrides else None + + # ``extra_cli_args`` passes raw CLI flags straight through to + # ``vllm_omni.entrypoints.cli.main serve`` — used for flags that + # don't map to stage-level overrides, e.g. ``--async-chunk`` / + # ``--no-async-chunk`` toggling the deploy-level async_chunk bool. + extra_cli_args = tuple(server_params.get("extra_cli_args") or ()) + + server_param = (test_name, model, stage_config_path, stage_overrides_json, extra_cli_args) if server_param not in seen: seen.add(server_param) unique_params.append(server_param) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index bea46f684b..0de60c6a54 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -48,8 +48,8 @@ def _get_config_file_from_argv() -> str | None: OMNI_RESULT_TEMPLATE_PATH = Path(__file__).parent / "result_omni_template.json" -STAGE_CONFIGS_DIR = Path(__file__).parent.parent / "stage_configs" -test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR) +DEPLOY_CONFIGS_DIR = Path(__file__).parent.parent / "deploy" +test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) _omni_server_lock = threading.Lock() @@ -62,13 +62,19 @@ def omni_server(request): Multi-stage initialization can take 10-20+ minutes. """ with _omni_server_lock: - test_name, model, stage_config_path = request.param + test_name, model, stage_config_path, stage_overrides, extra_cli_args = request.param print(f"Starting OmniServer with test: {test_name}, model: {model}") server_args = ["--stage-init-timeout", "600", "--init-timeout", "900"] + # --deploy-config and --stage-overrides compose at the CLI (see vllm_omni/entrypoints/utils.py): + # deploy-config sets the base; stage-overrides are applied on top. Both can be set. if stage_config_path: - server_args = ["--stage-configs-path", stage_config_path] + server_args + server_args = ["--deploy-config", stage_config_path] + server_args + if stage_overrides: + server_args = ["--stage-overrides", stage_overrides] + server_args + if extra_cli_args: + server_args = list(extra_cli_args) + server_args with OmniServer(model, server_args) as server: server.test_name = test_name print("OmniServer started successfully") diff --git a/tests/dfx/perf/stage_configs/qwen3_omni.yaml b/tests/dfx/perf/stage_configs/qwen3_omni.yaml deleted file mode 100644 index 2add22b873..0000000000 --- a/tests/dfx/perf/stage_configs/qwen3_omni.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. 
-async_chunk: false -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 1 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 100000 - hf_config_name: thinker_config - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/tests/dfx/perf/stage_configs/qwen3_tts.yaml b/tests/dfx/perf/stage_configs/qwen3_tts.yaml deleted file mode 100644 index 7bf04deec5..0000000000 --- a/tests/dfx/perf/stage_configs/qwen3_tts.yaml +++ /dev/null @@ -1,91 +0,0 @@ -# Stage config for running Qwen3-TTS with 2-stage architecture -# Stage 0: Talker (text -> 8-layer RVQ codec codes) -# Stage 1: Code2Wav (codec codes -> audio waveform) -# -# The following config has been verified on 1x H100-80G GPU. 
-async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - max_num_seqs: 4 - model_stage: qwen3_tts - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - max_num_seqs: 4 - model_stage: code2wav - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - max_num_batched_tokens: 8192 - max_model_len: 32768 - engine_input_source: [0] - final_output: true - final_output_type: audio - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - codec_streaming: true - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - codec_left_context_frames: 72 - - edges: - - from: 0 - to: 1 diff --git a/tests/dfx/perf/tests/test_qwen_omni.json b/tests/dfx/perf/tests/test_qwen_omni.json index 4662f8c0c7..39fd266544 100644 --- a/tests/dfx/perf/tests/test_qwen_omni.json +++ b/tests/dfx/perf/tests/test_qwen_omni.json @@ -2,8 +2,7 @@ { "test_name": "test_qwen3_omni", "server_params": { - "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml" + "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct" }, "benchmark_params": [ { @@ -109,25 +108,7 @@ "test_name": "test_qwen3_omni_chunk", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml", - "update": { - "async_chunk": true, - "stage_args": { - "0": { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" - }, - "1": { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" - } - } - }, - "delete": { - "stage_args": { - "2": [ - "custom_process_input_func" - ] - } - } + "extra_cli_args": ["--async-chunk"] }, "benchmark_params": [ { diff --git a/tests/dfx/stability/scripts/test_benchmark_stability.py b/tests/dfx/stability/scripts/test_benchmark_stability.py index a9faae8ab8..3d6b41e762 100644 --- a/tests/dfx/stability/scripts/test_benchmark_stability.py +++ b/tests/dfx/stability/scripts/test_benchmark_stability.py 
@@ -35,7 +35,7 @@ from tests.dfx.perf.scripts.run_benchmark import run_benchmark STABILITY_DIR = Path(__file__).resolve().parent.parent -STAGE_CONFIGS_DIR = STABILITY_DIR / "stage_configs" +DEPLOY_CONFIGS_DIR = STABILITY_DIR / "deploy" CONFIG_FILE_PATH = str(STABILITY_DIR / "tests" / "test.json") DEFAULT_NUM_PROMPTS_PER_BATCH = 20 @@ -45,7 +45,7 @@ except FileNotFoundError: BENCHMARK_CONFIGS = [] -test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] +test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) if BENCHMARK_CONFIGS else {} _omni_server_lock = threading.Lock() @@ -219,11 +219,20 @@ def omni_server(request): Multi-stage initialization can take 10-20+ minutes. """ with _omni_server_lock: - test_name, model, stage_config_path = request.param + test_name, model, stage_config_path, stage_overrides, extra_cli_args = request.param print(f"Starting OmniServer with test: {test_name}, model: {model}") - with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server: + server_args = ["--stage-init-timeout", "120"] + # --deploy-config and --stage-overrides compose at the CLI (see vllm_omni/entrypoints/utils.py): + # deploy-config sets the base; stage-overrides are applied on top. Both can be set. + if stage_config_path: + server_args = ["--deploy-config", stage_config_path] + server_args + if stage_overrides: + server_args = ["--stage-overrides", stage_overrides] + server_args + if extra_cli_args: + server_args = list(extra_cli_args) + server_args + with OmniServer(model, server_args) as server: server.test_name = test_name print("OmniServer started successfully") yield server diff --git a/tests/dfx/stability/stage_configs/qwen3_omni.yaml b/tests/dfx/stability/stage_configs/qwen3_omni.yaml deleted file mode 100644 index 802f8dd249..0000000000 --- a/tests/dfx/stability/stage_configs/qwen3_omni.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. 
-async_chunk: false -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type to launch OmniLLM - runtime: - devices: "0" - max_batch_size: 64 - engine_args: - model_stage: thinker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 1 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: llm # Use llm stage type to launch OmniLLM - runtime: - devices: "1" - max_batch_size: 64 - engine_args: - model_stage: talker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - stage_type: llm # Use llm stage type to launch OmniLLM - runtime: - devices: "1" - max_batch_size: 64 - engine_args: - model_stage: code2wav - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 1000000 - hf_config_name: thinker_config - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/tests/dfx/stability/tests/test.json b/tests/dfx/stability/tests/test.json index 95993c9c55..255cd5b109 100644 --- a/tests/dfx/stability/tests/test.json +++ b/tests/dfx/stability/tests/test.json @@ -3,7 +3,11 @@ "test_name": "test_qwen3_omni_stability", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml" + "stage_overrides": { + "2": { + "max_num_batched_tokens": 1000000 + } + } }, "benchmark_params": [ { @@ -36,25 +40,12 @@ "test_name": "test_qwen3_omni_stability_async_chunk", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml", - "update": { - "async_chunk": true, - "stage_args": { - "0": { - "engine_args.custom_process_next_stage_input_func": 
"vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" - }, - "1": { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" - } + "stage_overrides": { + "2": { + "max_num_batched_tokens": 1000000 } }, - "delete": { - "stage_args": { - "2": [ - "custom_process_input_func" - ] - } - } + "extra_cli_args": ["--async-chunk"] }, "benchmark_params": [ { diff --git a/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml deleted file mode 100644 index 0a5513328c..0000000000 --- a/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml +++ /dev/null @@ -1,98 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# This config is optimized for CI e2e tests. -stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" - engine_args: - model_stage: thinker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 896 - max_num_batched_tokens: 896 - max_num_seqs: 1 - gpu_memory_utilization: 0.8 - skip_mm_profiling: true - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - mm_processor_cache_gb: 0 - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 896 - max_num_batched_tokens: 896 - max_num_seqs: 1 - gpu_memory_utilization: 0.8 - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - stage_id: 2 - runtime: - process: true - devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py 
b/tests/e2e/offline_inference/test_qwen2_5_omni.py
index 4c4315aab9..4500ebfbe2 100644
--- a/tests/e2e/offline_inference/test_qwen2_5_omni.py
+++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -2,8 +2,6 @@
 E2E tests for Qwen2.5-Omni model with mixed modality inputs, audio and text output.
 """
 
-from pathlib import Path
-
 import pytest
 
 from tests.conftest import (
@@ -12,36 +10,31 @@
     generate_synthetic_video,
     modify_stage_config,
 )
-from tests.utils import hardware_test
+from tests.utils import get_deploy_config_path, hardware_test
 from vllm_omni.platforms import current_omni_platform
 
 models = ["Qwen/Qwen2.5-Omni-7B"]
 
+# Single CI deploy YAML; rocm/xpu/npu deltas are all picked automatically via
+# the platforms: section, so the legacy per-platform CI yamls (including the
+# NPU one deleted above) are no longer needed.
+_CI_DEPLOY = get_deploy_config_path("ci/qwen2_5_omni.yaml")
+
 
 def get_cuda_graph_config():
-    path = modify_stage_config(
-        str(Path(__file__).parent.parent / "stage_configs" / "qwen2_5_omni_ci.yaml"),
+    return modify_stage_config(
+        _CI_DEPLOY,
        updates={
-            "stage_args": {
-                0: {
-                    "engine_args.enforce_eager": "true",
-                },
-                1: {"engine_args.enforce_eager": "true"},
+            "stages": {
+                0: {"enforce_eager": True},
+                1: {"enforce_eager": True},
             },
         },
     )
-    return path
-
-
-# CI stage config optimized for 24GB GPU (L4/RTX3090) or NPU
-if current_omni_platform.is_npu():
-    stage_config = str(Path(__file__).parent / "stage_configs" / "npu" / "qwen2_5_omni_ci.yaml")
-elif current_omni_platform.is_rocm():
-    # ROCm stage config optimized for MI325 GPU
-    stage_config = str(Path(__file__).parent.parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")
-elif current_omni_platform.is_xpu():
-    # Intel XPU stage config optimized for B60 GPU
-    stage_config = str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen2_5_omni_ci.yaml")
+
+
+if current_omni_platform.is_rocm() or current_omni_platform.is_xpu() or current_omni_platform.is_npu():
+    stage_config = _CI_DEPLOY
 else:
     stage_config = get_cuda_graph_config()
diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py
index cc0af437ec..0df89c3e88 100644
--- a/tests/e2e/offline_inference/test_qwen3_omni.py
+++ b/tests/e2e/offline_inference/test_qwen3_omni.py
@@ -7,41 +7,37 @@
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
 
-from pathlib import Path
-
 import pytest
 
 from tests.conftest import (
     generate_synthetic_video,
     modify_stage_config,
 )
-from tests.utils import hardware_test
+from tests.utils import get_deploy_config_path, hardware_test
 from vllm_omni.platforms import current_omni_platform
 
 models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
 
+# Single CI deploy YAML; rocm/xpu deltas are picked automatically via the
+# platforms: section. Only CUDA needs an extra enforce_eager tweak. 
+_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml") + + def get_cuda_graph_config(): - path = modify_stage_config( - str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml"), + return modify_stage_config( + _CI_DEPLOY, updates={ - "stage_args": { - 0: { - "engine_args.enforce_eager": "true", - }, - 1: {"engine_args.enforce_eager": "true"}, + "stages": { + 0: {"enforce_eager": True}, + 1: {"enforce_eager": True}, }, }, ) - return path -# CI stage config for 2xH100-80G GPUs or AMD GPU MI325 -if current_omni_platform.is_rocm(): - # ROCm stage config optimized for MI325 GPU - stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "rocm" / "qwen3_omni_ci.yaml")] -elif current_omni_platform.is_xpu(): - stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml")] +if current_omni_platform.is_rocm() or current_omni_platform.is_xpu(): + stage_configs = [_CI_DEPLOY] else: stage_configs = [get_cuda_graph_config()] diff --git a/tests/e2e/offline_inference/test_qwen3_tts_base.py b/tests/e2e/offline_inference/test_qwen3_tts_base.py index be7bd50a36..a706798043 100644 --- a/tests/e2e/offline_inference/test_qwen3_tts_base.py +++ b/tests/e2e/offline_inference/test_qwen3_tts_base.py @@ -13,12 +13,10 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -from pathlib import Path - import pytest from tests.conftest import modify_stage_config -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-Base" REF_AUDIO_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav" @@ -26,23 +24,31 @@ def get_cuda_graph_config(): - path = modify_stage_config( - get_stage_config(), + """Build a temp deploy yaml mirroring the deleted qwen3_tts_no_async_chunk.yaml. + + Composes the synchronous (no-async-chunk) variant on top of the bundled + qwen3_tts.yaml prod default, with cudagraphs disabled. Replaces the deleted + standalone variant yaml; same effective config, no checked-in file needed. 
+ """ + return modify_stage_config( + get_deploy_config_path("qwen3_tts.yaml"), updates={ - "stage_args": { + "async_chunk": False, + "stages": { 0: { - "engine_args.enforce_eager": "true", + "max_num_seqs": 1, + "gpu_memory_utilization": 0.2, + "enforce_eager": True, + "async_scheduling": False, + }, + 1: { + "gpu_memory_utilization": 0.2, + "enforce_eager": True, + "async_scheduling": False, }, - 1: {"engine_args.enforce_eager": "true"}, }, }, ) - return path - - -def get_stage_config(name: str = "qwen3_tts_no_async_chunk.yaml"): - """Get the no_async_chunk stage config path (async_chunk disable, cuda_graph disabled).""" - return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) # Same structure as test_qwen3_omni: models, stage_configs, test_params diff --git a/tests/e2e/offline_inference/test_qwen3_tts_customvoice.py b/tests/e2e/offline_inference/test_qwen3_tts_customvoice.py index 67d72df908..cf411349c3 100644 --- a/tests/e2e/offline_inference/test_qwen3_tts_customvoice.py +++ b/tests/e2e/offline_inference/test_qwen3_tts_customvoice.py @@ -13,34 +13,40 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -from pathlib import Path - import pytest from tests.conftest import modify_stage_config -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" def get_cuda_graph_config(): - path = modify_stage_config( - get_stage_config(), + """Build a temp deploy yaml mirroring the deleted qwen3_tts_no_async_chunk.yaml. + + Composes the synchronous (no-async-chunk) variant on top of the bundled + qwen3_tts.yaml prod default, with cudagraphs disabled. Replaces the deleted + standalone variant yaml; same effective config, no checked-in file needed. 
+ """ + return modify_stage_config( + get_deploy_config_path("qwen3_tts.yaml"), updates={ - "stage_args": { + "async_chunk": False, + "stages": { 0: { - "engine_args.enforce_eager": "true", + "max_num_seqs": 1, + "gpu_memory_utilization": 0.2, + "enforce_eager": True, + "async_scheduling": False, + }, + 1: { + "gpu_memory_utilization": 0.2, + "enforce_eager": True, + "async_scheduling": False, }, - 1: {"engine_args.enforce_eager": "true"}, }, }, ) - return path - - -def get_stage_config(name: str = "qwen3_tts_no_async_chunk.yaml"): - """Get the no_async_chunk stage config path (async_chunk disable, cuda_graph disabled).""" - return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) # Same structure as test_qwen3_omni: models, stage_configs, test_params diff --git a/tests/e2e/online_serving/test_qwen2_5_omni.py b/tests/e2e/online_serving/test_qwen2_5_omni.py index e2913ce021..ba333e498c 100644 --- a/tests/e2e/online_serving/test_qwen2_5_omni.py +++ b/tests/e2e/online_serving/test_qwen2_5_omni.py @@ -3,7 +3,6 @@ """ import os -from pathlib import Path import pytest @@ -15,8 +14,7 @@ generate_synthetic_video, modify_stage_config, ) -from tests.utils import hardware_test -from vllm_omni.platforms import current_omni_platform +from tests.utils import get_deploy_config_path, hardware_test os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" @@ -24,20 +22,9 @@ models = ["Qwen/Qwen2.5-Omni-7B"] - -def get_config(): - path = modify_stage_config( - str(Path(__file__).parent.parent / "stage_configs" / "qwen2_5_omni_ci.yaml"), - ) - return path - - -# CI stage config for 2xH100-80G GPUs or AMD GPU MI325 -if current_omni_platform.is_rocm(): - # ROCm stage config optimized for MI325 GPU - stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")] -else: - stage_configs = [get_config()] +# Single CI deploy YAML; rocm/xpu deltas are picked automatically via the +# platforms: section in vllm_omni/deploy/ci/qwen2_5_omni.yaml. +stage_configs = [modify_stage_config(get_deploy_config_path("ci/qwen2_5_omni.yaml"))] # Create parameter combinations for model and stage config test_params = [ diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index 9737fa42bd..62eca6349f 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -3,7 +3,6 @@ """ import os -from pathlib import Path import pytest @@ -15,7 +14,7 @@ generate_synthetic_video, modify_stage_config, ) -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test from vllm_omni.platforms import current_omni_platform os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -23,32 +22,24 @@ models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] -QWEN3_OMNI_CONFIG_PATH = str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml") -QWEN3_OMNI_XPU_CONFIG_PATH = str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml") -_STAGE_CONFIGS_DIR = Path(__file__).parent.parent / "stage_configs" -_PD_SEP_CONFIG = str(_STAGE_CONFIGS_DIR / "qwen3_omni_moe_pd_ci.yaml") +# Set VLLM_TEST_PD_MODE=1 to test PD disaggregation (follow-up — deploy overlay not yet migrated). 
+_USE_PD = os.environ.get("VLLM_TEST_PD_MODE", "0") == "1" + +_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml") def get_chunk_config(config_path: str | None = None): - """Load qwen3_omni_ci.yaml with async_chunk modifications for streaming mode.""" + """Load the qwen3_omni CI deploy yaml with async_chunk modifications for streaming mode.""" if config_path is None: - config_path = str(_STAGE_CONFIGS_DIR / "qwen3_omni_ci.yaml") - return modify_stage_config( - config_path, - updates={ - "async_chunk": True, - "stage_args": { - 0: { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" - }, - 1: { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" - }, - }, - }, - deletes={"stage_args": {2: ["custom_process_input_func"]}}, - ) + config_path = _CI_DEPLOY + # TODO: remove this workaround once legacy `stage_args` path is deleted. + # The pipeline (qwen3_omni/pipeline.py) already wires + # thinker2talker_async_chunk / talker2code2wav_async_chunk on stage 0/1, + # so only async_chunk needs flipping. Writing nested `engine_args:` into + # the new-schema overlay trips _parse_stage_deploy's legacy branch and + # drops flat fields (load_format, max_num_seqs, ...). + return modify_stage_config(config_path, updates={"async_chunk": True}) def get_prefix_caching_config(config_path: str): @@ -64,21 +55,16 @@ def get_prefix_caching_config(config_path: str): return path -# Set VLLM_TEST_PD_MODE=1 to test PD disaggregation, default tests async_chunk mode. -_USE_PD = os.environ.get("VLLM_TEST_PD_MODE", "0") == "1" - -# Stage configs for H100/CUDA, ROCm MI325, and XPU platforms -if current_omni_platform.is_rocm(): - rocm_config = str(_STAGE_CONFIGS_DIR / "rocm" / "qwen3_omni_ci.yaml") - stage_configs = [rocm_config] - prefix_caching_stage_configs = [get_prefix_caching_config(rocm_config)] -elif current_omni_platform.is_xpu(): - xpu_config = str(_STAGE_CONFIGS_DIR / "xpu" / "qwen3_omni_ci.yaml") - stage_configs = [xpu_config] - prefix_caching_stage_configs = [get_prefix_caching_config(xpu_config)] -else: - stage_configs = [_PD_SEP_CONFIG if _USE_PD else get_chunk_config(QWEN3_OMNI_CONFIG_PATH)] - prefix_caching_stage_configs = [get_prefix_caching_config(QWEN3_OMNI_CONFIG_PATH)] +# Platform-specific overrides live inside the new deploy yaml's ``platforms:`` +# section, so a single ``_CI_DEPLOY`` path serves CUDA, ROCm, and XPU. +# TODO: re-add VLLM_TEST_PD_MODE branch once the PD-disaggregation deploy +# overlay has been migrated to the new schema (previously used the deleted +# ``qwen3_omni_moe_pd_ci.yaml`` stage-configs file). 
+if current_omni_platform.is_xpu(): + stage_configs = [_CI_DEPLOY] +else: # CUDA + ROCm MI325 share the same deploy config + stage_configs = [get_chunk_config()] +prefix_caching_stage_configs = [get_prefix_caching_config(_CI_DEPLOY)] # Create parameter combinations for model and stage config test_params = [ diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 06847f3d51..acec0efde2 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -6,10 +6,7 @@ import os -from vllm_omni.platforms import current_omni_platform - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -from pathlib import Path import pytest @@ -21,7 +18,7 @@ generate_synthetic_video, modify_stage_config, ) -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test model = "Qwen/Qwen3-Omni-30B-A3B-Instruct" @@ -40,47 +37,56 @@ LONG_AUDIO_DURATION_SEC = 120 -def get_chunk_config(default_path): - path = modify_stage_config( +def get_batch_token_config(default_path): + """Override stage 1's max_num_batched_tokens to exercise small-batch paths. + + Uses the new flat-stage schema (``stages.<stage_id>.<field>``); the legacy + ``stage_args.<stage_id>.engine_args.<field>`` path no longer applies because + the deploy YAML doesn't nest engine fields under ``engine_args:``. + """ + return modify_stage_config( default_path, updates={ - "async_chunk": True, - "stage_args": { - 0: { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk", - "default_sampling_params.max_tokens": 2048, - }, - 1: { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" - }, - }, + "stages": {1: {"max_num_batched_tokens": 64}}, }, - deletes={"stage_args": {2: ["custom_process_input_func"]}}, ) - return path -def get_batch_token_config(default_path): - path = modify_stage_config( +def get_async_chunk_config(default_path): + """Flip async_chunk on and bump stage 0 thinker output to 2048 tokens. + + Pipeline registry (qwen3_omni/pipeline.py) already wires + thinker2talker_async_chunk / talker2code2wav_async_chunk on stages 0/1, + so no per-stage processor override is needed. Using only flat-schema + writes so _parse_stage_deploy stays in its flat branch (nested + ``engine_args:`` would drop other overlay fields). + """ + return modify_stage_config( default_path, updates={ - "stage_args": {1: {"engine_args.max_num_batched_tokens": 64}}, + "async_chunk": True, + "stages": {0: {"default_sampling_params.max_tokens": 2048}}, }, ) - return path -# CI stage config for 2*H100-80G GPUs -default_path = str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml") +# CI deploy YAML (single file; xpu deltas applied via ``platforms:`` section). +# The overlay explicitly sets ``async_chunk: False``, so ``default`` tests the +# sync path and ``async_chunk`` tests the streaming path with a longer thinker +# output — two distinct scenarios, kept as separate parametrizations.
+default_path = get_deploy_config_path("ci/qwen3_omni_moe.yaml") -if current_omni_platform.is_xpu(): - default_path = str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml") - -# Create parameter combinations for model and stage config test_params = [ - pytest.param(OmniServerParams(model=model, stage_config_path=default_path, use_stage_cli=True), id="default"), pytest.param( - OmniServerParams(model=model, stage_config_path=get_chunk_config(default_path), use_stage_cli=True), + OmniServerParams(model=model, stage_config_path=default_path, use_stage_cli=True), + id="default", + ), + pytest.param( + OmniServerParams( + model=model, + stage_config_path=get_async_chunk_config(default_path), + use_stage_cli=True, + ), id="async_chunk", ), ] diff --git a/tests/e2e/online_serving/test_qwen3_omni_realtime_websocket.py b/tests/e2e/online_serving/test_qwen3_omni_realtime_websocket.py index c0f700fc43..6a7cf1c67e 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_realtime_websocket.py +++ b/tests/e2e/online_serving/test_qwen3_omni_realtime_websocket.py @@ -12,7 +12,6 @@ import json import os import wave -from pathlib import Path import pytest import websockets @@ -24,23 +23,22 @@ generate_synthetic_audio, modify_stage_config, ) -from tests.utils import hardware_test -from vllm_omni.platforms import current_omni_platform +from tests.utils import get_deploy_config_path, hardware_test os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" MODEL = "Qwen/Qwen3-Omni-30B-A3B-Instruct" -default_stage_config = str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml") -if current_omni_platform.is_xpu(): - default_stage_config = str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml") +# The new-schema CI overlay bakes in async_chunk: False and covers CUDA/ROCm/XPU +# via its ``platforms:`` section, so one path serves all three. +default_stage_config = get_deploy_config_path("ci/qwen3_omni_moe.yaml") def _realtime_stage_config_path() -> str: """CI omni layout without async_chunk; stage 0 thinker max_tokens=10.""" return modify_stage_config( default_stage_config, - updates={"stage_args": {0: {"default_sampling_params.max_tokens": 10}}}, + updates={"stages": {0: {"default_sampling_params.max_tokens": 10}}}, ) diff --git a/tests/e2e/online_serving/test_qwen3_tts_base.py b/tests/e2e/online_serving/test_qwen3_tts_base.py index 002f9d9972..c97fdef5bc 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_base.py +++ b/tests/e2e/online_serving/test_qwen3_tts_base.py @@ -12,12 +12,10 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -from pathlib import Path - import pytest from tests.conftest import OmniServerParams -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-Base" @@ -25,11 +23,6 @@ REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." 
-def get_stage_config(name: str = "qwen3_tts.yaml"): - """Get the stage config path from vllm_omni model_executor stage_configs.""" - return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) - - def get_prompt(prompt_type="text"): """Text prompt for text-to-audio tests (same as test_qwen3_omni - beijing test case).""" prompts = { @@ -48,7 +41,7 @@ def get_max_batch_size(size_type="few"): pytest.param( OmniServerParams( model=MODEL, - stage_config_path=get_stage_config("qwen3_tts.yaml"), + stage_config_path=get_deploy_config_path("qwen3_tts.yaml"), server_args=["--trust-remote-code", "--disable-log-stats"], ), id="async_chunk", diff --git a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py index 3c33485e4f..364865d286 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py @@ -12,12 +12,10 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -from pathlib import Path - import pytest from tests.conftest import OmniServerParams -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-Base" @@ -25,11 +23,6 @@ REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." -def get_stage_config(name: str = "qwen3_tts.yaml"): - """Get the stage config path from vllm_omni model_executor stage_configs.""" - return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) - - def get_prompt(prompt_type="text"): """Text prompt for text-to-audio tests (same as test_qwen3_omni - beijing test case).""" prompts = { @@ -48,16 +41,19 @@ def get_max_batch_size(size_type="few"): pytest.param( OmniServerParams( model=MODEL, - stage_config_path=get_stage_config("qwen3_tts.yaml"), + stage_config_path=get_deploy_config_path("qwen3_tts.yaml"), server_args=["--trust-remote-code", "--disable-log-stats"], ), id="async_chunk", ), + # Synchronous (no async-chunk) variant — ``--no-async-chunk`` alone + # flips the deploy yaml's bool and the pipeline dispatches to the + # end-to-end codec processor. No variant yaml / pipeline needed. 
pytest.param( OmniServerParams( model=MODEL, - stage_config_path=get_stage_config("qwen3_tts_no_async_chunk.yaml"), - server_args=["--trust-remote-code", "--disable-log-stats"], + stage_config_path=get_deploy_config_path("qwen3_tts.yaml"), + server_args=["--trust-remote-code", "--disable-log-stats", "--no-async-chunk"], ), id="no_async_chunk", ), diff --git a/tests/e2e/online_serving/test_qwen3_tts_batch.py b/tests/e2e/online_serving/test_qwen3_tts_batch.py index 1a453afb72..bf13884997 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_batch.py +++ b/tests/e2e/online_serving/test_qwen3_tts_batch.py @@ -27,14 +27,15 @@ convert_audio_file_to_text, cosine_similarity_text, ) -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" STAGE_INIT_TIMEOUT_S = 120 -def get_stage_config(name: str = "qwen3_tts.yaml"): - return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) +def get_stage_config(name: str = "qwen3_tts.yaml") -> str: + """Resolve a deploy config path under vllm_omni/deploy/.""" + return get_deploy_config_path(name) @pytest.fixture(scope="module") diff --git a/tests/e2e/online_serving/test_qwen3_tts_customvoice.py b/tests/e2e/online_serving/test_qwen3_tts_customvoice.py index fb60df725b..d19c652689 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_customvoice.py +++ b/tests/e2e/online_serving/test_qwen3_tts_customvoice.py @@ -12,21 +12,14 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -from pathlib import Path - import pytest from tests.conftest import OmniServerParams -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" -def get_stage_config(name: str = "qwen3_tts.yaml"): - """Get the stage config path from vllm_omni model_executor stage_configs.""" - return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) - - def get_prompt(prompt_type="text"): """Text prompt for text-to-audio tests (same as test_qwen3_omni - beijing test case).""" prompts = { @@ -45,7 +38,7 @@ def get_max_batch_size(size_type="few"): pytest.param( OmniServerParams( model=MODEL, - stage_config_path=get_stage_config("qwen3_tts.yaml"), + stage_config_path=get_deploy_config_path("qwen3_tts.yaml"), server_args=["--trust-remote-code", "--disable-log-stats"], ), id="async_chunk", diff --git a/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py index 03a985896e..4087532d63 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py @@ -12,21 +12,14 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -from pathlib import Path - import pytest from tests.conftest import OmniServerParams -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" -def get_stage_config(name: str = "qwen3_tts.yaml"): - """Get the stage config path from vllm_omni model_executor stage_configs.""" - return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) - - def get_prompt(prompt_type="english"): """Text prompt for text-to-audio tests 
(same as test_qwen3_omni - beijing test case).""" prompts = { @@ -46,16 +39,19 @@ def get_max_batch_size(size_type="few"): pytest.param( OmniServerParams( model=MODEL, - stage_config_path=get_stage_config("qwen3_tts.yaml"), + stage_config_path=get_deploy_config_path("qwen3_tts.yaml"), server_args=["--trust-remote-code", "--disable-log-stats"], ), id="async_chunk", ), + # Synchronous (no async-chunk) variant — ``--no-async-chunk`` alone + # flips the deploy yaml's bool and the pipeline dispatches to the + # end-to-end codec processor. No variant yaml / pipeline needed. pytest.param( OmniServerParams( model=MODEL, - stage_config_path=get_stage_config("qwen3_tts_no_async_chunk.yaml"), - server_args=["--trust-remote-code", "--disable-log-stats"], + stage_config_path=get_deploy_config_path("qwen3_tts.yaml"), + server_args=["--trust-remote-code", "--disable-log-stats", "--no-async-chunk"], ), id="no_async_chunk", ), diff --git a/tests/e2e/online_serving/test_qwen3_tts_speaker_embedding.py b/tests/e2e/online_serving/test_qwen3_tts_speaker_embedding.py index 8c1c860819..d4212bb5b1 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_speaker_embedding.py +++ b/tests/e2e/online_serving/test_qwen3_tts_speaker_embedding.py @@ -13,13 +13,12 @@ os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" import struct -from pathlib import Path import httpx import pytest from tests.conftest import OmniServer -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test MODEL_BASE = "Qwen/Qwen3-TTS-12Hz-0.6B-Base" MODEL_BASE_1_7B = "Qwen/Qwen3-TTS-12Hz-1.7B-Base" @@ -37,10 +36,8 @@ MAX_NEW_TOKENS = 256 -def get_stage_config(): - return str( - Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "qwen3_tts.yaml" - ) +def get_stage_config() -> str: + return get_deploy_config_path("qwen3_tts.yaml") def _server_args(): diff --git a/tests/e2e/online_serving/test_qwen3_tts_websocket.py b/tests/e2e/online_serving/test_qwen3_tts_websocket.py index 849d1c1158..dddba6e58a 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_websocket.py +++ b/tests/e2e/online_serving/test_qwen3_tts_websocket.py @@ -7,13 +7,12 @@ import asyncio import json import os -from pathlib import Path import pytest import websockets from tests.conftest import OmniServer -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" @@ -23,9 +22,7 @@ def get_stage_config() -> str: - return str( - Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "qwen3_tts.yaml" - ) + return get_deploy_config_path("qwen3_tts.yaml") @pytest.fixture(scope="module") diff --git a/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml b/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml deleted file mode 100644 index f6b0d3927c..0000000000 --- a/tests/e2e/stage_configs/qwen2_5_omni_ci.yaml +++ /dev/null @@ -1,104 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090). -# This config is optimized for CI e2e tests. 
-stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: thinker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 16384 - max_num_batched_tokens: 16384 - max_num_seqs: 1 - gpu_memory_utilization: 0.9 - skip_mm_profiling: true - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - mm_processor_cache_gb: 0 - load_format: dummy - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 16384 - max_num_batched_tokens: 16384 - max_num_seqs: 1 - gpu_memory_utilization: 0.4 - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - load_format: dummy - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 4096 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - stage_id: 2 - runtime: - process: true - devices: "2" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.5 #increase the gpu memory utilization to enable the test on H800 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - max_num_batched_tokens: 8192 - max_model_len: 8192 - load_format: dummy - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 8192 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml b/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml deleted file mode 100644 index 9401382847..0000000000 --- a/tests/e2e/stage_configs/qwen2_5_omni_thinker_ci.yaml +++ /dev/null @@ -1,31 +0,0 @@ -stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: thinker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 16384 - max_num_batched_tokens: 16384 - max_num_seqs: 1 - 
gpu_memory_utilization: 0.9 - skip_mm_profiling: true - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - mm_processor_cache_gb: 0 - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml deleted file mode 100644 index 08dd49de95..0000000000 --- a/tests/e2e/stage_configs/qwen3_omni_ci.yaml +++ /dev/null @@ -1,102 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. -stage_args: -- stage_id: 0 - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 5 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - max_num_batched_tokens: 32768 - max_model_len: 32768 - enable_prefix_caching: false - mm_processor_cache_gb: 0 - hf_config_name: thinker_config - tensor_parallel_size: 1 - load_format: dummy - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 150 - seed: 42 - ignore_eos: False - detokenize: True - repetition_penalty: 1.05 - -- stage_id: 1 - runtime: - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 5 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.5 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - max_model_len: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - load_format: dummy - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 1000 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - -- stage_id: 2 - runtime: - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 5 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 100000 - hf_config_name: thinker_config - async_scheduling: false - load_format: dummy - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2000 
- seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml b/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml deleted file mode 100644 index b33891fcce..0000000000 --- a/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090). -# This config is optimized for CI e2e tests. -stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: thinker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 16384 - max_num_batched_tokens: 16384 - max_num_seqs: 1 - gpu_memory_utilization: 0.8 - skip_mm_profiling: true - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - mm_processor_cache_gb: 0 - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 16384 - max_num_batched_tokens: 16384 - max_num_seqs: 1 - gpu_memory_utilization: 0.8 - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 4096 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - stage_id: 2 - runtime: - process: true - devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - max_num_batched_tokens: 4096 - max_model_len: 4096 - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 4096 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/tests/e2e/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/rocm/qwen3_omni_ci.yaml deleted file mode 100644 index ac2b1fbd71..0000000000 --- a/tests/e2e/stage_configs/rocm/qwen3_omni_ci.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker 
(multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. -stage_args: - - stage_id: 0 - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - mm_processor_cache_gb: 0 - hf_config_name: thinker_config - tensor_parallel_size: 1 - load_format: dummy - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 100 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - runtime: - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - # tensor_parallel_size: 2 - enable_prefix_caching: false - distributed_executor_backend: "mp" - hf_config_name: talker_config - load_format: dummy - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 100 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - runtime: - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 1000000 - hf_config_name: thinker_config - load_format: dummy - async_scheduling: false - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 200 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/tests/e2e/stage_configs/xpu/qwen2_5_omni_ci.yaml b/tests/e2e/stage_configs/xpu/qwen2_5_omni_ci.yaml deleted file mode 100644 index 8958eefaa3..0000000000 --- a/tests/e2e/stage_configs/xpu/qwen2_5_omni_ci.yaml +++ /dev/null @@ -1,102 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# The following config is verified with 2 * Intel Arc Pro B60 XPU. 
-stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage - engine_args: - model_stage: thinker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 16384 - max_num_batched_tokens: 16384 - max_num_seqs: 1 - gpu_memory_utilization: 0.9 # thinker weight is around 16.74GB for Qwen2.5-Omni-7B - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - mm_processor_cache_gb: 0 - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 16384 - max_num_batched_tokens: 16384 - max_num_seqs: 1 - gpu_memory_utilization: 0.5 # talker weight is 6.03GB for Qwen2.5-Omni-7B - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 4096 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "2" - engine_args: - max_num_seqs: 1 - model_stage: code2wav - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.3 # code2wav weight is around 1.46GB for Qwen2.5-Omni-7B - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/tests/e2e/stage_configs/xpu/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/xpu/qwen3_omni_ci.yaml deleted file mode 100644 index c4586e0664..0000000000 --- a/tests/e2e/stage_configs/xpu/qwen3_omni_ci.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config is verified with 8 * Intel Arc Pro B60 XPU. 
-stage_args: -- stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "0,1,2,3" - engine_args: - max_num_seqs: 1 - model_stage: thinker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 # thinker weight is around 61.08GB for Qwen3-Omni-30B-A3B-Instruct - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - max_num_batched_tokens: 4096 - max_model_len: 4096 - enable_prefix_caching: false - hf_config_name: thinker_config - tensor_parallel_size: 4 - max_cudagraph_capture_size: 0 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 100 - seed: 42 - ignore_eos: False - detokenize: True - repetition_penalty: 1.05 - -- stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "4" - engine_args: - max_num_seqs: 1 - model_stage: talker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 # talker weight is around 8.5GB for Qwen3-Omni-30B-A3B-Instruct - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 4096 - max_model_len: 4096 - distributed_executor_backend: "mp" - hf_config_name: talker_config - max_cudagraph_capture_size: 0 - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - -- stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "5" - engine_args: - max_num_seqs: 1 - model_stage: code2wav - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.3 # code2wav weight is around 0.4GB for Qwen3-Omni-30B-A3B-Instruct - skip_mm_profiling: true - distributed_executor_backend: "mp" - max_num_batched_tokens: 100000 - hf_config_name: thinker_config - async_scheduling: false - max_cudagraph_capture_size: 0 - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2000 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 565c83c1ad..4d69f24c56 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -39,21 +39,28 @@ def test_default_stage_id_is_concrete_int(): assert cfg.stage_id == 0 -def test_multimodal_kwarg_overrides(): +def test_multimodal_kwarg_overrides(mocker): """Ensure that overrides in the multimodal config are preserved.""" - # Get a 
different value than the default for a multimodal field sig = inspect.signature(OmniEngineArgs) default_mm_cache = sig.parameters["mm_processor_cache_gb"].default override_val = default_mm_cache + 1 - # NOTE: This needs to be a model that resolves to supports_multimodal=True - # in vLLM, otherwise we won't have an MM config + fake_model_config = SimpleNamespace( + multimodal_config=SimpleNamespace(mm_processor_cache_gb=override_val), + ) + + def _fake_parent_create_model_config(self): + assert self.mm_processor_cache_gb == override_val + return fake_model_config + + mocker.patch.object(EngineArgs, "create_model_config", _fake_parent_create_model_config) + mocker.patch.object(OmniModelConfig, "from_vllm_model_config", side_effect=lambda model_config, **_: model_config) + cfg = OmniEngineArgs( model="Qwen/Qwen2-VL-2B-Instruct", mm_processor_cache_gb=override_val, ).create_model_config() - # Ensure that the override was applied correctly assert cfg.multimodal_config is not None assert cfg.multimodal_config.mm_processor_cache_gb == override_val diff --git a/tests/engine/test_async_omni_engine_abort.py b/tests/engine/test_async_omni_engine_abort.py index 34fdf45ea2..e7f2bb679f 100644 --- a/tests/engine/test_async_omni_engine_abort.py +++ b/tests/engine/test_async_omni_engine_abort.py @@ -2,20 +2,24 @@ import os import sys from contextlib import ExitStack -from pathlib import Path import pytest from vllm import SamplingParams from vllm.inputs import PromptType -from tests.utils import hardware_test +# Side-effect import: registers QWEN2_5_OMNI_THINKER_ONLY_PIPELINE in the +# pipeline registry so the materialized deploy overlay below can select it +# via its top-level ``pipeline:`` field. +import vllm_omni.model_executor.models.qwen2_5_omni.pipeline # noqa: F401, E402 +from tests.utils import get_deploy_config_path, hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" SEED = 42 -stage_config = str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_thinker_ci.yaml") +# Single-stage thinker-only deploy, materialized from tests.utils._CI_OVERLAYS. +stage_config = get_deploy_config_path("ci/qwen2_5_omni_thinker_only.yaml") model = "Qwen/Qwen2.5-Omni-7B" diff --git a/tests/examples/online_serving/test_qwen2_5_omni.py b/tests/examples/online_serving/test_qwen2_5_omni.py index a78ccf5924..2813b2fda8 100644 --- a/tests/examples/online_serving/test_qwen2_5_omni.py +++ b/tests/examples/online_serving/test_qwen2_5_omni.py @@ -5,8 +5,6 @@ import os -from vllm_omni.platforms import current_omni_platform - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" from pathlib import Path @@ -19,19 +17,15 @@ run_cmd, strip_trailing_audio_saved_line, ) -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test pytestmark = [pytest.mark.advanced_model, pytest.mark.example] models = ["Qwen/Qwen2.5-Omni-7B"] - -stage_configs = [str(Path(__file__).parent.parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")] - -if current_omni_platform.is_xpu(): - stage_configs = [ - str(Path(__file__).parent.parent.parent / "e2e" / "stage_configs" / "xpu" / "qwen2_5_omni_ci.yaml") - ] +# Single CI deploy YAML; rocm/xpu deltas are picked automatically via the +# platforms: section in vllm_omni/deploy/ci/qwen2_5_omni.yaml. 
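The abort test above imports `qwen2_5_omni.pipeline` purely for its side effect: registration happens at module level, so a single import populates the pipeline registry keyed by `model_type`, and a deploy YAML can then select the pipeline by name. A sketch using the same helpers the config-factory tests below exercise; the throwaway `__demo__` pipeline is hypothetical.

```python
from vllm_omni.config.stage_config import (
    _PIPELINE_REGISTRY,
    PipelineConfig,
    StagePipelineConfig,
    register_pipeline,
)

demo = PipelineConfig(
    model_type="__demo__",
    model_arch="DemoArch",
    stages=(StagePipelineConfig(stage_id=0, model_stage="thinker"),),
)
register_pipeline(demo)
assert _PIPELINE_REGISTRY["__demo__"] is demo
del _PIPELINE_REGISTRY["__demo__"]  # keep the module-global registry clean
```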
+stage_configs = [get_deploy_config_path("ci/qwen2_5_omni.yaml")] example_dir = str(Path(__file__).parent.parent.parent.parent / "examples" / "online_serving") # Create parameter combinations for model and stage config diff --git a/tests/examples/online_serving/test_qwen3_omni.py b/tests/examples/online_serving/test_qwen3_omni.py index 65f99d7bf2..e9ee2763bb 100644 --- a/tests/examples/online_serving/test_qwen3_omni.py +++ b/tests/examples/online_serving/test_qwen3_omni.py @@ -5,8 +5,6 @@ import os -from vllm_omni.platforms import current_omni_platform - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" from pathlib import Path @@ -19,17 +17,14 @@ run_cmd, strip_trailing_audio_saved_line, ) -from tests.utils import hardware_test +from tests.utils import get_deploy_config_path, hardware_test pytestmark = [pytest.mark.advanced_model, pytest.mark.example] models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] -stage_configs = [str(Path(__file__).parent.parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")] - -if current_omni_platform.is_xpu(): - stage_configs = [str(Path(__file__).parent.parent.parent / "e2e" / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml")] +stage_configs = [get_deploy_config_path("ci/qwen3_omni_moe.yaml")] example_dir = str(Path(__file__).parent.parent.parent.parent / "examples" / "online_serving") diff --git a/tests/test_arg_utils.py b/tests/test_arg_utils.py new file mode 100644 index 0000000000..dab5ed6878 --- /dev/null +++ b/tests/test_arg_utils.py @@ -0,0 +1,353 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for vllm_omni.engine.arg_utils — invariants that must +hold for the orchestrator/engine/server CLI flag partition.""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, fields + +import pytest + +from vllm_omni.engine.arg_utils import ( + SHARED_FIELDS, + derive_server_dests_from_vllm_parser, + internal_blacklist_keys, + orchestrator_args_from_argparse, + orchestrator_field_names, + split_kwargs, +) + +# --------------------------------------------------------------------------- +# Fake engine class for unit testing — avoids pulling in the full vllm +# EngineArgs and its heavy __post_init__ at test time. +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeEngineArgs: + """Stand-in for OmniEngineArgs with a representative subset of fields.""" + + model: str = "" + stage_id: int = 0 + max_num_seqs: int = 64 + gpu_memory_utilization: float = 0.9 + async_chunk: bool = False # also in OrchestratorArgs → shared + log_stats: bool = False # also in OrchestratorArgs → shared + stage_configs_path: str | None = None + + +# ============================================================================ +# Invariant 1 — OrchestratorArgs and engine must not ambiguously overlap. +# ============================================================================ + + +def test_no_ambiguous_overlap_with_fake_engine(): + """OrchestratorArgs ∩ engine fields must be ⊆ SHARED_FIELDS.""" + orch = orchestrator_field_names() + engine = {f.name for f in fields(_FakeEngineArgs)} + overlap = orch & engine + unexpected = overlap - SHARED_FIELDS + assert not unexpected, ( + f"Fields declared in both OrchestratorArgs and the engine class " + f"but not in SHARED_FIELDS: {sorted(unexpected)}. These cause " + f"double-routing — either remove the duplicate declaration or add " + f"to SHARED_FIELDS if sharing is intentional." 
+ ) + + +def test_no_ambiguous_overlap_with_real_engine(): + """Same check, but against the real OmniEngineArgs.""" + try: + from vllm_omni.engine.arg_utils import OmniEngineArgs + except Exception as exc: + pytest.skip(f"OmniEngineArgs not importable: {exc}") + + orch = orchestrator_field_names() + engine = {f.name for f in fields(OmniEngineArgs)} + overlap = orch & engine + unexpected = overlap - SHARED_FIELDS + assert not unexpected, ( + f"Real OmniEngineArgs has ambiguous overlap with OrchestratorArgs: " + f"{sorted(unexpected)}. Update SHARED_FIELDS or remove duplication." + ) + + +# ============================================================================ +# Invariant 2 — split_kwargs partitions correctly. +# ============================================================================ + + +def test_split_orchestrator_only(): + """Pure orchestrator fields go to OrchestratorArgs, not engine_kwargs.""" + raw = {"stage_init_timeout": 500, "worker_backend": "ray"} + orch, engine = split_kwargs(raw, engine_cls=_FakeEngineArgs) + assert orch.stage_init_timeout == 500 + assert orch.worker_backend == "ray" + assert "stage_init_timeout" not in engine + assert "worker_backend" not in engine + + +def test_split_engine_only(): + """Pure engine fields go to engine_kwargs, not OrchestratorArgs.""" + raw = {"max_num_seqs": 128, "gpu_memory_utilization": 0.85} + orch, engine = split_kwargs(raw, engine_cls=_FakeEngineArgs) + assert engine["max_num_seqs"] == 128 + assert engine["gpu_memory_utilization"] == 0.85 + # These fields don't exist on OrchestratorArgs at all. + + +def test_split_shared_fields_go_to_both(): + """Fields in SHARED_FIELDS are copied to both buckets.""" + raw = {"model": "Qwen/Qwen2.5-Omni-7B", "log_stats": True} + orch, engine = split_kwargs(raw, engine_cls=_FakeEngineArgs) + assert orch.log_stats is True + assert engine["model"] == "Qwen/Qwen2.5-Omni-7B" + assert engine["log_stats"] is True + + +def test_split_drops_unclassified(): + """Unclassified fields (uvicorn/server) are dropped silently.""" + raw = { + "max_num_seqs": 64, # engine + "host": "0.0.0.0", # unclassified (server) + "port": 8091, # unclassified (server) + "ssl_keyfile": "key.pem", # unclassified (server) + } + orch, engine = split_kwargs(raw, engine_cls=_FakeEngineArgs) + assert engine == {"max_num_seqs": 64} + assert "host" not in engine + assert "port" not in engine + assert "ssl_keyfile" not in engine + + +def test_split_mixed_real_world(): + """End-to-end: raw CLI kwargs with all three classes present.""" + raw = { + # orchestrator + "stage_init_timeout": 400, + "deploy_config": "/tmp/deploy.yaml", + "worker_backend": "multi_process", + "async_chunk": True, + # engine + "max_num_seqs": 32, + "gpu_memory_utilization": 0.8, + # shared + "model": "Qwen/Qwen3-Omni", + "log_stats": False, + # server / unclassified + "host": "0.0.0.0", + "port": 8091, + "api_key": "secret", + # None values + "ray_address": None, + } + orch, engine = split_kwargs(raw, engine_cls=_FakeEngineArgs) + + # Orchestrator side + assert orch.stage_init_timeout == 400 + assert orch.deploy_config == "/tmp/deploy.yaml" + assert orch.worker_backend == "multi_process" + assert orch.async_chunk is True + assert orch.log_stats is False # shared, read from raw + assert orch.ray_address is None # default preserved + + # Engine side + assert engine["max_num_seqs"] == 32 + assert engine["gpu_memory_utilization"] == 0.8 + assert engine["model"] == "Qwen/Qwen3-Omni" + assert engine["log_stats"] is False + assert "host" not in engine + assert "port" 
not in engine + assert "api_key" not in engine + # orchestrator-only keys never reach engine + assert "stage_init_timeout" not in engine + assert "deploy_config" not in engine + assert "async_chunk" not in engine + + +# ============================================================================ +# Invariant 3 — user-typed unclassifiable flags warn (don't fail silently). +# ============================================================================ + + +def test_user_typed_unclassified_warns(caplog): + """If the user types a flag we can't route, warn — don't silently drop.""" + raw = {"bogus_flag": "value", "max_num_seqs": 64} + with caplog.at_level(logging.WARNING, logger="vllm_omni.engine.arg_utils"): + split_kwargs(raw, engine_cls=_FakeEngineArgs, user_typed={"bogus_flag"}) + assert any("bogus_flag" in rec.message for rec in caplog.records), ( + f"Expected warning mentioning 'bogus_flag', got: {[rec.message for rec in caplog.records]}" + ) + + +def test_unclassified_without_user_typed_silent(caplog): + """Without user_typed, unclassified keys drop silently (argparse defaults + for server flags shouldn't spam logs on every launch).""" + raw = {"host": "0.0.0.0", "port": 8091, "max_num_seqs": 64} + with caplog.at_level(logging.WARNING, logger="vllm_omni.engine.arg_utils"): + split_kwargs(raw, engine_cls=_FakeEngineArgs, user_typed=None) + # No warnings because we don't know these were user-typed. + assert not any("host" in rec.message or "port" in rec.message for rec in caplog.records) + + +# ============================================================================ +# Invariant 4 — CLI flag classification completeness. +# Catches new flags added without updating OrchestratorArgs or OmniEngineArgs. +# ============================================================================ + + +def test_all_omni_cli_flags_classified(): + """Every vllm-omni-added CLI flag must be classifiable. + + Runs ``OmniServeCommand.subparser_init`` and checks that every new + argument (compared to vllm's base parser) is either: + - a field on OrchestratorArgs, OR + - a field on OmniEngineArgs, OR + - in SHARED_FIELDS + """ + try: + from vllm.utils.argparse_utils import FlexibleArgumentParser + + from vllm_omni.engine.arg_utils import OmniEngineArgs + from vllm_omni.entrypoints.cli.serve import OmniServeCommand + except Exception as exc: + pytest.skip(f"Cannot build parser in this environment: {exc}") + + # Build the serve parser + root = FlexibleArgumentParser() + subparsers = root.add_subparsers() + cmd = OmniServeCommand() + try: + parser = cmd.subparser_init(subparsers) + except Exception as exc: + pytest.skip(f"subparser_init failed (dev env issue): {exc}") + + all_dests = {a.dest for a in parser._actions if a.dest and a.dest not in {"help", "model_tag"}} + + orch = orchestrator_field_names() + engine = {f.name for f in fields(OmniEngineArgs)} + server_derived = derive_server_dests_from_vllm_parser() + + unclassified = all_dests - orch - engine - SHARED_FIELDS - server_derived + # Some argparse-internal dests (suppressed, private) may not match — + # filter those out. + unclassified = {d for d in unclassified if not d.startswith("_")} + + assert not unclassified, ( + f"These CLI flags are not classified as " + f"orchestrator/engine/shared/server: {sorted(unclassified)}. " + f"Add them to OrchestratorArgs (if consumed by orchestrator), " + f"OmniEngineArgs (if consumed by per-stage engine), or the known-server " + f"allowlist (if they're vllm frontend flags). " + f"If intentional (e.g. 
a new CLI-only flag that doesn't map to either " + f"dataclass), add it to a KNOWN_UNROUTED allowlist." + ) + + +# ============================================================================ +# argparse interop (Phase 3). +# ============================================================================ + + +def test_orchestrator_args_from_argparse(): + """Can build OrchestratorArgs from an argparse.Namespace.""" + import argparse + + ns = argparse.Namespace( + stage_init_timeout=500, + deploy_config="/tmp/x.yaml", + max_num_seqs=64, # engine field — ignored + host="0.0.0.0", # server field — ignored + ) + orch = orchestrator_args_from_argparse(ns) + assert orch.stage_init_timeout == 500 + assert orch.deploy_config == "/tmp/x.yaml" + assert orch.worker_backend == "multi_process" # default + + +def test_derive_server_dests_returns_frozenset(): + """Server-dest derivation returns a frozenset (possibly empty).""" + result = derive_server_dests_from_vllm_parser() + assert isinstance(result, frozenset) + + +# ============================================================================ +# internal_blacklist_keys — single source of truth for per-stage forwarding. +# ============================================================================ + + +def test_internal_blacklist_keys_derived_from_orchestrator(): + """Blacklist is exactly OrchestratorArgs fields minus SHARED_FIELDS. + + This function replaces the old hardcoded INTERNAL_STAGE_OVERRIDE_KEYS + frozenset. Asserts the contract so future changes to OrchestratorArgs + automatically propagate to the blacklist. + """ + blacklist = internal_blacklist_keys() + assert blacklist == orchestrator_field_names() - SHARED_FIELDS + # Spot-check expected entries + assert "stage_init_timeout" in blacklist + assert "deploy_config" in blacklist + assert "async_chunk" in blacklist + # Shared fields must NOT appear — they flow to both orchestrator and engine + assert "model" not in blacklist + assert "log_stats" not in blacklist + + +# ============================================================================ +# Boundary value analysis — edge cases around split_kwargs. +# ============================================================================ + + +def test_split_empty_kwargs(): + """Empty kwargs yields default OrchestratorArgs and empty engine dict.""" + orch, engine = split_kwargs({}, engine_cls=_FakeEngineArgs) + assert orch.stage_init_timeout == 300 # dataclass default + assert orch.worker_backend == "multi_process" # dataclass default + assert engine == {} + + +def test_split_all_none_values_preserved_on_orchestrator(): + """None values for orchestrator fields are kept (represents 'not set').""" + raw = {"ray_address": None, "deploy_config": None, "max_num_seqs": None} + orch, engine = split_kwargs(raw, engine_cls=_FakeEngineArgs) + assert orch.ray_address is None + assert orch.deploy_config is None + # Engine-side None still passes through; caller decides semantics downstream. 
+ assert engine.get("max_num_seqs") is None + + +def test_split_user_typed_with_empty_kwargs_no_warn(caplog): + """user_typed non-empty but kwargs empty — no warnings emitted.""" + with caplog.at_level(logging.WARNING, logger="vllm_omni.engine.arg_utils"): + split_kwargs({}, engine_cls=_FakeEngineArgs, user_typed={"nothing"}) + assert not caplog.records + + +def test_ambiguous_field_strict_raises(): + """strict=True raises ValueError on overlap outside SHARED_FIELDS.""" + + # deploy_config is on OrchestratorArgs; declaring it on the engine class + # too (without adding to SHARED_FIELDS) creates an ambiguous route. + @dataclass + class _AmbiguousEngine: + deploy_config: str | None = None + + with pytest.raises(ValueError, match="both OrchestratorArgs and"): + split_kwargs({"deploy_config": "x"}, engine_cls=_AmbiguousEngine, strict=True) + + +def test_ambiguous_field_non_strict_routes_to_orchestrator(caplog): + """strict=False logs ERROR but routes the ambiguous field to orchestrator.""" + + @dataclass + class _AmbiguousEngine: + deploy_config: str | None = None + + with caplog.at_level(logging.ERROR, logger="vllm_omni.engine.arg_utils"): + orch, engine = split_kwargs({"deploy_config": "x"}, engine_cls=_AmbiguousEngine, strict=False) + assert orch.deploy_config == "x" + assert "deploy_config" not in engine + assert any("both OrchestratorArgs" in r.message for r in caplog.records) diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py index e284de48d0..2e2bdc75dc 100644 --- a/tests/test_config_factory.py +++ b/tests/test_config_factory.py @@ -4,12 +4,26 @@ Unit tests for StageConfigFactory and related classes. """ +from dataclasses import dataclass +from pathlib import Path + +import pytest + from vllm_omni.config.stage_config import ( + _EXECUTION_TYPE_TO_SCHEDULER, + _PIPELINE_REGISTRY, ModelPipeline, + PipelineConfig, StageConfig, StageConfigFactory, + StageExecutionType, + StagePipelineConfig, StageType, + build_stage_runtime_overrides, + register_pipeline, + strip_parent_engine_args, ) +from vllm_omni.engine.arg_utils import internal_blacklist_keys class TestStageType: @@ -241,8 +255,9 @@ def test_default_diffusion_no_yaml(self): def test_default_diffusion_with_parallel_config(self): """Test diffusion config calculates devices from parallel_config.""" + @dataclass class MockParallelConfig: - world_size = 4 + world_size: int = 4 kwargs = { "parallel_config": MockParallelConfig(), @@ -270,7 +285,7 @@ def test_cli_override_forwards_engine_registered_args(self): stage = StageConfig(stage_id=0, model_stage="thinker", input_sources=[]) cli_overrides = { "gpu_memory_utilization": 0.9, # Well-known param - "custom_engine_flag": True, # Not in _INTERNAL_KEYS, so forwarded + "custom_engine_flag": True, # Not orchestrator-owned, so forwarded } overrides = StageConfigFactory._merge_cli_overrides(stage, cli_overrides) @@ -311,6 +326,53 @@ def test_per_stage_override_excludes_internal_keys(self): assert "batch_timeout" not in overrides +class TestStageResolutionHelpers: + """Tests for shared stage override / filtering helpers.""" + + def test_build_stage_runtime_overrides_ignores_other_stage_and_internal_keys(self): + overrides = build_stage_runtime_overrides( + 0, + { + "gpu_memory_utilization": 0.5, + "stage_0_gpu_memory_utilization": 0.9, + "stage_1_gpu_memory_utilization": 0.1, + "stage_0_model": "should_be_ignored", + "parallel_config": {"world_size": 2}, + }, + internal_keys=internal_blacklist_keys(), + ) + + assert overrides["gpu_memory_utilization"] == 0.9 + assert "model" 
not in overrides + assert "parallel_config" not in overrides + + def test_strip_parent_engine_args_reports_only_surprising_parent_overrides(self): + from dataclasses import fields as dc_fields + + from vllm.engine.arg_utils import EngineArgs + + parent_fields = {f.name: f for f in dc_fields(EngineArgs)} + filtered, overridden = strip_parent_engine_args( + { + "model": "some/model", + "stage_configs_path": "/tmp/stages.yaml", + "tensor_parallel_size": 4, + "worker_extension_cls": "some.Extension", + "custom_pipeline_args": {"pipeline_class": "demo.Pipeline"}, + }, + parent_fields=parent_fields, + keep_keys={"worker_extension_cls"}, + strip_keys={"stage_configs_path"}, + no_warn_keys={"model"}, + ) + + assert filtered == { + "worker_extension_cls": "some.Extension", + "custom_pipeline_args": {"pipeline_class": "demo.Pipeline"}, + } + assert overridden == ["tensor_parallel_size"] + + class TestPipelineYamlParsing: """Tests for pipeline YAML file parsing (@ZJY0516).""" @@ -609,16 +671,606 @@ def test_parse_missing_async_chunk_defaults_false(self, tmp_path): assert pipeline.async_chunk is False -class TestArchitectureFallback: - """Tests for architecture-based model detection fallback.""" +class TestPipelineDiscovery: + """Tests for auto-discovery of pipelines from models/*/pipeline.py.""" + + def test_discover_populates_registry_with_known_models(self): + """``_discover_all_pipelines`` imports every pipeline.py so the + registry is populated with the built-in models after one call.""" + from vllm_omni.config.stage_config import _discover_all_pipelines + + _discover_all_pipelines() + # These models have a pipeline.py in-tree and must be registered. + assert "qwen2_5_omni" in _PIPELINE_REGISTRY + assert "qwen3_omni_moe" in _PIPELINE_REGISTRY + assert "qwen3_tts" in _PIPELINE_REGISTRY + + def test_pipeline_config_supports_hf_architectures(self): + """PipelineConfig accepts hf_architectures for HF-arch fallback + (replaces the old _ARCHITECTURE_MODELS dict).""" + p = PipelineConfig( + model_type="custom_collide", + hf_architectures=("SomeCollidingArch",), + ) + assert p.hf_architectures == ("SomeCollidingArch",) + + +class TestStagePipelineConfig: + def test_frozen(self): + s = StagePipelineConfig(stage_id=0, model_stage="a") + with pytest.raises(AttributeError): + s.model_stage = "changed" + + def test_defaults(self): + s = StagePipelineConfig(stage_id=0, model_stage="a") + assert s.execution_type == StageExecutionType.LLM_AR + assert s.input_sources == () + assert s.final_output is False + assert s.sampling_constraints == {} + assert s.engine_output_type is None + + +class TestPipelineConfigNew: + def test_frozen(self): + p = PipelineConfig(model_type="t", model_arch="A") + with pytest.raises(AttributeError): + p.model_type = "changed" + + def test_validate_valid(self): + p = PipelineConfig( + model_type="t", + model_arch="A", + stages=( + StagePipelineConfig(stage_id=0, model_stage="a"), + StagePipelineConfig(stage_id=1, model_stage="b", input_sources=(0,)), + ), + ) + assert p.validate() == [] + + def test_validate_no_stages(self): + p = PipelineConfig(model_type="t", model_arch="A") + assert any("no stages" in e.lower() for e in p.validate()) + + def test_get_scheduler_cls(self): + p = PipelineConfig( + model_type="t", + model_arch="A", + stages=( + StagePipelineConfig(stage_id=0, model_stage="a", execution_type=StageExecutionType.LLM_AR), + StagePipelineConfig( + stage_id=1, model_stage="b", execution_type=StageExecutionType.LLM_GENERATION, input_sources=(0,) + ), + ), + ) + assert 
"OmniARScheduler" in p.get_scheduler_cls(0) + assert "OmniGenerationScheduler" in p.get_scheduler_cls(1) + + +class TestExecutionTypeToScheduler: + def test_all_types_mapped(self): + for et in StageExecutionType: + assert et in _EXECUTION_TYPE_TO_SCHEDULER + + +class TestPipelineRegistry: + def test_register_and_lookup(self): + p = PipelineConfig( + model_type="__test_only__", + model_arch="A", + stages=(StagePipelineConfig(stage_id=0, model_stage="a"),), + ) + register_pipeline(p) + assert _PIPELINE_REGISTRY["__test_only__"] is p + del _PIPELINE_REGISTRY["__test_only__"] + + +class TestDeployConfigLoading: + def test_load_deploy_config(self): + from pathlib import Path + + from vllm_omni.config.stage_config import load_deploy_config + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Deploy config not found") + + deploy = load_deploy_config(deploy_path) + assert len(deploy.stages) == 3 + assert deploy.async_chunk is True + assert deploy.connectors is not None + assert deploy.platforms is not None + + def test_merge_pipeline_deploy(self): + from pathlib import Path + + import vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + from vllm_omni.config.stage_config import load_deploy_config, merge_pipeline_deploy + + pipeline = _PIPELINE_REGISTRY["qwen3_omni_moe"] + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Deploy config not found") + + deploy = load_deploy_config(deploy_path) + stages = merge_pipeline_deploy(pipeline, deploy) + + assert len(stages) == 3 + s0 = stages[0] + assert s0.model_stage == "thinker" + assert s0.yaml_engine_args["model_arch"] == "Qwen3OmniMoeForConditionalGeneration" + assert s0.yaml_engine_args["engine_output_type"] == "latent" + assert s0.yaml_extras["default_sampling_params"]["detokenize"] is True + + +class TestQwen3OmniPipeline: + def test_registered(self): + import vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + + p = _PIPELINE_REGISTRY.get("qwen3_omni_moe") + assert p is not None + assert p.model_arch == "Qwen3OmniMoeForConditionalGeneration" + assert len(p.stages) == 3 + assert p.validate() == [] + + def test_thinker(self): + import vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + + s = _PIPELINE_REGISTRY["qwen3_omni_moe"].get_stage(0) + assert s.model_stage == "thinker" + assert s.execution_type == StageExecutionType.LLM_AR + assert s.owns_tokenizer is True + assert s.engine_output_type == "latent" + assert s.sampling_constraints["detokenize"] is True + + def test_talker(self): + import vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + + s = _PIPELINE_REGISTRY["qwen3_omni_moe"].get_stage(1) + assert s.input_sources == (0,) + assert s.sampling_constraints["stop_token_ids"] == [2150] + assert s.custom_process_input_func is not None + assert s.custom_process_next_stage_input_func is not None + + def test_code2wav(self): + import vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + + s = _PIPELINE_REGISTRY["qwen3_omni_moe"].get_stage(2) + assert s.execution_type == StageExecutionType.LLM_GENERATION + assert s.final_output_type == "audio" + assert s.custom_process_input_func is not None + + +class TestQwen2_5OmniPipeline: + def test_registered(self): + import vllm_omni.model_executor.models.qwen2_5_omni.pipeline # noqa: F401 + + p = _PIPELINE_REGISTRY.get("qwen2_5_omni") + assert p is not None + assert 
p.model_arch == "Qwen2_5OmniForConditionalGeneration" + assert len(p.stages) == 3 + assert p.validate() == [] + + def test_thinker(self): + import vllm_omni.model_executor.models.qwen2_5_omni.pipeline # noqa: F401 + + s = _PIPELINE_REGISTRY["qwen2_5_omni"].get_stage(0) + assert s.model_stage == "thinker" + assert s.execution_type == StageExecutionType.LLM_AR + assert s.owns_tokenizer is True + assert s.engine_output_type == "latent" + assert s.requires_multimodal_data is True + + def test_talker(self): + import vllm_omni.model_executor.models.qwen2_5_omni.pipeline # noqa: F401 + + s = _PIPELINE_REGISTRY["qwen2_5_omni"].get_stage(1) + assert s.input_sources == (0,) + assert s.sampling_constraints["stop_token_ids"] == [8294] + assert s.custom_process_input_func is not None + + def test_code2wav(self): + import vllm_omni.model_executor.models.qwen2_5_omni.pipeline # noqa: F401 + + s = _PIPELINE_REGISTRY["qwen2_5_omni"].get_stage(2) + assert s.execution_type == StageExecutionType.LLM_GENERATION + assert s.final_output_type == "audio" + assert s.engine_output_type == "audio" + + +class TestQwen3TTSPipeline: + def test_registered(self): + import vllm_omni.model_executor.models.qwen3_tts.pipeline # noqa: F401 + + p = _PIPELINE_REGISTRY.get("qwen3_tts") + assert p is not None + assert p.model_arch == "Qwen3TTSTalkerForConditionalGeneration" + assert len(p.stages) == 2 + assert p.validate() == [] + + def test_talker_stage(self): + import vllm_omni.model_executor.models.qwen3_tts.pipeline # noqa: F401 + + s = _PIPELINE_REGISTRY["qwen3_tts"].get_stage(0) + assert s.model_stage == "qwen3_tts" + assert s.execution_type == StageExecutionType.LLM_AR + assert s.owns_tokenizer is True + assert s.engine_output_type == "latent" + assert s.sampling_constraints["stop_token_ids"] == [2150] + # Stage 0 inherits the pipeline-level model_arch + assert s.model_arch is None + + def test_code2wav_stage_has_per_stage_model_arch(self): + import vllm_omni.model_executor.models.qwen3_tts.pipeline # noqa: F401 + + s = _PIPELINE_REGISTRY["qwen3_tts"].get_stage(1) + assert s.execution_type == StageExecutionType.LLM_GENERATION + assert s.final_output_type == "audio" + assert s.engine_output_type == "audio" + # Per-stage model_arch override (different from pipeline-level talker) + assert s.model_arch == "Qwen3TTSCode2Wav" + # tts_args is passed through via extras + assert s.extras["tts_args"]["max_instructions_length"] == 500 + + def test_per_stage_model_arch_flows_through_merge(self, tmp_path): + """Verify the new ps.model_arch override survives merge_pipeline_deploy.""" + import vllm_omni.model_executor.models.qwen3_tts.pipeline # noqa: F401 + from vllm_omni.config.stage_config import load_deploy_config, merge_pipeline_deploy + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_tts.yaml" + if not deploy_path.exists(): + pytest.skip("qwen3_tts deploy yaml not found") + + deploy = load_deploy_config(deploy_path) + pipeline = _PIPELINE_REGISTRY["qwen3_tts"] + stages = merge_pipeline_deploy(pipeline, deploy) + + # Stage 0 inherits pipeline-level model_arch + assert stages[0].yaml_engine_args["model_arch"] == "Qwen3TTSTalkerForConditionalGeneration" + # Stage 1 uses its per-stage override + assert stages[1].yaml_engine_args["model_arch"] == "Qwen3TTSCode2Wav" + + +class TestBaseConfigInheritance: + """Test deploy YAML base_config inheritance.""" + + def test_ci_inherits_from_main(self): + from tests.utils import get_deploy_config_path + from vllm_omni.config.stage_config import load_deploy_config + + 
ci_path = Path(get_deploy_config_path("ci/qwen3_omni_moe.yaml")) + if not ci_path.exists(): + pytest.skip("CI deploy config not found") + + deploy = load_deploy_config(ci_path) + assert len(deploy.stages) == 3 + # CI overrides + assert deploy.stages[0].engine_extras.get("load_format") == "dummy" + assert deploy.stages[0].max_num_seqs == 5 + # Inherited from base + assert deploy.stages[0].gpu_memory_utilization == 0.9 + assert deploy.connectors is not None + assert "connector_of_shared_memory" in deploy.connectors + assert deploy.async_chunk is True + + def test_ci_sampling_merge(self): + from tests.utils import get_deploy_config_path + from vllm_omni.config.stage_config import load_deploy_config + + ci_path = Path(get_deploy_config_path("ci/qwen3_omni_moe.yaml")) + if not ci_path.exists(): + pytest.skip("CI deploy config not found") + + deploy = load_deploy_config(ci_path) + s0 = deploy.stages[0].default_sampling_params + # CI overrides max_tokens + assert s0["max_tokens"] == 150 + # Inherited from base + assert s0["temperature"] == 0.4 + assert s0["seed"] == 42 + + def test_pure_inheritance_overlay(self, tmp_path): + """An overlay with only ``base_config`` inherits everything.""" + from vllm_omni.config.stage_config import load_deploy_config + + base = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not base.exists(): + pytest.skip("Base deploy config not found") + + overlay = tmp_path / "overlay.yaml" + overlay.write_text(f"base_config: {base}\n") + + deploy = load_deploy_config(overlay) + assert len(deploy.stages) == 3 + assert deploy.stages[0].gpu_memory_utilization == 0.9 + + def test_single_field_overlay(self, tmp_path): + """An overlay overriding one stage field merges with the base.""" + from vllm_omni.config.stage_config import load_deploy_config + + base = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not base.exists(): + pytest.skip("Base deploy config not found") + + overlay = tmp_path / "overlay.yaml" + overlay.write_text(f"base_config: {base}\nstages:\n - stage_id: 2\n max_num_batched_tokens: 1000000\n") + + deploy = load_deploy_config(overlay) + assert deploy.stages[2].max_num_batched_tokens == 1000000 + # Rest inherited + assert deploy.stages[0].gpu_memory_utilization == 0.9 + + +class TestPlatformOverrides: + """Test platform-specific deploy config overrides.""" + + def test_npu_overrides(self): + from pathlib import Path + + from vllm_omni.config.stage_config import _apply_platform_overrides, load_deploy_config + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Deploy config not found") + + deploy = load_deploy_config(deploy_path) + deploy = _apply_platform_overrides(deploy, platform="npu") + + assert deploy.stages[0].gpu_memory_utilization == 0.6 + assert deploy.stages[0].tensor_parallel_size == 2 + assert deploy.stages[0].devices == "0,1" + # Stage 2 unaffected fields stay at base + assert deploy.stages[2].enforce_eager is True + + def test_xpu_overrides(self): + from pathlib import Path + + from vllm_omni.config.stage_config import _apply_platform_overrides, load_deploy_config + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Deploy config not found") + + deploy = load_deploy_config(deploy_path) + deploy = _apply_platform_overrides(deploy, platform="xpu") + + assert deploy.stages[0].tensor_parallel_size == 4 + assert 
deploy.stages[0].devices == "0,1,2,3" + assert deploy.stages[0].engine_extras.get("max_cudagraph_capture_size") == 0 + + def test_unknown_platform_noop(self): + from pathlib import Path + + from vllm_omni.config.stage_config import _apply_platform_overrides, load_deploy_config + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Deploy config not found") + + deploy = load_deploy_config(deploy_path) + original_mem = deploy.stages[0].gpu_memory_utilization + deploy = _apply_platform_overrides(deploy, platform="unknown_hw") + assert deploy.stages[0].gpu_memory_utilization == original_mem + + def test_platforms_deep_merge_inheritance(self, tmp_path): + """Overlay's platforms: block layers onto base's, per-stage.""" + from vllm_omni.config.stage_config import _apply_platform_overrides, load_deploy_config + + base = tmp_path / "base.yaml" + base.write_text( + "stages:\n" + " - stage_id: 0\n" + " gpu_memory_utilization: 0.9\n" + "platforms:\n" + " rocm:\n" + " stages:\n" + " - stage_id: 0\n" + " enforce_eager: true\n" + ) + overlay = tmp_path / "overlay.yaml" + overlay.write_text( + f"base_config: {base.name}\n" + "platforms:\n" + " rocm:\n" + " stages:\n" + " - stage_id: 0\n" + " max_num_seqs: 1\n" + ) + + deploy = load_deploy_config(overlay) + deploy = _apply_platform_overrides(deploy, platform="rocm") + # Both base's enforce_eager and overlay's max_num_seqs should apply. + assert deploy.stages[0].enforce_eager is True + assert deploy.stages[0].max_num_seqs == 1 + # Inherited stage default not touched by overlay platforms section. + assert deploy.stages[0].gpu_memory_utilization == 0.9 + + +class TestCLIOverrideFlow: + """Test --stage-overrides JSON merge into StageConfig.""" + + def test_stage_overrides_merge(self): + from pathlib import Path + + import vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + from vllm_omni.config.stage_config import load_deploy_config, merge_pipeline_deploy + + pipeline = _PIPELINE_REGISTRY["qwen3_omni_moe"] + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Deploy config not found") + + deploy = load_deploy_config(deploy_path) + stages = merge_pipeline_deploy(pipeline, deploy) + + # Simulate --stage-overrides '{"0": {"gpu_memory_utilization": 0.5}}' + overrides = {"stage_0_gpu_memory_utilization": 0.5} + stages[0].runtime_overrides = StageConfigFactory._merge_cli_overrides(stages[0], overrides) + assert stages[0].runtime_overrides["gpu_memory_utilization"] == 0.5 + + def test_global_override_applies_to_all(self): + from pathlib import Path + + import vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + from vllm_omni.config.stage_config import load_deploy_config, merge_pipeline_deploy + + pipeline = _PIPELINE_REGISTRY["qwen3_omni_moe"] + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Deploy config not found") + + deploy = load_deploy_config(deploy_path) + stages = merge_pipeline_deploy(pipeline, deploy) + + overrides = {"enforce_eager": True} + for s in stages: + s.runtime_overrides = StageConfigFactory._merge_cli_overrides(s, overrides) + assert s.runtime_overrides["enforce_eager"] is True + + +class TestCLIExplicitPrecedence: + """Verify YAML > argparse defaults; explicit CLI args > YAML.""" + + def _stages(self, cli_overrides, cli_explicit_keys): + import 
vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + + return StageConfigFactory._create_from_registry( + "qwen3_omni_moe", + cli_overrides=cli_overrides, + cli_explicit_keys=cli_explicit_keys, + ) + + def test_explicit_cli_overrides_yaml(self): + """User-typed --max-num-seqs wins over the deploy YAML value.""" + stages = self._stages( + cli_overrides={"max_num_seqs": 999}, + cli_explicit_keys={"max_num_seqs"}, + ) + # Stage 2 yaml has max_num_seqs=1; explicit CLI must beat it. + assert stages[2].runtime_overrides.get("max_num_seqs") == 999 + + def test_default_cli_does_not_override_yaml(self): + """Argparse defaults must NOT clobber values that are present in YAML.""" + stages = self._stages( + cli_overrides={"max_num_seqs": 256}, + cli_explicit_keys=set(), # user typed nothing + ) + # Stage 2's YAML value (1) should win because the user didn't type --max-num-seqs. + assert stages[2].runtime_overrides.get("max_num_seqs") != 256 + + def test_default_cli_fills_missing_yaml_field(self): + """Argparse defaults still fill fields the YAML doesn't set.""" + stages = self._stages( + cli_overrides={"some_unrelated_knob": "fallback"}, + cli_explicit_keys=set(), + ) + # Field absent from YAML → CLI default flows through as a fallback. + assert stages[0].runtime_overrides.get("some_unrelated_knob") == "fallback" + + def test_per_stage_overrides_always_explicit(self): + """``stage__*`` keys are always treated as explicit.""" + stages = self._stages( + cli_overrides={"stage_0_gpu_memory_utilization": 0.42}, + cli_explicit_keys=set(), # not in the explicit set, but per-stage + ) + assert stages[0].runtime_overrides.get("gpu_memory_utilization") == 0.42 + + def test_none_explicit_set_treats_all_as_explicit(self): + """Programmatic Omni() callers (cli_explicit_keys=None) keep current behavior.""" + stages = self._stages( + cli_overrides={"max_num_seqs": 999}, + cli_explicit_keys=None, + ) + assert stages[2].runtime_overrides.get("max_num_seqs") == 999 + + def test_explicit_async_chunk_false_overrides_yaml(self): + """``--no-async-chunk`` flips the deploy-level async_chunk to False even + when the YAML sets it to True. Verifies that the per-stage + ``async_chunk: True`` injection in ``merge_pipeline_deploy`` is skipped + and that ``async_chunk`` does not leak through ``_merge_cli_overrides``. + """ + stages = self._stages( + cli_overrides={"async_chunk": False}, + cli_explicit_keys={"async_chunk"}, + ) + # qwen3_omni_moe.yaml has `async_chunk: true`, so by default every + # stage's engine_args would carry it. With the explicit override, it + # must NOT show up. + for stage in stages: + assert stage.yaml_engine_args.get("async_chunk") is not True + assert stage.runtime_overrides.get("async_chunk") is None + + def test_default_async_chunk_leaves_yaml_alone(self): + """An unset ``--async-chunk`` (default None) must leave the YAML's True + in force on every stage.""" + stages = self._stages( + cli_overrides={"async_chunk": None}, + cli_explicit_keys=set(), + ) + # qwen3_omni_moe.yaml: `async_chunk: true` → injected on every stage. 
+ for stage in stages: + assert stage.yaml_engine_args.get("async_chunk") is True + + def test_explicit_enable_prefix_caching_overrides_yaml(self): + """``--enable-prefix-caching`` (global) flips every stage's + ``enable_prefix_caching`` to True regardless of the YAML default.""" + stages = self._stages( + cli_overrides={"enable_prefix_caching": True}, + cli_explicit_keys={"enable_prefix_caching"}, + ) + for stage in stages: + assert stage.runtime_overrides.get("enable_prefix_caching") is True + + def test_async_chunk_dispatches_processors(self): + """A single ``qwen3_tts`` pipeline picks per-chunk vs end-to-end + processors based on ``deploy.async_chunk``, without needing a + separate variant pipeline registration.""" + import vllm_omni.model_executor.models.qwen3_tts.pipeline # noqa: F401 + from vllm_omni.config.stage_config import ( + _PIPELINE_REGISTRY, + DeployConfig, + merge_pipeline_deploy, + ) + + pipeline = _PIPELINE_REGISTRY["qwen3_tts"] + + # async_chunk=True → stage 0's per-chunk processor wires up, stage 1 + # has no sync input processor. + async_stages = merge_pipeline_deploy(pipeline, DeployConfig(async_chunk=True)) + assert ( + async_stages[0] + .yaml_engine_args.get("custom_process_next_stage_input_func", "") + .endswith("talker2code2wav_async_chunk") + ) + assert async_stages[1].custom_process_input_func is None + + # async_chunk=False → stage 0 has no streaming processor, stage 1's + # batch-end processor wires up. + sync_stages = merge_pipeline_deploy(pipeline, DeployConfig(async_chunk=False)) + assert "custom_process_next_stage_input_func" not in sync_stages[0].yaml_engine_args + assert sync_stages[1].custom_process_input_func is not None + assert sync_stages[1].custom_process_input_func.endswith("talker2code2wav") + + +class TestSamplingConstraintsPrecedence: + """Test that pipeline sampling_constraints override deploy defaults.""" + + def test_constraints_win(self): + from pathlib import Path + + import vllm_omni.model_executor.models.qwen3_omni.pipeline # noqa: F401 + from vllm_omni.config.stage_config import load_deploy_config, merge_pipeline_deploy + + pipeline = _PIPELINE_REGISTRY["qwen3_omni_moe"] + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Deploy config not found") - def test_architecture_models_mapping_exists(self): - """Test that _ARCHITECTURE_MODELS contains expected entries.""" - assert "MiMoAudioForConditionalGeneration" in StageConfigFactory._ARCHITECTURE_MODELS - assert StageConfigFactory._ARCHITECTURE_MODELS["MiMoAudioForConditionalGeneration"] == "mimo_audio" - assert "HunyuanImage3ForCausalMM" in StageConfigFactory._ARCHITECTURE_MODELS - assert StageConfigFactory._ARCHITECTURE_MODELS["HunyuanImage3ForCausalMM"] == "hunyuan_image3" + deploy = load_deploy_config(deploy_path) + stages = merge_pipeline_deploy(pipeline, deploy) - def test_mimo_audio_in_pipeline_models(self): - """Test that mimo_audio is registered in PIPELINE_MODELS.""" - assert "mimo_audio" in StageConfigFactory.PIPELINE_MODELS + # Pipeline says detokenize=True for thinker, deploy can't override + assert stages[0].yaml_extras["default_sampling_params"]["detokenize"] is True + # Pipeline says stop_token_ids=[2150] for talker + assert stages[1].yaml_extras["default_sampling_params"]["stop_token_ids"] == [2150] + # Deploy temperature still flows through + assert stages[0].yaml_extras["default_sampling_params"]["temperature"] == 0.4 diff --git a/tests/utils.py b/tests/utils.py index 
84edbbf3d1..d8137cf963 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,6 +11,7 @@ import time from collections.abc import Callable from contextlib import ExitStack, contextmanager, suppress +from pathlib import Path from typing import Any, Literal import cloudpickle @@ -24,6 +25,221 @@ _P = ParamSpec("_P") +_REPO_ROOT = Path(__file__).resolve().parent.parent +_DEPLOY_DIR = _REPO_ROOT / "vllm_omni" / "deploy" +_CI_GENERATED_DIR = _REPO_ROOT / "tests" / ".ci_generated" + + +# CI overlays as Python dicts (LSP-friendly). Materialized on demand to +# tests/.ci_generated/.yaml via get_deploy_config_path("ci/.yaml"). +_CI_OVERLAYS: dict[str, dict[str, Any]] = { + "qwen2_5_omni": { + "base_config": "qwen2_5_omni.yaml", + "async_chunk": False, + "stages": [ + { + "stage_id": 0, + "max_model_len": 16384, + "max_num_batched_tokens": 16384, + "max_num_seqs": 1, + "gpu_memory_utilization": 0.9, + "skip_mm_profiling": True, + "load_format": "dummy", + "default_sampling_params": {"max_tokens": 128}, + }, + { + "stage_id": 1, + "max_model_len": 16384, + "max_num_batched_tokens": 16384, + "max_num_seqs": 1, + "gpu_memory_utilization": 0.4, + "skip_mm_profiling": True, + "load_format": "dummy", + "default_sampling_params": {"max_tokens": 4096}, + }, + { + "stage_id": 2, + "max_num_seqs": 1, + "gpu_memory_utilization": 0.5, + "max_num_batched_tokens": 8192, + "max_model_len": 8192, + "load_format": "dummy", + "devices": "2", + "default_sampling_params": {"max_tokens": 8192}, + }, + ], + "platforms": { + "rocm": { + "stages": [ + {"stage_id": 0, "gpu_memory_utilization": 0.9}, + {"stage_id": 1, "gpu_memory_utilization": 0.4}, + {"stage_id": 2, "gpu_memory_utilization": 0.5, "devices": "2"}, + ], + }, + "xpu": { + "stages": [ + { + "stage_id": 0, + "gpu_memory_utilization": 0.9, + "max_num_batched_tokens": 16384, + "max_model_len": 16384, + }, + {"stage_id": 1, "gpu_memory_utilization": 0.5}, + { + "stage_id": 2, + "gpu_memory_utilization": 0.3, + "max_num_batched_tokens": 4096, + "max_model_len": 4096, + "devices": "2", + }, + ], + }, + }, + }, + "qwen3_omni_moe": { + "base_config": "qwen3_omni_moe.yaml", + "async_chunk": False, + "stages": [ + { + "stage_id": 0, + "max_num_seqs": 5, + "max_model_len": 32768, + "mm_processor_cache_gb": 0, + "load_format": "dummy", + "default_sampling_params": {"max_tokens": 150, "ignore_eos": False}, + }, + { + "stage_id": 1, + "gpu_memory_utilization": 0.5, + "max_num_seqs": 5, + "max_model_len": 32768, + "load_format": "dummy", + "default_sampling_params": {"max_tokens": 1000}, + }, + { + "stage_id": 2, + "max_num_seqs": 5, + "max_num_batched_tokens": 100000, + "load_format": "dummy", + "default_sampling_params": {"max_tokens": 2000}, + }, + ], + "platforms": { + "rocm": { + "stages": [ + {"stage_id": 0, "max_num_seqs": 1, "default_sampling_params": {"max_tokens": 100}}, + { + "stage_id": 1, + "max_num_seqs": 1, + "enforce_eager": True, + "default_sampling_params": {"max_tokens": 100}, + }, + { + "stage_id": 2, + "max_num_seqs": 1, + "max_num_batched_tokens": 1000000, + "default_sampling_params": {"max_tokens": 200}, + }, + ], + }, + "xpu": { + "stages": [ + { + "stage_id": 0, + "gpu_memory_utilization": 0.85, + "max_num_seqs": 1, + "tensor_parallel_size": 4, + "enforce_eager": True, + "max_num_batched_tokens": 4096, + "max_model_len": 4096, + "max_cudagraph_capture_size": 0, + "skip_mm_profiling": True, + "devices": "0,1,2,3", + "default_sampling_params": {"max_tokens": 100, "ignore_eos": False}, + }, + { + "stage_id": 1, + "gpu_memory_utilization": 0.6, + 
"max_num_seqs": 1, + "enforce_eager": True, + "max_num_batched_tokens": 4096, + "max_model_len": 4096, + "max_cudagraph_capture_size": 0, + "skip_mm_profiling": True, + "devices": "4", + }, + { + "stage_id": 2, + "gpu_memory_utilization": 0.3, + "max_num_seqs": 1, + "max_num_batched_tokens": 100000, + "max_cudagraph_capture_size": 0, + "skip_mm_profiling": True, + "devices": "5", + "default_sampling_params": {"max_tokens": 2000}, + }, + ], + }, + }, + }, + # Single-stage thinker-only topology for the abort test. + "qwen2_5_omni_thinker_only": { + "async_chunk": False, + "pipeline": "qwen2_5_omni_thinker_only", + "stages": [ + { + "stage_id": 0, + "max_num_seqs": 1, + "gpu_memory_utilization": 0.9, + "enforce_eager": True, + "max_num_batched_tokens": 16384, + "max_model_len": 16384, + "skip_mm_profiling": True, + "mm_processor_cache_gb": 0, + "load_format": "dummy", + "devices": "0", + "default_sampling_params": { + "temperature": 0.0, + "top_p": 1.0, + "top_k": -1, + "max_tokens": 128, + "seed": 42, + "repetition_penalty": 1.1, + }, + }, + ], + }, +} + + +def _materialize_ci_overlay(model_type: str) -> Path: + import yaml + + if model_type not in _CI_OVERLAYS: + raise KeyError(f"No CI overlay registered for {model_type!r}. Available: {sorted(_CI_OVERLAYS)}") + + _CI_GENERATED_DIR.mkdir(parents=True, exist_ok=True) + out = _CI_GENERATED_DIR / f"{model_type}.yaml" + + overlay = {**_CI_OVERLAYS[model_type]} + base = overlay.get("base_config") + if base: + overlay["base_config"] = str(_DEPLOY_DIR / base) + + with open(out, "w", encoding="utf-8") as f: + yaml.safe_dump(overlay, f, sort_keys=False) + return out + + +def get_deploy_config_path(rel_path: str) -> str: + """Resolve a deploy yaml; ``ci/.yaml`` materializes from ``_CI_OVERLAYS``.""" + if rel_path.startswith("ci/") and rel_path.endswith(".yaml"): + model_type = rel_path[len("ci/") : -len(".yaml")] + if model_type in _CI_OVERLAYS: + return str(_materialize_ci_overlay(model_type)) + return str(_DEPLOY_DIR / rel_path) + + if current_platform.is_rocm(): from amdsmi import ( amdsmi_get_gpu_vram_usage, diff --git a/vllm_omni/config/__init__.py b/vllm_omni/config/__init__.py index 2aa236e69f..f02c075880 100644 --- a/vllm_omni/config/__init__.py +++ b/vllm_omni/config/__init__.py @@ -5,10 +5,18 @@ from vllm_omni.config.lora import LoRAConfig from vllm_omni.config.model import OmniModelConfig from vllm_omni.config.stage_config import ( + DeployConfig, ModelPipeline, + PipelineConfig, StageConfig, StageConfigFactory, + StageDeployConfig, + StageExecutionType, + StagePipelineConfig, StageType, + load_deploy_config, + merge_pipeline_deploy, + register_pipeline, ) from vllm_omni.config.yaml_util import ( create_config, @@ -24,6 +32,14 @@ "StageConfigFactory", "ModelPipeline", "StageType", + "StageExecutionType", + "StagePipelineConfig", + "PipelineConfig", + "StageDeployConfig", + "DeployConfig", + "load_deploy_config", + "merge_pipeline_deploy", + "register_pipeline", "create_config", "load_yaml_config", "merge_configs", diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index a4e186c3bd..fc14283630 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -1,18 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Stage Configuration System for vLLM-Omni. - -Pipeline structure (stages, types, data-flow) is defined in per-model YAML -files and is set by model developers at integration time. 
-Runtime parameters (gpu_memory_utilization, tp_size, etc.) come from CLI. -""" +"""Stage configuration system for vLLM-Omni.""" from __future__ import annotations +import dataclasses import re import warnings -from dataclasses import asdict, dataclass, field +from dataclasses import asdict, dataclass, field, fields from enum import Enum from pathlib import Path from typing import Any @@ -20,76 +15,686 @@ from vllm.logger import init_logger from vllm_omni.config.yaml_util import create_config, load_yaml_config, to_dict +from vllm_omni.core.sched.omni_ar_scheduler import OmniARScheduler +from vllm_omni.core.sched.omni_generation_scheduler import OmniGenerationScheduler -# Pipeline YAMLs live alongside model code in model_executor/models// _MODELS_DIR = Path(__file__).resolve().parent.parent / "model_executor" / "models" def get_pipeline_path(model_dir: str, filename: str) -> Path: - """Return the full path to a pipeline YAML file. + return _MODELS_DIR / model_dir / filename + + +logger = init_logger(__name__) + - Args: - model_dir: Model subdirectory name (e.g., "qwen3_omni"). - filename: Name of the YAML file (e.g., "pipeline.yaml"). +_STAGE_OVERRIDE_PATTERN = re.compile(r"^stage_(\d+)_(.+)$") - Returns: - Absolute path to the file. + +def build_stage_runtime_overrides( + stage_id: int, + cli_overrides: dict[str, Any], + *, + internal_keys: set[str] | frozenset[str] | None = None, +) -> dict[str, Any]: + """Build per-stage runtime overrides from global and ``stage__*`` kwargs. + + ``internal_keys`` defaults to the set derived from ``OrchestratorArgs`` + (via ``arg_utils.internal_blacklist_keys``) so that orchestrator + fields are never forwarded as per-stage engine args. Callers can pass an + explicit set for tests or specialized flows. """ - return _MODELS_DIR / model_dir / filename + if internal_keys is None: + from vllm_omni.engine.arg_utils import internal_blacklist_keys + internal_keys = internal_blacklist_keys() -logger = init_logger(__name__) + result: dict[str, Any] = {} + + for key, value in cli_overrides.items(): + if value is None or key in internal_keys: + continue + + match = _STAGE_OVERRIDE_PATTERN.match(key) + if match is not None: + override_stage_id = int(match.group(1)) + param_name = match.group(2) + if override_stage_id == stage_id and param_name not in internal_keys: + result[param_name] = value + continue + + result[key] = value + + return result + + +def strip_parent_engine_args( + kwargs: dict[str, Any], + *, + parent_fields: dict[str, dataclasses.Field], + keep_keys: set[str] | frozenset[str] = frozenset(), + strip_keys: set[str] | frozenset[str] = frozenset(), + no_warn_keys: set[str] | frozenset[str] = frozenset(), +) -> tuple[dict[str, Any], list[str]]: + """Strip parent ``EngineArgs`` fields before merging into stage YAML.""" + overridden: list[str] = [] + result: dict[str, Any] = {} + + for key, value in kwargs.items(): + if key in strip_keys: + continue + + if key not in parent_fields or key in keep_keys: + result[key] = value + continue + + field_def = parent_fields[key] + if field_def.default is not dataclasses.MISSING: + default = field_def.default + elif field_def.default_factory is not dataclasses.MISSING: + default = field_def.default_factory() + else: + default = dataclasses.MISSING + + if default is dataclasses.MISSING or value is None: + continue + + if dataclasses.is_dataclass(default) and not isinstance(default, type): + default = asdict(default) + + if value != default and key not in no_warn_keys: + overridden.append(key) + + return result, 
sorted(overridden) class StageType(str, Enum): """Type of processing stage in the Omni pipeline.""" + # TODO(@lishunyang12): remove once all models migrate to StageExecutionType LLM = "llm" DIFFUSION = "diffusion" +class StageExecutionType(str, Enum): + """Merged StageType + WorkerType — 3 combinations today.""" + + LLM_AR = "llm_ar" + LLM_GENERATION = "llm_generation" + DIFFUSION = "diffusion" + + +# Mapping class refs (not dotted-path strings) so module/class renames fail +# at import time instead of lazily at scheduler resolution. YAML overrides +# and downstream serialization still use the dotted-path string form; the +# conversion happens at the map lookup site via _scheduler_path(). +_EXECUTION_TYPE_TO_SCHEDULER: dict[StageExecutionType, type | None] = { + StageExecutionType.LLM_AR: OmniARScheduler, + StageExecutionType.LLM_GENERATION: OmniGenerationScheduler, + StageExecutionType.DIFFUSION: None, +} + + +def _scheduler_path(cls: type | None) -> str | None: + """Return the dotted import path for a scheduler class (``None`` passes through).""" + if cls is None: + return None + return f"{cls.__module__}.{cls.__qualname__}" + + +@dataclass(frozen=True) +class StagePipelineConfig: + """Fixed topology for one stage (frozen, not user-configurable).""" + + stage_id: int + model_stage: str + execution_type: StageExecutionType = StageExecutionType.LLM_AR + input_sources: tuple[int, ...] = () + final_output: bool = False + final_output_type: str | None = None + owns_tokenizer: bool = False + requires_multimodal_data: bool = False + hf_config_name: str | None = None + engine_output_type: str | None = None + model_arch: str | None = None + sampling_constraints: dict[str, Any] = field(default_factory=dict) + custom_process_input_func: str | None = None + custom_process_next_stage_input_func: str | None = None + # Alternates picked by ``merge_pipeline_deploy`` based on ``deploy.async_chunk``. + async_chunk_process_next_stage_input_func: str | None = None + sync_process_input_func: str | None = None + prompt_expand_func: str | None = None + cfg_kv_collect_func: str | None = None + omni_kv_config: dict[str, Any] | None = None + extras: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class PipelineConfig: + """Complete pipeline topology for a model (frozen).""" + + model_type: str + model_arch: str = "" + stages: tuple[StagePipelineConfig, ...] = () + # HF architecture aliases: used by StageConfigFactory when the model's + # HF config reports a generic model_type that collides with a different + # model (e.g. MiMo Audio reports model_type="qwen2"). The factory + # matches ``hf_config.architectures[*]`` against this tuple to route + # to the correct pipeline. Leave empty for models with unique model_type. + hf_architectures: tuple[str, ...] = () + + def get_stage(self, stage_id: int) -> StagePipelineConfig | None: + """Look up a stage by its ID.""" + for stage in self.stages: + if stage.stage_id == stage_id: + return stage + return None + + def get_scheduler_cls(self, stage_id: int) -> str | None: + """Return the inferred scheduler class path for a stage. + + Returns ``None`` for DIFFUSION stages (no vLLM scheduler). Raises + ``ValueError`` if ``stage_id`` doesn't exist in this pipeline, and + ``KeyError`` if ``execution_type`` isn't in the scheduler map. 
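+
+        For example, an ``LLM_AR`` stage resolves to
+        ``"vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler"``, the
+        dotted path of the class mapped in ``_EXECUTION_TYPE_TO_SCHEDULER``.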
+        """
+        stage = self.get_stage(stage_id)
+        if stage is None:
+            raise ValueError(f"Pipeline {self.model_type!r} has no stage with id {stage_id}")
+        return _scheduler_path(_EXECUTION_TYPE_TO_SCHEDULER[stage.execution_type])
+
+    def validate(self) -> list[str]:
+        """Return list of topology errors (empty if valid)."""
+        errors: list[str] = []
+        if not self.stages:
+            errors.append("Pipeline has no stages defined")
+            return errors
+        stage_ids = [s.stage_id for s in self.stages]
+        if len(stage_ids) != len(set(stage_ids)):
+            errors.append("Duplicate stage IDs found")
+        stage_id_set = set(stage_ids)
+        for stage in self.stages:
+            for src in stage.input_sources:
+                if src not in stage_id_set:
+                    errors.append(f"Stage {stage.stage_id} references non-existent input source {src}")
+                if src == stage.stage_id:
+                    errors.append(f"Stage {stage.stage_id} references itself")
+        if not any(not s.input_sources for s in self.stages):
+            errors.append("No entry point (stage with empty input_sources)")
+        return errors
+
+
+_PIPELINE_REGISTRY: dict[str, PipelineConfig] = {}
+
+
+def register_pipeline(pipeline: PipelineConfig) -> None:
+    """Register a pipeline config (called at import time by pipeline.py modules)."""
+    errors = pipeline.validate()
+    if errors:
+        logger.warning("Pipeline %s has issues: %s", pipeline.model_type, errors)
+    _PIPELINE_REGISTRY[pipeline.model_type] = pipeline
+
+
+_DEPLOY_DIR = Path(__file__).resolve().parent.parent / "deploy"
+
+
+@dataclass
+class StageDeployConfig:
+    """Per-stage deployment knobs.
+
+    Only fields whose value legitimately varies across stages of the same
+    pipeline live here (e.g. ``max_num_seqs`` on thinker vs talker,
+    ``devices`` for GPU placement). Pipeline-wide settings
+    (``trust_remote_code``, ``distributed_executor_backend``, ``dtype``,
+    ``quantization``, prefix/chunked prefill, DP/PP sizes) are declared at
+    the top level of ``DeployConfig`` and propagated to every stage.
+    """
+
+    stage_id: int
+    max_num_seqs: int = 64
+    gpu_memory_utilization: float = 0.9
+    tensor_parallel_size: int = 1
+    enforce_eager: bool = False
+    max_num_batched_tokens: int = 32768
+    max_model_len: int | None = None
+    async_scheduling: bool | None = None
+    devices: str = "0"
+    output_connectors: dict[str, str] | None = None
+    input_connectors: dict[str, str] | None = None
+    default_sampling_params: dict[str, Any] | None = None
+    engine_extras: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class DeployConfig:
+    """Loaded from deploy/<model_type>.yaml — the only config file users edit.
+
+    Top-level fields (``trust_remote_code``, ``distributed_executor_backend``,
+    ``dtype``, ``quantization``, ``enable_prefix_caching``,
+    ``enable_chunked_prefill``, ``data_parallel_size``,
+    ``pipeline_parallel_size``) are pipeline-wide: they apply uniformly to
+    every stage. Fields that legitimately vary per stage live in the
+    individual ``StageDeployConfig`` entries under ``stages:``.
+    """
+
+    async_chunk: bool = True
+    connectors: dict[str, Any] | None = None
+    edges: list[dict[str, Any]] | None = None
+    stages: list[StageDeployConfig] = field(default_factory=list)
+    platforms: dict[str, Any] | None = None
+    # Overrides the auto-detected pipeline registry key for structural variants.
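+    # e.g. the CI overlay for the abort test sets `pipeline:
+    # qwen2_5_omni_thinker_only` to run a single-stage thinker-only topology.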
+ pipeline: str | None = None + + # === Pipeline-wide engine settings (applied uniformly to every stage) === + trust_remote_code: bool = True + distributed_executor_backend: str = "mp" + dtype: str | None = None + quantization: str | None = None + enable_prefix_caching: bool = False + enable_chunked_prefill: bool | None = None + data_parallel_size: int = 1 + pipeline_parallel_size: int = 1 + + +_STAGE_NON_ENGINE_KEYS = frozenset( + { + "stage_id", + "devices", + "output_connectors", + "input_connectors", + "default_sampling_params", + "engine_extras", + } +) + +# Fields on StageDeployConfig that are populated from engine_args dict +_STAGE_DEPLOY_FIELDS = {f.name: f for f in fields(StageDeployConfig) if f.name not in _STAGE_NON_ENGINE_KEYS} + + +def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig: + """Parse a single stage entry from deploy YAML into StageDeployConfig.""" + if "engine_args" in stage_data: + engine_args = dict(stage_data["engine_args"]) + devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices", "0")) + else: + engine_args = {k: v for k, v in stage_data.items() if k not in _STAGE_NON_ENGINE_KEYS and k != "stage_id"} + devices = stage_data.get("devices", "0") + + kwargs: dict[str, Any] = {"stage_id": stage_data["stage_id"], "devices": devices} + for name, f in _STAGE_DEPLOY_FIELDS.items(): + if name in engine_args: + kwargs[name] = engine_args.pop(name) + + kwargs["output_connectors"] = stage_data.get("output_connectors") + kwargs["input_connectors"] = stage_data.get("input_connectors") + kwargs["default_sampling_params"] = stage_data.get("default_sampling_params") + kwargs["engine_extras"] = engine_args + return StageDeployConfig(**kwargs) + + +_DEEP_MERGE_KEYS = frozenset({"default_sampling_params", "engine_extras", "engine_args"}) + + +def _deep_merge_stage(base: dict, overlay: dict) -> dict: + """Deep-merge ``_DEEP_MERGE_KEYS`` so thin overlays don't drop base keys.""" + merged = dict(base) + for k, v in overlay.items(): + if k in _DEEP_MERGE_KEYS: + base_val = merged.get(k) + if isinstance(v, dict) and isinstance(base_val, dict): + merged[k] = {**base_val, **v} + continue + # Deep-merge key but at least one side isn't a dict: surface the + # silent clobber so mismatched YAML types don't get past review. 
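+            # Hypothetical example: a base dict under default_sampling_params
+            # against an overlay that sets the same key to null lands here,
+            # with the overlay's null replacing the whole dict.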
+ if base_val is not None: + logger.warning( + "Deep-merge key %r has non-dict value (base=%s, overlay=%s); " + "overlay will fully replace base instead of merging.", + k, + type(base_val).__name__, + type(v).__name__, + ) + merged[k] = v + return merged + + +def _merge_stage_lists( + base_stages: list[dict[str, Any]] | None, + overlay_stages: list[dict[str, Any]] | None, +) -> list[dict[str, Any]]: + """Merge two ``stages:`` lists by ``stage_id`` (overlay wins per field).""" + by_id: dict[int, dict[str, Any]] = {s["stage_id"]: s for s in (base_stages or [])} + for overlay_stage in overlay_stages or []: + sid = overlay_stage["stage_id"] + if sid in by_id: + by_id[sid] = _deep_merge_stage(by_id[sid], overlay_stage) + else: + by_id[sid] = overlay_stage + return list(by_id.values()) + + +def _merge_platforms( + base: dict[str, Any] | None, + overlay: dict[str, Any] | None, +) -> dict[str, Any] | None: + """Deep-merge two ``platforms:`` blocks per-platform, per-stage_id.""" + if not base and not overlay: + return None + base = base or {} + overlay = overlay or {} + merged: dict[str, Any] = {} + for plat in set(base) | set(overlay): + bp = base.get(plat) or {} + op = overlay.get(plat) or {} + merged_plat = {**bp, **{k: v for k, v in op.items() if k != "stages"}} + merged_plat["stages"] = _merge_stage_lists(bp.get("stages"), op.get("stages")) + merged[plat] = merged_plat + return merged + + +def resolve_deploy_yaml(path: str | Path) -> dict[str, Any]: + """Load a deploy YAML with optional ``base_config`` inheritance.""" + raw_dict = to_dict(load_yaml_config(path)) + + base_path = raw_dict.pop("base_config", None) + if base_path is None: + return raw_dict + + # Resolve relative to the overlay file's directory + base_path = Path(path).parent / base_path + base_dict = resolve_deploy_yaml(base_path) + + # Merge top-level scalars: overlay wins. ``stages:`` and ``platforms:`` + # are deep-merged below so an overlay can layer on top of the base. + merged = { + **base_dict, + **{k: v for k, v in raw_dict.items() if k not in ("stages", "platforms")}, + } + merged["stages"] = _merge_stage_lists(base_dict.get("stages"), raw_dict.get("stages")) + merged_platforms = _merge_platforms(base_dict.get("platforms"), raw_dict.get("platforms")) + if merged_platforms is not None: + merged["platforms"] = merged_platforms + + return merged + + +def load_deploy_config(path: str | Path) -> DeployConfig: + """Load a deploy YAML (with optional base_config inheritance).""" + raw_dict = resolve_deploy_yaml(path) + + stages = [_parse_stage_deploy(s) for s in raw_dict.get("stages", [])] + + kwargs: dict[str, Any] = { + "async_chunk": raw_dict.get("async_chunk", True), + "connectors": raw_dict.get("connectors", None), + "edges": raw_dict.get("edges", None), + "stages": stages, + "platforms": raw_dict.get("platforms", None), + "pipeline": raw_dict.get("pipeline", None), + } + # Pipeline-wide engine settings: only set if explicitly present in YAML + # so the DeployConfig dataclass defaults take effect otherwise. 
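+    # e.g. an overlay that never mentions `dtype:` keeps the dataclass
+    # default (None) rather than recording an explicit null.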
+ for name in ( + "trust_remote_code", + "distributed_executor_backend", + "dtype", + "quantization", + "enable_prefix_caching", + "enable_chunked_prefill", + "data_parallel_size", + "pipeline_parallel_size", + ): + if name in raw_dict: + kwargs[name] = raw_dict[name] + return DeployConfig(**kwargs) + + +def _detect_platform() -> str | None: + """Return "npu", "rocm", "xpu", or None (CUDA default).""" + try: + from vllm.platforms import current_platform + + name = current_platform.device_name.lower() + if "npu" in name: + return "npu" + if "rocm" in name or "amd" in name: + return "rocm" + if "xpu" in name: + return "xpu" + except Exception as e: + logger.debug("Platform auto-detect failed, falling back to CUDA: %s", e) + return None + + +def _extract_platform_overrides(ps: dict[str, Any]) -> tuple[dict[str, Any], str | None]: + """Return ``(overrides, devices)`` from a platform stage entry. + + Handles both the nested layout (``engine_args:`` / ``runtime.devices``) and + the flat layout. ``devices`` is ``None`` when no override is set. + """ + if "engine_args" in ps: + return dict(ps["engine_args"]), ps.get("runtime", {}).get("devices") + overrides = {k: v for k, v in ps.items() if k not in ("stage_id", "devices")} + return overrides, ps.get("devices") + + +def _apply_platform_overrides( + deploy: DeployConfig, + platform: str | None = None, +) -> DeployConfig: + """Merge platform-specific stage overrides into deploy config.""" + if platform is None: + platform = _detect_platform() + if platform is None or deploy.platforms is None: + return deploy + platform_section = deploy.platforms.get(platform) + if platform_section is None: + return deploy + + platform_stages = platform_section.get("stages", []) + base_by_id = {s.stage_id: s for s in deploy.stages} + + for ps in platform_stages: + base = base_by_id.get(ps["stage_id"]) + if base is None: + continue + overrides, devices = _extract_platform_overrides(ps) + if devices is not None: + base.devices = devices + for key, val in overrides.items(): + if hasattr(base, key): + setattr(base, key, val) + else: + base.engine_extras[key] = val + + return deploy + + +_EXECUTION_TYPE_TO_STAGE_WORKER: dict[StageExecutionType, tuple[StageType, str | None]] = { + StageExecutionType.LLM_AR: (StageType.LLM, "ar"), + StageExecutionType.LLM_GENERATION: (StageType.LLM, "generation"), + StageExecutionType.DIFFUSION: (StageType.DIFFUSION, None), +} + + +def _resolve_execution_mode( + execution_type: StageExecutionType, +) -> tuple[StageType, str | None]: + """Map ``execution_type`` → ``(stage_type, worker_type)`` legacy tuple.""" + return _EXECUTION_TYPE_TO_STAGE_WORKER.get(execution_type, (StageType.LLM, None)) + + +def _select_processor_funcs( + ps: StagePipelineConfig, + async_chunk: bool, +) -> tuple[str | None, str | None]: + """Pick ``(input_proc, next_stage_proc)`` based on the async_chunk mode.""" + next_stage_proc = ps.custom_process_next_stage_input_func + input_proc = ps.custom_process_input_func + if async_chunk and ps.async_chunk_process_next_stage_input_func: + next_stage_proc = ps.async_chunk_process_next_stage_input_func + elif not async_chunk and ps.sync_process_input_func: + input_proc = ps.sync_process_input_func + return input_proc, next_stage_proc + + +# Pipeline-wide DeployConfig fields that are propagated to every stage's +# engine args during merge. These live at top level of the deploy YAML. +_PIPELINE_WIDE_ENGINE_FIELDS: tuple[str, ...] 
= ( + "trust_remote_code", + "distributed_executor_backend", + "dtype", + "quantization", + "enable_prefix_caching", + "enable_chunked_prefill", + "data_parallel_size", + "pipeline_parallel_size", +) + + +def _build_engine_args( + ps: StagePipelineConfig, + ds: StageDeployConfig | None, + pipeline: PipelineConfig, + deploy: DeployConfig, + next_stage_proc: str | None, +) -> dict[str, Any]: + """Assemble the flat ``yaml_engine_args`` dict for one stage. + + Pipeline-wide DeployConfig fields are applied uniformly to every stage; + per-stage StageDeployConfig overrides take precedence when present (e.g. + ``engine_extras`` can still carry a stage-specific ``dtype``). + """ + engine_args: dict[str, Any] = {"model_arch": ps.model_arch or pipeline.model_arch} + if ps.engine_output_type: + engine_args["engine_output_type"] = ps.engine_output_type + if next_stage_proc: + engine_args["custom_process_next_stage_input_func"] = next_stage_proc + + # Pipeline-wide top-level DeployConfig settings, applied to every stage. + for name in _PIPELINE_WIDE_ENGINE_FIELDS: + value = getattr(deploy, name) + if value is not None: + engine_args[name] = value + + # Per-stage StageDeployConfig values override pipeline-wide settings. + if ds is not None: + for k, v in asdict(ds).items(): + if k in _STAGE_NON_ENGINE_KEYS or v is None: + continue + engine_args[k] = v + engine_args.update(ds.engine_extras) + if deploy.async_chunk: + engine_args["async_chunk"] = True + return engine_args + + +def _build_extras( + ps: StagePipelineConfig, + ds: StageDeployConfig | None, +) -> dict[str, Any]: + """Assemble ``yaml_extras`` (sampling + connectors + pipeline extras).""" + extras: dict[str, Any] = {} + sampling: dict[str, Any] = {} + if ds is not None and ds.default_sampling_params: + sampling.update(ds.default_sampling_params) + sampling.update(ps.sampling_constraints) + if sampling: + extras["default_sampling_params"] = sampling + if ds is not None and ds.output_connectors: + extras["output_connectors"] = dict(ds.output_connectors) + if ds is not None and ds.input_connectors: + extras["input_connectors"] = dict(ds.input_connectors) + if ps.extras: + extras.update(ps.extras) + return extras + + +def merge_pipeline_deploy( + pipeline: PipelineConfig, + deploy: DeployConfig, + cli_overrides: dict[str, Any] | None = None, +) -> list[StageConfig]: + """Merge pipeline + deploy + platform overrides → list[StageConfig].""" + if cli_overrides is None: + cli_overrides = {} + + deploy = _apply_platform_overrides(deploy) + deploy_by_id = {s.stage_id: s for s in deploy.stages} + + # A pipeline supports async_chunk if any stage has either an explicit + # async-chunk-only processor slot OR a custom next-stage processor (some + # pipelines like qwen3_omni wire async-chunk processing directly through + # ``custom_process_next_stage_input_func``). Only raise when neither is + # present — that's the "user enabled async_chunk but pipeline has no + # inter-stage processing at all" case. + if deploy.async_chunk and not any( + ps.async_chunk_process_next_stage_input_func or ps.custom_process_next_stage_input_func + for ps in pipeline.stages + ): + raise ValueError( + f"Pipeline {pipeline.model_type!r} has async_chunk=True in deploy but no stage " + "declares a next-stage input processor " + "(``async_chunk_process_next_stage_input_func`` or ``custom_process_next_stage_input_func``). " + "Either set async_chunk=False or implement an async-chunk processor on the pipeline." 
+        )
+
+    result: list[StageConfig] = []
+    for ps in pipeline.stages:
+        ds = deploy_by_id.get(ps.stage_id)
+        stage_type, worker_type = _resolve_execution_mode(ps.execution_type)
+        input_proc, next_stage_proc = _select_processor_funcs(ps, deploy.async_chunk)
+        engine_args = _build_engine_args(ps, ds, pipeline, deploy, next_stage_proc)
+        extras = _build_extras(ps, ds)
+        runtime: dict[str, Any] = {"process": True}
+        if ds is not None:
+            runtime["devices"] = ds.devices
+
+        result.append(
+            StageConfig(
+                stage_id=ps.stage_id,
+                model_stage=ps.model_stage,
+                stage_type=stage_type,
+                input_sources=list(ps.input_sources),
+                custom_process_input_func=input_proc,
+                final_output=ps.final_output,
+                final_output_type=ps.final_output_type,
+                worker_type=worker_type,
+                scheduler_cls=_scheduler_path(_EXECUTION_TYPE_TO_SCHEDULER.get(ps.execution_type)),
+                hf_config_name=ps.hf_config_name,
+                is_comprehension=ps.owns_tokenizer,
+                yaml_engine_args=engine_args,
+                yaml_runtime=runtime,
+                yaml_extras=extras,
+            )
+        )
+    return result
+
+
 @dataclass
 class StageConfig:
-    """Per-stage configuration from pipeline YAML.
+    """Per-stage config (legacy path). Used by both new and legacy loaders.
 
-    Topology fields (stage_id, input_sources, etc.) define the DAG.
-    Engine and runtime defaults come from the YAML; CLI overrides take
-    precedence via ``runtime_overrides``.
+    TODO(@lishunyang12): replace with ResolvedStageConfig once all models are migrated.
     """
 
-    # Identity
     stage_id: int
     model_stage: str
-
-    # Stage type
     stage_type: StageType = StageType.LLM
-
     input_sources: list[int] = field(default_factory=list)
     custom_process_input_func: str | None = None
     final_output: bool = False
-    final_output_type: str | None = None  # "text", "audio", "image"
-    worker_type: str | None = None  # "ar" or "generation"
+    final_output_type: str | None = None
+    worker_type: str | None = None
     scheduler_cls: str | None = None
     hf_config_name: str | None = None
     is_comprehension: bool = False
-
-    # Per-stage engine args from pipeline YAML (defaults)
     yaml_engine_args: dict[str, Any] = field(default_factory=dict)
-    # Per-stage runtime config from pipeline YAML (devices, etc.)
     yaml_runtime: dict[str, Any] = field(default_factory=dict)
-    # Pass-through fields from pipeline YAML (default_sampling_params,
-    # output_connectors, input_connectors, tts_args, etc.)
     yaml_extras: dict[str, Any] = field(default_factory=dict)
-
-    # Runtime overrides (populated from CLI, not from pipeline YAML)
     runtime_overrides: dict[str, Any] = field(default_factory=dict)
 
     def to_omegaconf(self) -> Any:
-        """Convert to OmegaConf for backward compatibility with OmniStage.
-
-        Returns:
-            OmegaConf DictConfig with stage configuration in legacy format.
-        """
+        """TODO(@lishunyang12): remove once engine consumes ResolvedStageConfig directly."""
         # Start with YAML engine_args defaults
         engine_args: dict[str, Any] = dict(self.yaml_engine_args)
 
@@ -152,9 +757,9 @@ def to_omegaconf(self) -> Any:
 
 @dataclass
 class ModelPipeline:
-    """Complete pipeline definition for a multi-stage model.
+    """Complete pipeline definition for a multi-stage model (legacy).
 
-    Defined by model developers, bundled with the model, not user-editable.
+    TODO(@lishunyang12): remove once all models migrate to PipelineConfig.
     """
 
     model_type: str
@@ -221,53 +826,86 @@ def validate_pipeline(self) -> list[str]:
         return errors
+
+
+def _discover_all_pipelines() -> None:
+    """Import every ``models/*/pipeline.py`` once to populate the registry.
+
+    Each pipeline.py is expected to call ``register_pipeline(PipelineConfig(...))``
+    at import time. This function walks the models directory and imports any
+    pipeline.py it finds — contributors only need to drop a new pipeline.py
+    in their model's directory for the factory to pick it up.
+
+    Idempotent: Python's module cache ensures subsequent calls are no-ops.
+    """
+    if not _MODELS_DIR.exists():
+        return
+    for subdir in sorted(_MODELS_DIR.iterdir()):
+        if not subdir.is_dir():
+            continue
+        if not (subdir / "pipeline.py").exists():
+            continue
+        module_path = f"vllm_omni.model_executor.models.{subdir.name}.pipeline"
+        try:
+            __import__(module_path)
+        except Exception as exc:
+            logger.debug("Skipping pipeline module %s: %s", module_path, exc)
+
+
 class StageConfigFactory:
     """Factory that loads pipeline YAML and merges CLI overrides.
 
     Handles both single-stage and multi-stage models.
-    """
-
-    # Mapping of model types to directories under model_executor/models/.
-    PIPELINE_MODELS: dict[str, str] = {
-        "qwen3_omni_moe": "qwen3_omni",
-        "qwen2_5_omni": "qwen2_5_omni",
-        "bagel": "bagel",
-        "qwen3_tts": "qwen3_tts",
-        "voxtral_tts": "voxtral_tts",
-        "mimo_audio": "mimo_audio",
-        "glm-image": "glm_image",
-        "cosyvoice3": "cosyvoice3",
-        "mammothmoda2": "mammoth_moda2",
-    }
-    # Fallback: map HF architecture class names to pipeline dirs.
-    # Used when model_type collides with another model (e.g. MiMo Audio
-    # reports model_type="qwen2" which matches plain Qwen2, not our pipeline).
-    _ARCHITECTURE_MODELS: dict[str, str] = {
-        "MiMoAudioForConditionalGeneration": "mimo_audio",
-        "HunyuanImage3ForCausalMM": "hunyuan_image3",
-    }
+
+    Pipelines are auto-discovered from ``models/*/pipeline.py`` modules;
+    no hardcoded model-type → directory mapping is maintained here. Models
+    with generic HF ``model_type`` collisions (e.g. MiMo Audio reports
+    ``qwen2``) should declare ``hf_architectures=(...)`` on their
+    ``PipelineConfig`` so the factory can disambiguate via
+    ``hf_config.architectures``.
+    """
 
     @classmethod
     def create_from_model(
         cls,
         model: str,
         cli_overrides: dict[str, Any] | None = None,
+        deploy_config_path: str | None = None,
+        cli_explicit_keys: set[str] | None = None,
    ) -> list[StageConfig] | None:
-        """Load pipeline YAML, merge with CLI overrides.
+        """Load pipeline + deploy config, merge with CLI overrides.
 
-        Args:
-            model: Model name or path.
-            cli_overrides: CLI overrides from VllmConfig/OmniDiffusionConfig.
+        Checks _PIPELINE_REGISTRY first (new path), falls back to legacy YAML.
 
-        Returns:
-            List of StageConfig objects with CLI overrides applied,
-            or None if no pipeline definition was found for this model.
+        ``cli_explicit_keys`` is the set of CLI keys the user actually typed
+        (captured at the parser layer in ``vllm serve``). When ``None`` —
+        which is the case for programmatic ``Omni()`` callers — every kwarg
+        in ``cli_overrides`` is treated as explicit.
         """
         if cli_overrides is None:
            cli_overrides = {}
 
         trust_remote_code = cli_overrides.get("trust_remote_code", True)
+
+        # Ensure every pipeline.py has been imported so the registry is populated.
+ _discover_all_pipelines() + + # --- New path: check pipeline registry by model_type first --- + model_type, hf_config = cls._auto_detect_model_type(model, trust_remote_code=trust_remote_code) + if model_type and model_type in _PIPELINE_REGISTRY: + return cls._create_from_registry(model_type, cli_overrides, deploy_config_path, cli_explicit_keys) + + # --- HF architecture fallback: some models report a generic + # model_type that collides with another model. Match by the + # hf_architectures declared on each registered PipelineConfig. + if hf_config is not None: + hf_archs = set(getattr(hf_config, "architectures", []) or []) + if hf_archs: + for registered in _PIPELINE_REGISTRY.values(): + if hf_archs.intersection(registered.hf_architectures): + return cls._create_from_registry( + registered.model_type, cli_overrides, deploy_config_path, cli_explicit_keys + ) + + # --- Legacy path: load from pipeline YAML --- pipeline = cls._load_pipeline(model, trust_remote_code=trust_remote_code) if pipeline is None: @@ -295,6 +933,78 @@ def create_from_model( return result + @classmethod + def _create_from_registry( + cls, + model_type: str, + cli_overrides: dict[str, Any], + deploy_config_path: str | None = None, + cli_explicit_keys: set[str] | None = None, + ) -> list[StageConfig]: + """Create StageConfigs from pipeline registry + deploy YAML. + + Precedence (high → low): + explicit CLI args > deploy YAML > parser default CLI values + + ``cli_explicit_keys`` carries the set of long-option attribute names + the user actually typed (captured in ``OmniServeCommand.cmd``). Any + kwarg whose key is not in that set is treated as a parser default + and is only used to fill fields YAML doesn't already cover. When the + set is ``None`` (programmatic ``Omni()`` callers, which have no + argparse layer), every kwarg is treated as explicit. + """ + # Resolve deploy config path + if deploy_config_path is None: + deploy_path = _DEPLOY_DIR / f"{model_type}.yaml" + else: + deploy_path = Path(deploy_config_path) + + if not deploy_path.exists(): + logger.warning( + "Deploy config not found: %s — using pipeline defaults only", + deploy_path, + ) + deploy_cfg = DeployConfig() + else: + deploy_cfg = load_deploy_config(deploy_path) + + cli_async_chunk = cli_overrides.get("async_chunk") + if cli_async_chunk is not None and (cli_explicit_keys is None or "async_chunk" in cli_explicit_keys): + deploy_cfg.async_chunk = bool(cli_async_chunk) + + pipeline_key = deploy_cfg.pipeline or model_type + if pipeline_key not in _PIPELINE_REGISTRY: + raise KeyError( + f"Pipeline {pipeline_key!r} not in registry " + f"(resolved from {deploy_path.name!r}). Available: " + f"{sorted(_PIPELINE_REGISTRY.keys())}" + ) + pipeline_cfg = _PIPELINE_REGISTRY[pipeline_key] + + stages = merge_pipeline_deploy(pipeline_cfg, deploy_cfg, cli_overrides) + + # Precedence: explicit CLI > yaml > parser-default CLI. + # Per-stage (``stage_N_*``) keys are always treated as explicit. 
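+        # Worked example (mirrors TestCLIExplicitPrecedence): the deploy YAML
+        # sets max_num_seqs=1 on stage 2; a typed `--max-num-seqs 999` is
+        # explicit and wins, while the untyped parser default of 256 is only
+        # a fallback and loses to the YAML value.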
+        explicit_overrides: dict[str, Any] = {}
+        default_overrides: dict[str, Any] = {}
+        for key, value in cli_overrides.items():
+            if value is None:
+                continue
+            is_per_stage = bool(re.match(r"stage_\d+_", key))
+            is_explicit = cli_explicit_keys is None or key in cli_explicit_keys or is_per_stage
+            if is_explicit:
+                explicit_overrides[key] = value
+            else:
+                default_overrides[key] = value
+
+        for stage in stages:
+            yaml_keys = set(stage.yaml_engine_args)
+            fallback = {k: v for k, v in default_overrides.items() if k not in yaml_keys}
+            merged = {**fallback, **explicit_overrides}
+            stage.runtime_overrides = cls._merge_cli_overrides(stage, merged)
+
+        return stages
+
     @classmethod
     def create_default_diffusion(cls, kwargs: dict[str, Any]) -> list[dict[str, Any]]:
         """Single-stage diffusion - no YAML needed.
@@ -322,9 +1032,16 @@ def create_default_diffusion(cls, kwargs: dict[str, Any]) -> list[dict[str, Any]
                 continue
             engine_args[key] = value
 
-        # Serialize parallel_config as dict for OmegaConf compatibility
+        # Serialize parallel_config as dict for OmegaConf. Test helpers
+        # sometimes pass SimpleNamespace rather than a dataclass instance.
         if "parallel_config" in kwargs:
-            engine_args["parallel_config"] = asdict(kwargs["parallel_config"])
+            parallel_config = kwargs["parallel_config"]
+            if dataclasses.is_dataclass(parallel_config) and not isinstance(parallel_config, type):
+                engine_args["parallel_config"] = asdict(parallel_config)
+            elif hasattr(parallel_config, "__dict__"):
+                engine_args["parallel_config"] = dict(vars(parallel_config))
+            else:
+                engine_args["parallel_config"] = parallel_config
 
         engine_args.setdefault("cache_backend", "none")
         engine_args["model_stage"] = "diffusion"
@@ -351,40 +1068,49 @@ def create_default_diffusion(cls, kwargs: dict[str, Any]) -> list[dict[str, Any]
 
     @classmethod
     def _load_pipeline(cls, model: str, trust_remote_code: bool = True) -> ModelPipeline | None:
-        """Load pipeline YAML for the model.
+        """Load a legacy ``pipeline.yaml`` for the model.
 
-        Args:
-            model: Model name or path.
-            trust_remote_code: Whether to trust remote code for HF config loading.
+        Searches ``model_executor/models/<dir>/pipeline.yaml`` by trying
+        (a) the raw ``model_type`` as the directory name, then
+        (b) ``model_type`` with hyphens replaced by underscores,
+        and finally (c) scanning every ``pipeline.yaml`` for one that
+        declares a matching ``model_type`` or ``hf_architectures``.
 
-        Returns:
-            ModelPipeline if found, None otherwise.
+        Returns None if no pipeline.yaml is found — caller handles the
+        ``resolve_model_config_path`` fallback via stage_configs/ YAMLs.
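+
+        For example, ``model_type="glm-image"`` tries directory ``glm-image``
+        then ``glm_image`` before falling back to the scan.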
""" model_type, hf_config = cls._auto_detect_model_type(model, trust_remote_code=trust_remote_code) if model_type is None: return None - pipeline_dir = cls.PIPELINE_MODELS.get(model_type) - - # Fallback: check HF architectures when model_type doesn't match - if pipeline_dir is None and hf_config is not None: - for arch in getattr(hf_config, "architectures", []) or []: - pipeline_dir = cls._ARCHITECTURE_MODELS.get(arch) - if pipeline_dir is not None: - model_type = pipeline_dir - break - - if pipeline_dir is None: - logger.debug(f"No pipeline mapping for model_type: {model_type}") - return None - - pipeline_path = get_pipeline_path(pipeline_dir, "pipeline.yaml") - - if not pipeline_path.exists(): - logger.debug(f"Pipeline file not found: {pipeline_path}") - return None + # Direct lookups by convention + candidates = [model_type, model_type.replace("-", "_")] + for dir_name in candidates: + pipeline_path = get_pipeline_path(dir_name, "pipeline.yaml") + if pipeline_path.exists(): + return cls._parse_pipeline_yaml(pipeline_path, model_type) + + # Scan fallback: read every pipeline.yaml and match on declared fields + hf_archs = set(getattr(hf_config, "architectures", []) or []) if hf_config else set() + if _MODELS_DIR.exists(): + for subdir in sorted(_MODELS_DIR.iterdir()): + if not subdir.is_dir(): + continue + pipeline_path = subdir / "pipeline.yaml" + if not pipeline_path.exists(): + continue + try: + cfg = load_yaml_config(pipeline_path) + except Exception as exc: + logger.debug("Skip %s: %s", pipeline_path, exc) + continue + declared_type = getattr(cfg, "model_type", None) + declared_archs = set(getattr(cfg, "hf_architectures", None) or []) + if declared_type == model_type or (hf_archs and hf_archs.intersection(declared_archs)): + return cls._parse_pipeline_yaml(pipeline_path, declared_type or model_type) - return cls._parse_pipeline_yaml(pipeline_path, model_type) + logger.debug("No pipeline.yaml found for model_type %s (archs=%s)", model_type, sorted(hf_archs)) + return None # Keys consumed as explicit StageConfig fields — everything else is # passed through via yaml_extras. @@ -542,66 +1268,17 @@ def _auto_detect_model_type(cls, model: str, trust_remote_code: bool = True) -> return None, None - # Keys that should never be forwarded as engine overrides (internal / - # orchestrator-only knobs, complex objects, etc.). - _INTERNAL_KEYS: set[str] = { - "model", - "stage_configs_path", - "stage_id", - "stage_init_timeout", - "init_timeout", - "shm_threshold_bytes", - "worker_backend", - "ray_address", - "batch_timeout", - "log_stats", - "tokenizer", - "parallel_config", - } - @classmethod def _merge_cli_overrides( cls, stage: StageConfig, cli_overrides: dict[str, Any], ) -> dict[str, Any]: - """Merge CLI overrides into stage runtime config. - - All CLI arguments registered by engine config classes (e.g. - EngineArgs / OmniDiffusionConfig) are accepted as overrides - unless they appear in ``_INTERNAL_KEYS``. - - Handles: - - Global overrides (apply to all stages) - - Per-stage overrides (--stage-N-* format, take precedence) - - Args: - stage: The stage to merge overrides into. - cli_overrides: CLI arguments from VllmConfig/OmniDiffusionConfig. + """Merge global and per-stage (``stage_N_*``) CLI overrides. - Returns: - Dict of runtime overrides for this stage. 
+ Orchestrator-owned keys are filtered by ``build_stage_runtime_overrides`` + using ``OrchestratorArgs`` as the single source of truth; unknown + server/uvicorn keys are dropped downstream by + ``filter_dataclass_kwargs(OmniEngineArgs, ...)``. """ - result: dict[str, Any] = {} - - # Apply global overrides – any key not in the internal blocklist - # is forwarded so that engine-registered params work out of the box. - for key, value in cli_overrides.items(): - if key in cls._INTERNAL_KEYS: - continue - if re.match(r"stage_\d+_", key): - # Per-stage keys handled below - continue - if value is not None: - result[key] = value - - # Apply per-stage overrides (--stage-N-* format, take precedence) - stage_prefix = f"stage_{stage.stage_id}_" - for key, value in cli_overrides.items(): - if key.startswith(stage_prefix) and value is not None: - param_name = key[len(stage_prefix) :] - if param_name in cls._INTERNAL_KEYS: - continue - result[param_name] = value - - return result + return build_stage_runtime_overrides(stage.stage_id, cli_overrides) diff --git a/vllm_omni/deploy/qwen2_5_omni.yaml b/vllm_omni/deploy/qwen2_5_omni.yaml new file mode 100644 index 0000000000..41aef0df6f --- /dev/null +++ b/vllm_omni/deploy/qwen2_5_omni.yaml @@ -0,0 +1,92 @@ +# Qwen2.5-Omni deploy: CUDA defaults + platform overrides, verified on 2x H100. +# Stage 2 disables flashinfer autotune because its DiT block never invokes +# flashinfer; the autotune dummy run OOMs the shared cuda:0 device otherwise. +# +# Fields omitted from a stage fall back to StageDeployConfig dataclass +# defaults (see vllm_omni/config/stage_config.py). For instance, every +# stage here uses vLLM's default max_num_batched_tokens=32768 because +# chat-sized prefill comfortably fits; only models with codec prefill +# (Qwen3-Omni, Qwen3-TTS) need to bump it above 32k. +# +# enforce_eager policy across the three deploy YAMLs: +# * code2wav / generation stages: always true (cudagraph incompatible with +# the custom generation loop — set explicitly everywhere). +# * AR stages (thinker, talker): model-dependent. Qwen2.5-Omni runs eager +# on CUDA (thinker uses custom ops that don't trace cleanly); NPU / XPU +# platform overrides flip back to false where cudagraph is verified. +# Qwen3-Omni / Qwen3-TTS AR stages use the default (false = cudagraph on). +async_chunk: false + +stages: + - stage_id: 0 + max_num_seqs: 1 + gpu_memory_utilization: 0.8 + enforce_eager: true + mm_processor_cache_gb: 0 + devices: "0" + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + repetition_penalty: 1.1 + + - stage_id: 1 + max_num_seqs: 1 + gpu_memory_utilization: 0.8 + enforce_eager: true + devices: "1" + default_sampling_params: + temperature: 0.9 + top_p: 0.8 + top_k: 40 + max_tokens: 2048 + seed: 42 + repetition_penalty: 1.05 + + - stage_id: 2 + max_num_seqs: 1 + gpu_memory_utilization: 0.15 + enforce_eager: true + enable_flashinfer_autotune: false + async_scheduling: false + devices: "0" + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + repetition_penalty: 1.1 + +platforms: + npu: + stages: + # NPU has cudagraph support for the thinker, unlike GPU which still + # only runs eager. + - stage_id: 0 + enforce_eager: false + - stage_id: 2 + # 3-NPU layout: stage 2 lives on its own card. + devices: "2" + + rocm: + stages: + - stage_id: 2 + # 3-GPU MI325 layout: stage 2 on a separate card. + devices: "2" + + xpu: + stages: + # Verified on 2x Intel Arc Pro B60. 
Both AR stages use cudagraphs. + - stage_id: 0 + gpu_memory_utilization: 0.9 + enforce_eager: false + - stage_id: 1 + gpu_memory_utilization: 0.5 + enforce_eager: false + - stage_id: 2 + gpu_memory_utilization: 0.3 + # Stage 2 colocates with stage 1's device on XPU. + devices: "1" diff --git a/vllm_omni/deploy/qwen3_omni_moe.yaml b/vllm_omni/deploy/qwen3_omni_moe.yaml new file mode 100644 index 0000000000..fb8b616213 --- /dev/null +++ b/vllm_omni/deploy/qwen3_omni_moe.yaml @@ -0,0 +1,98 @@ +# Qwen3-Omni-MoE production deploy, verified on 2x H100 (stage 0 on cuda:0, +# stages 1+2 on cuda:1). +# +# Fields omitted from a stage fall back to StageDeployConfig defaults (see +# vllm_omni/config/stage_config.py). Notable implicit defaults for this +# model: +# * Stages 0/1 (thinker, talker) do not set max_num_batched_tokens — +# chat-sized prefill fits in the 32768 default. +# * Stages 0/1 do not set enforce_eager — cudagraph runs by default +# (false). Stage 2 (code2wav) sets true because its generation loop +# is cudagraph-incompatible. +# * Platform sections flip enforce_eager per-stage where platform +# cudagraph support differs. +async_chunk: true + +connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + codec_chunk_frames: 25 + codec_left_context_frames: 25 + +stages: + - stage_id: 0 + gpu_memory_utilization: 0.9 + devices: "0" + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 42 + repetition_penalty: 1.05 + + - stage_id: 1 + gpu_memory_utilization: 0.6 + devices: "1" + input_connectors: + from_stage_0: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + repetition_penalty: 1.05 + + - stage_id: 2 + gpu_memory_utilization: 0.1 + max_num_seqs: 1 + enforce_eager: true + async_scheduling: false + max_num_batched_tokens: 51200 + devices: "1" + input_connectors: + from_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + repetition_penalty: 1.1 + +platforms: + npu: + stages: + - stage_id: 0 + gpu_memory_utilization: 0.6 + tensor_parallel_size: 2 + devices: "0,1" + - stage_id: 1 + gpu_memory_utilization: 0.6 + enforce_eager: true + devices: "2" + - stage_id: 2 + gpu_memory_utilization: 0.3 + devices: "2" + + rocm: + stages: + - stage_id: 0 + enforce_eager: true + + xpu: + stages: + - stage_id: 0 + tensor_parallel_size: 4 + enforce_eager: true + max_cudagraph_capture_size: 0 + devices: "0,1,2,3" + - stage_id: 1 + enforce_eager: true + max_cudagraph_capture_size: 0 + devices: "4" + - stage_id: 2 + gpu_memory_utilization: 0.3 + max_cudagraph_capture_size: 0 + devices: "4" diff --git a/vllm_omni/deploy/qwen3_tts.yaml b/vllm_omni/deploy/qwen3_tts.yaml new file mode 100644 index 0000000000..32dceebd80 --- /dev/null +++ b/vllm_omni/deploy/qwen3_tts.yaml @@ -0,0 +1,81 @@ +# Qwen3-TTS deploy: talker → code2wav via shared-memory chunk streaming. +# Verified on 1x H100. +# +# Fields omitted from a stage fall back to StageDeployConfig defaults (see +# vllm_omni/config/stage_config.py). Notable choices for this model: +# * Stage 0 (talker) sets max_num_batched_tokens=512 for async-chunk +# latency tuning (not correctness) — small per-step batches keep +# first-chunk latency low. +# * Stage 1 (code2wav) sets max_num_batched_tokens=65536 for correctness: +# codec prefill length (Q * num_frames) exceeds the 32k default. 
+# * Stage 0 does not set enforce_eager — talker runs cudagraph by default. +# Stage 1 sets true because its codec generation loop is not +# cudagraph-compatible. NPU platform flips stage 0 to true where +# cudagraph is not yet verified. +async_chunk: true + +connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + # Must match the decoder sliding attention window. + codec_chunk_frames: 25 + codec_left_context_frames: 72 + +stages: + - stage_id: 0 + max_num_seqs: 10 + gpu_memory_utilization: 0.3 + async_scheduling: true + max_num_batched_tokens: 512 + max_model_len: 4096 + devices: "0" + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + repetition_penalty: 1.05 + + - stage_id: 1 + max_num_seqs: 1 + gpu_memory_utilization: 0.3 + enforce_eager: true + async_scheduling: true + # Must be divisible by num_code_groups and cover (left_context + chunk). + # Prefill length is Q * num_frames (e.g. 16 * 2148 = 34368); keep + # headroom past 32k. + max_num_batched_tokens: 65536 + # async_chunk appends windows per step; max_model_len must cover the + # accumulated flat codec stream. + max_model_len: 65536 + devices: "0" + input_connectors: + from_stage_0: connector_of_shared_memory + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + repetition_penalty: 1.0 + +platforms: + npu: + stages: + # NPU does not yet support async-scheduling for TTS, and the + # talker fits at max_num_seqs=1 only. + - stage_id: 0 + max_num_seqs: 1 + enforce_eager: true + async_scheduling: false + - stage_id: 1 + gpu_memory_utilization: 0.2 + async_scheduling: false diff --git a/vllm_omni/distributed/omni_connectors/utils/initialization.py b/vllm_omni/distributed/omni_connectors/utils/initialization.py index 0497bbb3a2..f012af3c9c 100644 --- a/vllm_omni/distributed/omni_connectors/utils/initialization.py +++ b/vllm_omni/distributed/omni_connectors/utils/initialization.py @@ -206,6 +206,19 @@ def load_omni_transfer_config( if config_dict is None: return None + # Normalize new-schema (top-level ``connectors`` + ``stages``) into the + # legacy ``runtime.connectors`` + ``stage_args`` shape the parser reads. 
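+    # Sketch of the transformation (hypothetical minimal config):
+    #   {"connectors": {"shm": {...}}, "stages": [{"stage_id": 0}]}
+    # becomes
+    #   {..., "runtime": {"connectors": {"shm": {...}}}, "stage_args": [{"stage_id": 0}]}
+    # (the original top-level keys are left in place; the parser ignores them).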
+ if "stages" in config_dict and "stage_args" not in config_dict: + normalized: dict[str, Any] = dict(config_dict) + runtime = dict(normalized.get("runtime") or {}) + if "connectors" in normalized and "connectors" not in runtime: + runtime["connectors"] = normalized["connectors"] + if "edges" in normalized and "edges" not in runtime: + runtime["edges"] = normalized["edges"] + normalized["runtime"] = runtime + normalized["stage_args"] = normalized["stages"] + config_dict = normalized + # Parse connectors connectors = {} runtime_config = config_dict.get("runtime", {}) diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index 5b69d6b1f0..d98ce7d419 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -3,7 +3,7 @@ import json import os import tempfile -from dataclasses import dataclass, field +from dataclasses import dataclass, field, fields from typing import Any from vllm.engine.arg_utils import EngineArgs @@ -300,3 +300,254 @@ def create_model_config(self) -> OmniModelConfig: def output_modality(self) -> OutputModality: """Parse engine_output_type into a type-safe OutputModality flag.""" return OutputModality.from_string(self.engine_output_type) + + +# ============================================================================ +# CLI argument routing +# ============================================================================ +# +# vLLM-Omni's CLI flags live in three buckets: +# +# ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ +# │ OrchestratorArgs │ │ OmniEngineArgs │ │ (upstream vllm) │ +# │ │ │ │ │ server/api │ +# │ stage_timeout │ │ max_num_seqs │ │ host, port │ +# │ worker_backend │ │ gpu_mem_util │ │ ssl_keyfile │ +# │ deploy_config │ │ dtype, quant │ │ api_key │ +# │ ... │ │ ... │ │ ... │ +# └──────────────────┘ └──────────────────┘ └──────────────────┘ +# │ │ │ +# ▼ ▼ ▼ +# orchestrator each stage uvicorn / +# consumes engine FastAPI +# +# Fields in ``SHARED_FIELDS`` (e.g. ``model``, ``log_stats``) flow to BOTH +# orchestrator and engine by design. +# +# Invariants enforced by ``tests/test_arg_utils.py``: +# +# 1. ``OrchestratorArgs`` ∩ ``OmniEngineArgs`` ⊆ ``SHARED_FIELDS`` +# 2. Every CLI flag is classifiable into one of the three buckets +# 3. User-typed flags that match none of the above are logged as dropped +# +# Adding a new orchestrator-only flag → add a field to ``OrchestratorArgs``. +# Everything else is automatic. + + +@dataclass(frozen=True) +class OrchestratorArgs: + """CLI flags consumed by the orchestrator. + + Contract: every field here is either + (a) orchestrator-only (never needed by a stage engine), OR + (b) orchestrator-read-then-redistributed (e.g. ``async_chunk`` is read + from CLI, written to ``DeployConfig``, then propagated to every + stage via ``merge_pipeline_deploy`` — not via direct kwargs + forwarding). + + Fields that BOTH orchestrator and engine genuinely need (e.g. ``model``, + ``log_stats``) should be listed in ``SHARED_FIELDS`` below; ``split_kwargs`` + will copy them to both buckets. 
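+
+    Example (a sketch — values illustrative)::
+
+        OrchestratorArgs(worker_backend="ray", ray_address="auto",
+                         deploy_config="vllm_omni/deploy/qwen3_tts.yaml")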
+ """ + + # === Lifecycle === + stage_init_timeout: int = 300 + init_timeout: int = 600 + + # === Cross-stage Communication === + shm_threshold_bytes: int = 65536 + batch_timeout: int = 10 + + # === Cluster / Backend === + worker_backend: str = "multi_process" + ray_address: str | None = None + + # === Config Files === + stage_configs_path: str | None = None + deploy_config: str | None = None + stage_overrides: str | None = None # raw JSON string; parsed downstream + + # === Mode Switches (orchestrator reads, DeployConfig redistributes) === + async_chunk: bool | None = None + + # === Observability === + log_stats: bool = False + + # === Headless Mode (also forwarded to engine — see SHARED_FIELDS) === + stage_id: int | None = None + + # === Pre-built Objects === + parallel_config: Any = None + + # === Multi-stage guards === + # --tokenizer is captured here so it does not propagate to every stage + # uniformly (different stages often need different tokenizers, e.g. + # qwen3_omni thinker vs talker). Users wanting a per-stage tokenizer + # should set it in the deploy YAML. + tokenizer: str | None = None + + +# Fields that live in BOTH OrchestratorArgs and OmniEngineArgs by design. +# Changes to this set are a review red flag — revisit the contract. +SHARED_FIELDS: frozenset[str] = frozenset( + { + "model", # orch: detect model_type; engine: load weights + "stage_id", # orch: route (headless); engine: identity + "log_stats", # both want the flag + "stage_configs_path", # orch: load legacy YAML; engine: may reference for validation + } +) + + +def orchestrator_field_names() -> frozenset[str]: + """Return the names of every field on OrchestratorArgs.""" + return frozenset(f.name for f in fields(OrchestratorArgs)) + + +def internal_blacklist_keys() -> frozenset[str]: + """Return the set of CLI keys that must never be forwarded as per-stage + engine overrides. + + Derived from ``OrchestratorArgs`` fields minus ``SHARED_FIELDS``, so + adding a new orchestrator-owned flag is a one-line change to the + dataclass — this function updates automatically. + """ + return orchestrator_field_names() - SHARED_FIELDS + + +def split_kwargs( + kwargs: dict[str, Any], + *, + engine_cls: type | None = None, + user_typed: set[str] | None = None, + strict: bool = False, +) -> tuple[OrchestratorArgs, dict[str, Any]]: + """Partition CLI kwargs into (orchestrator, engine) buckets. + + Args: + kwargs: Raw dict, typically ``vars(args)``. + engine_cls: Engine dataclass used to whitelist-filter the engine + bucket. Defaults to ``OmniEngineArgs``. Pass a custom class + for testing. + user_typed: Keys the user actually typed on the command line. Used + to warn when a user-typed flag is unclassifiable. + strict: If True, raise ``ValueError`` on ambiguous (double-classified + but not in ``SHARED_FIELDS``) fields. Default False to keep the + rollout non-breaking; flip to True in tests and CI. + + Returns: + ``(orchestrator_args, engine_kwargs)``. ``engine_kwargs`` has already + been whitelist-filtered against ``engine_cls`` — safe to pass directly + to ``engine_cls(**engine_kwargs)``. 
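+
+    Example (a sketch — values illustrative)::
+
+        orch, engine_kwargs = split_kwargs(
+            {"worker_backend": "ray", "model": "m", "max_num_seqs": 4}
+        )
+        # orch.worker_backend == "ray"; engine_kwargs == {"model": "m",
+        # "max_num_seqs": 4} — "model" is shared, "max_num_seqs" engine-only.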
+ """ + if engine_cls is None: + engine_cls = OmniEngineArgs + + orch_fields = orchestrator_field_names() + engine_fields = {f.name for f in fields(engine_cls)} + + orch_kwargs: dict[str, Any] = {} + engine_candidate: dict[str, Any] = {} + shared_values: dict[str, Any] = {} + unclassified: dict[str, Any] = {} + + for key, value in kwargs.items(): + in_orch = key in orch_fields + in_engine = key in engine_fields + is_shared = key in SHARED_FIELDS + + if is_shared: + shared_values[key] = value + elif in_orch and in_engine: + # Declared in both but not marked shared → ambiguous. + msg = ( + f"Field {key!r} is defined on both OrchestratorArgs and " + f"{engine_cls.__name__} but is not in SHARED_FIELDS. " + f"This causes double-routing. Either remove the duplicate or " + f"add {key!r} to SHARED_FIELDS if the sharing is intentional." + ) + if strict: + raise ValueError(msg) + logger.error(msg) + # Default: treat as orchestrator-only to preserve existing behavior. + orch_kwargs[key] = value + elif in_orch: + orch_kwargs[key] = value + elif in_engine: + engine_candidate[key] = value + else: + unclassified[key] = value + + # Warn on user-typed but unclassifiable flags so we don't silently drop + # something the user cared about (fixes the class of bug that spawned #873). + if unclassified and user_typed: + user_typed_unknown = sorted(k for k in unclassified if k in user_typed) + if user_typed_unknown: + logger.warning( + "CLI flags not consumed by vllm-omni and dropped before " + "per-stage engine construction: %s. If these are vllm " + "frontend/uvicorn flags (host, port, ssl_*, api_key, …) this " + "is expected; otherwise check your spelling.", + user_typed_unknown, + ) + + # Engine bucket: shared + engine-only. We do NOT pass through unclassified + # fields — that's exactly the server/uvicorn noise we want to shed. + engine_kwargs = {**shared_values, **engine_candidate} + + # Construct the orchestrator dataclass. Shared fields that OrchestratorArgs + # also declares get copied into its constructor. + orch_init: dict[str, Any] = dict(orch_kwargs) + for key, value in shared_values.items(): + if key in orch_fields: + orch_init[key] = value + orch_args = OrchestratorArgs(**orch_init) + + return orch_args, engine_kwargs + + +def derive_server_dests_from_vllm_parser() -> frozenset[str]: + """Derive the set of argparse dests that belong to vllm's frontend/server. + + Returns every dest registered by ``make_arg_parser`` that is NOT a field + of ``OmniEngineArgs`` and NOT a field of ``OrchestratorArgs``. Useful for + CI tests to assert all CLI flags are classifiable without maintaining + a hardcoded server list. + + Returns empty frozenset if vllm's parser cannot be built (e.g. in a + minimal test environment). 
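+
+    Example (a sketch)::
+
+        server_dests = derive_server_dests_from_vllm_parser()
+        # e.g. {"host", "port", "api_key", ...} with vllm installed;
+        # frozenset() when the parser cannot be built.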
+ """ + try: + from vllm.entrypoints.openai.cli_args import make_arg_parser + from vllm.utils.argparse_utils import FlexibleArgumentParser + except ImportError: + logger.debug("Cannot import vllm parser — server-dest derivation skipped") + return frozenset() + + try: + parser = make_arg_parser(FlexibleArgumentParser()) + all_dests = {a.dest for a in parser._actions if a.dest and a.dest != "help"} + except Exception as exc: + logger.debug("Failed to build vllm parser: %s", exc) + return frozenset() + + engine_fields = {f.name for f in fields(OmniEngineArgs)} + orch_fields = orchestrator_field_names() + + return frozenset(all_dests - engine_fields - orch_fields - SHARED_FIELDS) + + +def orchestrator_args_from_argparse(args: Any) -> OrchestratorArgs: + """Build an ``OrchestratorArgs`` from an ``argparse.Namespace``. + + Only copies attributes that exist on the namespace — missing fields fall + back to the dataclass default. Useful when the full parser is already + built and ``vars(args)`` would include noise. + """ + kwargs: dict[str, Any] = {} + for f in fields(OrchestratorArgs): + if hasattr(args, f.name): + value = getattr(args, f.name) + if value is not None or f.default is None: + kwargs[f.name] = value + return OrchestratorArgs(**kwargs) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index fe6527349c..b796401fe0 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -33,6 +33,7 @@ from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.input_processor import InputProcessor +from vllm_omni.config.stage_config import strip_parent_engine_args from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient from vllm_omni.diffusion.stage_diffusion_proc import ( @@ -94,6 +95,27 @@ logger = init_logger(__name__) +# ============================================================================ +# Parent-EngineArgs field-routing contracts (consumed by +# AsyncOmniEngine._strip_parent_engine_args when ``stage_configs_path`` is set). +# ============================================================================ + +# Fields that must survive the "equal to default → strip" filter because +# diffusion stages need them even when equal to vllm's default value +# (e.g. colocate worker setup relies on worker_extension_cls being forwarded). +_PARENT_ARGS_KEEP: frozenset[str] = frozenset({"worker_extension_cls"}) + +# Omni orchestrator-level fields consumed by ``_resolve_stage_configs`` that +# must never leak into per-stage EngineArgs (``stage_configs_path`` would +# trigger the ``create_model_config`` guard). +_PARENT_ARGS_STRIP: frozenset[str] = frozenset({"stage_configs_path"}) + +# Fields always populated by callers (via ``from_cli_args`` / ``asdict``) so +# their presence as an override is never a surprise — suppress the +# "override ignored" warning for these. +_PARENT_ARGS_NO_WARN: frozenset[str] = frozenset({"model"}) + + def _patch_generation_config_if_needed(model_config: Any) -> None: """Ensure try_get_generation_config won't crash for models whose HF config.json lacks model_type (e.g. CosyVoice3). We probe it once; @@ -1310,45 +1332,17 @@ def _strip_single_engine_args(kwargs: dict[str, Any]) -> dict[str, Any]: Logs a warning for any parent field whose value differs from the dataclass default, so users know their explicit overrides are ignored. 
+ See the module-level ``_PARENT_ARGS_*`` constants for the routing + contracts this method enforces. """ - # worker_extension_cls is a parent field but must pass through to - # diffusion stages for colocate worker setup. - _keep = {"worker_extension_cls"} - # Orchestrator-level OmniEngineArgs fields that are consumed by - # _resolve_stage_configs and must not leak into per-stage configs - # (stage_configs_path would trigger the create_model_config guard). - _strip_omni = {"stage_configs_path"} - # Fields that are always set by callers (via from_cli_args / asdict) - # and would always appear as overridden — suppress from the warning - # so it only surfaces genuinely surprising overrides. - _no_warn = {"model"} - parent_fields: dict[str, dataclasses.Field] = {f.name: f for f in dataclasses.fields(EngineArgs)} - overridden: list[str] = [] - result: dict[str, Any] = {} - for k, v in kwargs.items(): - if k in _strip_omni: - continue - if k not in parent_fields or k in _keep: - result[k] = v - continue - # Detect explicitly-set values that differ from the default. - # Values may have been through asdict() which converts dataclass - # defaults to dicts, so normalise before comparing. - field = parent_fields[k] - if field.default is not dataclasses.MISSING: - default = field.default - elif field.default_factory is not dataclasses.MISSING: - default = field.default_factory() - else: - default = dataclasses.MISSING - if default is dataclasses.MISSING or v is None: - continue - # Normalise dataclass defaults to dicts for comparison - if dataclasses.is_dataclass(default) and not isinstance(default, type): - default = dataclasses.asdict(default) - if v != default and k not in _no_warn: - overridden.append(k) + result, overridden = strip_parent_engine_args( + kwargs, + parent_fields=parent_fields, + keep_keys=_PARENT_ARGS_KEEP, + strip_keys=_PARENT_ARGS_STRIP, + no_warn_keys=_PARENT_ARGS_NO_WARN, + ) if overridden: logger.warning( @@ -1363,6 +1357,12 @@ def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[st """Resolve stage configs and inject defaults shared by orchestrator/headless.""" stage_configs_path = kwargs.get("stage_configs_path", None) + deploy_config_path = kwargs.pop("deploy_config", None) + stage_overrides_json = kwargs.pop("stage_overrides", None) + # Set of CLI keys the user actually typed; ``None`` means we have no + # parser-level info (e.g. programmatic Omni() call) and the lower + # layers should treat all kwargs as explicit. + cli_explicit_keys = kwargs.pop("_cli_explicit_keys", None) explicit_stage_configs = kwargs.pop("stage_configs", None) if explicit_stage_configs is not None: logger.warning( @@ -1375,13 +1375,27 @@ def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[st else: base_kwargs = kwargs - # Use the legacy config loading path (load_and_resolve_stage_configs). - # StageConfigFactory wiring will be done in config refactor [2/N]. + # Parse --stage-overrides JSON string if provided + stage_overrides = None + if stage_overrides_json: + if isinstance(stage_overrides_json, str): + try: + stage_overrides = json.loads(stage_overrides_json) + except json.JSONDecodeError as exc: + raise ValueError( + f"--stage-overrides is not valid JSON: {exc}. 
Got: {stage_overrides_json!r}" + ) from exc + else: + stage_overrides = stage_overrides_json + config_path, stage_configs = load_and_resolve_stage_configs( model, stage_configs_path, base_kwargs, default_stage_cfg_factory=lambda: self._create_default_diffusion_stage_cfg(kwargs), + deploy_config_path=deploy_config_path, + stage_overrides=stage_overrides, + cli_explicit_keys=cli_explicit_keys, ) # Inject diffusion LoRA-related knobs from kwargs if not present in the stage config. diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index c697e34bac..89dfdc163c 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -434,6 +434,20 @@ def build_vllm_config( filtered_engine_args_dict = filter_dataclass_kwargs(OmniEngineArgs, engine_args_dict) omni_engine_args = OmniEngineArgs(**filtered_engine_args_dict) + + # Multi-stage pipelines (qwen3_tts code2wav, etc.) set max_model_len + # larger than HF max_position_embeddings by design. vLLM's validator + # rejects that without the env flag. + if filtered_engine_args_dict.get("max_model_len") is not None and not os.environ.get( + "VLLM_ALLOW_LONG_MAX_MODEL_LEN" + ): + os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1" + logger.debug( + "Auto-set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 for stage %s (max_model_len=%s).", + stage_config.stage_id, + filtered_engine_args_dict["max_model_len"], + ) + vllm_config = omni_engine_args.create_engine_config( usage_context=UsageContext.LLM_CLASS, headless=headless, diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py index 6e9adc2461..8bccfbb591 100644 --- a/vllm_omni/entrypoints/cli/serve.py +++ b/vllm_omni/entrypoints/cli/serve.py @@ -9,6 +9,7 @@ import json import os import signal +import sys from types import FrameType from typing import Any @@ -21,6 +22,7 @@ from vllm_omni.entrypoints.cli.logo import log_logo from vllm_omni.entrypoints.openai.api_server import omni_run_server +from vllm_omni.entrypoints.utils import detect_explicit_cli_keys logger = init_logger(__name__) @@ -79,6 +81,9 @@ class OmniServeCommand(CLISubcommand): """The `serve` subcommand for the vLLM CLI.""" name = "serve" + # Parser stashed at subparser_init so ``cmd`` can resolve each user-typed + # flag to its real ``dest`` via the parser's action table. + _parser: FlexibleArgumentParser | None = None @staticmethod def cmd(args: argparse.Namespace) -> None: @@ -90,6 +95,10 @@ def cmd(args: argparse.Namespace) -> None: if hasattr(args, "model_tag") and args.model_tag is not None: args.model = args.model_tag + # Stash the set of long-option keys the user actually typed so the + # stage-config factory can give YAML precedence over argparse defaults. + args._cli_explicit_keys = detect_explicit_cli_keys(sys.argv[1:], OmniServeCommand._parser) + if args.headless: run_headless(args) else: @@ -138,11 +147,33 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu help="Default task type for TTS models (CustomVoice, VoiceDesign, or Base). " "If not specified, will be inferred from model path.", ) + # TODO(@lishunyang12): deprecate once all models migrate to --deploy-config omni_config_group.add_argument( "--stage-configs-path", type=str, default=None, - help="Path to the stage configs file. If not specified, the stage configs will be loaded from the model.", + help="[Deprecated — will be removed in a future release] Path to a legacy " + "stage configs YAML (stage_args format). 
Prefer --deploy-config for new-format deploy YAMLs.", + ) + omni_config_group.add_argument( + "--deploy-config", + type=str, + default=None, + help="Path to a deploy config YAML (new format with stages/engine_args). " + "Mutually exclusive with --stage-configs-path.", + ) + omni_config_group.add_argument( + "--stage-overrides", + type=str, + default=None, + help="Per-stage JSON overrides. Example: " + '\'{"0": {"gpu_memory_utilization": 0.8}, "2": {"enforce_eager": true}}\'', + ) + omni_config_group.add_argument( + "--async-chunk", + action=argparse.BooleanOptionalAction, + default=None, + help="Override the deploy YAML's ``async_chunk:`` bool. Unset leaves the YAML value in force.", ) omni_config_group.add_argument( "--stage-id", @@ -406,6 +437,9 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu action="store_true", help="Enable diffusion pipeline profiler to display stage durations.", ) + # Stash via type(self) so the docs hook (which execs this function in a + # sandboxed globals dict via ``DummySelf``) doesn't fail on a NameError. + type(self)._parser = serve_parser return serve_parser @@ -461,10 +495,15 @@ def run_headless(args: argparse.Namespace) -> None: raise ValueError("headless mode requires worker_backend=multi_process") args_dict = vars(args).copy() + # Preserve the explicit-keys set captured at parse time so per-stage yaml + # values (e.g. stage 1's ``gpu_memory_utilization: 0.5``) are not + # overwritten by argparse defaults for flags the user didn't type. + cli_explicit_keys = args_dict.pop("_cli_explicit_keys", None) config_path, stage_configs = load_and_resolve_stage_configs( model, args_dict.get("stage_configs_path"), args_dict, + cli_explicit_keys=cli_explicit_keys, ) # Locate the stage config that matches stage_id. diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 82c6489247..023a2e16cf 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -1,6 +1,8 @@ from __future__ import annotations +import argparse import os +import sys import time import types import weakref @@ -15,7 +17,7 @@ from vllm_omni.engine.async_omni_engine import AsyncOmniEngine from vllm_omni.entrypoints.client_request_state import ClientRequestState from vllm_omni.entrypoints.pd_utils import PDDisaggregationMixin -from vllm_omni.entrypoints.utils import get_final_stage_id_for_e2e +from vllm_omni.entrypoints.utils import detect_explicit_cli_keys, get_final_stage_id_for_e2e from vllm_omni.metrics.stats import OrchestratorAggregator as OrchestratorMetrics from vllm_omni.model_executor.model_loader.weight_utils import download_weights_from_hf_specific from vllm_omni.outputs import OmniRequestOutput @@ -69,6 +71,48 @@ def omni_snapshot_download(model_id: str) -> str: class OmniBase(PDDisaggregationMixin): """Shared runtime foundation for AsyncOmni and Omni.""" + @classmethod + def from_cli_args( + cls, + args: argparse.Namespace, + *, + parser: argparse.ArgumentParser | None = None, + **overrides: Any, + ) -> OmniBase: + """Construct an ``Omni`` / ``AsyncOmni`` from an ``argparse.Namespace``. + + Mirrors the ``EngineArgs.from_cli_args`` pattern used upstream and in + ``OmniEngineArgs.from_cli_args``. This is the recommended entry point + for any argparse-based caller (offline scripts, tests, CI): it + expands ``vars(args)`` into kwargs and automatically captures which + flags the user typed on the command line so that argparse defaults + do not silently override deploy YAML values. 
+ + Passing ``parser`` is strongly recommended: without it, flag-to-dest + resolution falls back to a name-based heuristic that misidentifies + flags with ``dest=`` overrides, alias flags, and ``--disable-X`` / + ``store_false`` pairs. See :func:`detect_explicit_cli_keys`. + + Args: + args: Parsed argparse namespace from ``parser.parse_args()``. + parser: The argparse parser used to produce ``args``. When + provided, each user-typed flag is resolved to its real + ``dest`` via the parser's action table. + **overrides: Extra keyword arguments that take precedence over + attributes on ``args``. + + Example:: + + parser = FlexibleArgumentParser() + parser.add_argument("--model", required=True) + args = parser.parse_args() + omni = Omni.from_cli_args(args, parser=parser) # preferred + omni = Omni.from_cli_args(args, parser=parser, model="other") + """ + kwargs: dict[str, Any] = {**vars(args), **overrides} + kwargs["_cli_explicit_keys"] = detect_explicit_cli_keys(sys.argv[1:], parser) + return cls(**kwargs) + def __init__( self, model: str, diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index 84391c2ea8..5757d38990 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -1,3 +1,4 @@ +import argparse import os import types from collections import Counter @@ -5,10 +6,12 @@ from pathlib import Path from typing import Any, get_args, get_origin +import yaml from vllm.logger import init_logger from vllm.transformers_utils.config import get_config, get_hf_file_to_dict from vllm.transformers_utils.repo_utils import file_or_path_exists +from vllm_omni.config.stage_config import StageConfigFactory from vllm_omni.config.yaml_util import create_config, load_yaml_config, merge_configs from vllm_omni.entrypoints.stage_utils import _to_dict from vllm_omni.platforms import current_omni_platform @@ -23,6 +26,65 @@ } +def detect_explicit_cli_keys( + argv: list[str], + parser: argparse.ArgumentParser | None = None, +) -> set[str]: + """Walk ``argv`` and return the set of ``dest`` attribute names the user + explicitly provided (e.g. ``--max-num-seqs 64`` → ``max_num_seqs``). + + Used to distinguish user-typed CLI args from argparse default values so + deploy YAMLs are not silently overridden by parser defaults. Shared + across online (``vllm serve``) and offline (scripts, examples, tests, + CI) entry points — offline callers that parse CLI args via argparse + should invoke this on ``sys.argv[1:]`` and pass the result through to + ``AsyncOmni`` / ``Omni`` via the ``_cli_explicit_keys`` kwarg. + + When ``parser`` is provided, each token is looked up in the parser's + action table to find its real ``dest``. This correctly handles flags + with ``dest=`` overrides, alias flags (e.g. ``--usp`` / + ``--ulysses-degree`` both mapping to ``ulysses_degree``), and + ``--disable-foo`` / ``store_false`` patterns that map to a differently + named dest. Callers with access to an ``argparse.ArgumentParser`` should + always pass it. + + When ``parser`` is ``None``, a name-based heuristic is used as a + fallback (hyphens → underscores, plus a ``no_`` prefix strip for + ``argparse.BooleanOptionalAction``). This is correct for simple flags + but silently misidentifies ``--disable-X``-style flags and explicit + ``dest=`` overrides, so prefer the parser-aware form. 
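+
+    Example (heuristic fallback path, no parser — a sketch)::
+
+        detect_explicit_cli_keys(["--max-num-seqs", "64", "--no-enable-prefix-caching"])
+        # -> {"max_num_seqs", "no_enable_prefix_caching", "enable_prefix_caching"}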
+ """ + if parser is not None: + dest_map: dict[str, str] = {} + for action in parser._actions: + for opt in action.option_strings: + dest_map[opt] = action.dest + explicit: set[str] = set() + for tok in argv: + if not tok.startswith("--"): + continue + flag = tok.split("=", 1)[0] + dest = dest_map.get(flag) + if dest is not None: + explicit.add(dest) + return explicit + + # Fallback: name-based heuristic (legacy path for callers without a parser). + explicit = set() + for tok in argv: + if not tok.startswith("--"): + continue + name = tok[2:].split("=", 1)[0] + if not name: + continue + attr = name.replace("-", "_") + explicit.add(attr) + # BooleanOptionalAction: --no-foo records as dest `foo`, not `no_foo`. + if attr.startswith("no_"): + explicit.add(attr[3:]) + return explicit + + def inject_omni_kv_config(stage: Any, omni_conn_cfg: dict[str, Any], omni_from: str, omni_to: str) -> None: """Inject connector configuration into stage engine arguments.""" # Prepare omni_kv_config dict @@ -273,29 +335,59 @@ def resolve_model_config_path(model: str) -> str: return str(stage_config_path) -def load_stage_configs_from_model(model: str, base_engine_args: dict | None = None) -> list: +def load_stage_configs_from_model( + model: str, + base_engine_args: dict | None = None, + deploy_config_path: str | None = None, + stage_overrides: dict[str, dict[str, Any]] | None = None, + cli_explicit_keys: set[str] | None = None, +) -> list: """Load stage configurations from model's default config file. - .. deprecated:: - This is the legacy OmegaConf-based loading path. New code should use - ``StageConfigFactory.create_from_model()`` instead. This function will - be removed once all callers are migrated (see PR series [2/N]). + For models registered in the pipeline registry (new path), uses + ``StageConfigFactory.create_from_model()`` which merges + PipelineConfig + DeployConfig + CLI overrides. - Loads stage configurations based on the model type and device type. - First tries to load a device-specific YAML file from stage_configs/{device_type}/ - directory. If not found, falls back to the default config file. + For other models (legacy path), loads stage configs from YAML. Args: model: Model name or path (used to determine model_type) + base_engine_args: Base engine args to merge as CLI overrides. + deploy_config_path: Optional explicit deploy config path. + stage_overrides: Per-stage overrides from --stage-overrides. + cli_explicit_keys: Set of CLI keys the user actually typed. When + provided, only these keys override deploy YAML; argparse defaults + stay subordinate to YAML. ``None`` means treat every kwarg as + explicit (programmatic ``Omni()`` calls). Returns: List of stage configuration dictionaries - - Raises: - FileNotFoundError: If no stage config file exists for the model type """ if base_engine_args is None: base_engine_args = {} + + cli_overrides = _convert_dataclasses_to_dict(dict(base_engine_args)) + # Per-stage JSON overrides are always explicit (the user typed --stage-overrides). 
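+    # e.g. stage_overrides={"1": {"enforce_eager": True}} expands to
+    # cli_overrides["stage_1_enforce_eager"] = True and is marked explicit.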
+    explicit = set(cli_explicit_keys) if cli_explicit_keys is not None else None
+    if stage_overrides:
+        for stage_id_str, overrides in stage_overrides.items():
+            for key, val in overrides.items():
+                stage_key = f"stage_{stage_id_str}_{key}"
+                cli_overrides[stage_key] = val
+                if explicit is not None:
+                    explicit.add(stage_key)
+
+    stages = StageConfigFactory.create_from_model(
+        model,
+        cli_overrides=cli_overrides,
+        deploy_config_path=deploy_config_path,
+        cli_explicit_keys=explicit,
+    )
+    if stages is not None:
+        # Convert StageConfig objects to OmegaConf for backward compat
+        return [stage.to_omegaconf() for stage in stages]
+
+    # Legacy fallback: load from YAML
     stage_config_path = resolve_model_config_path(model)
     if stage_config_path is None:
         return []
@@ -312,10 +404,9 @@ def load_stage_configs_from_yaml(
     base_engine_args: dict | None = None,
     prefer_stage_engine_args: bool = True,
 ) -> list:
-    """Load stage configurations from a YAML file.
+    """Load stage configurations from a YAML file (legacy OmegaConf path).
 
-    .. deprecated::
-        Legacy OmegaConf-based loader. Will be removed in PR series [2/N].
+    TODO(@lishunyang12): remove once all models use PipelineConfig + DeployConfig.
 
     Args:
         config_path: Path to the YAML configuration file
@@ -449,22 +540,75 @@ def load_and_resolve_stage_configs(
     stage_configs_path: str | None,
     kwargs: dict | None,
     default_stage_cfg_factory: Any = None,
+    deploy_config_path: str | None = None,
+    stage_overrides: dict[str, dict[str, Any]] | None = None,
+    cli_explicit_keys: set[str] | None = None,
 ) -> tuple[str, list]:
     """Load stage configurations from model or YAML file with fallback to defaults.
 
     Args:
         model: Model name or path
-        stage_configs_path: Optional path to YAML file containing stage configurations
+        stage_configs_path: Optional path to legacy YAML (stage_args format)
         kwargs: Engine arguments to merge with stage configs
         default_stage_cfg_factory: Optional callable that takes no args and
             returns default stage config list when no configs are found
+        deploy_config_path: Optional path to deploy YAML (new format).
+            Mutually exclusive with ``stage_configs_path``.
+        stage_overrides: Per-stage overrides from ``--stage-overrides`` JSON.
+            Keys are stage_id strings, values are dicts of overrides.
 
     Returns:
         Tuple of (config_path, stage_configs)
     """
-    if stage_configs_path is None:
+    if stage_configs_path is not None and deploy_config_path is not None:
+        raise ValueError(
+            "--stage-configs-path and --deploy-config are mutually exclusive: "
+            "they use different path resolution rules and loading paths. "
+            "Use --deploy-config for new-format YAMLs (preferred); "
+            "--stage-configs-path is kept only for the legacy `stage_args` format "
+            "and will be removed in a future release."
+        )
+    if stage_configs_path is not None and deploy_config_path is None:
+        if not os.path.exists(stage_configs_path):
+            raise FileNotFoundError(
+                f"--stage-configs-path {stage_configs_path!r} does not exist. "
+                "Legacy `stage_configs/` YAMLs were replaced by `vllm_omni/deploy/<model_type>.yaml`; "
+                "use --deploy-config. See docs/configuration/stage_configs.md."
+ ) + with open(stage_configs_path, encoding="utf-8") as f: + _peek = yaml.safe_load(f) or {} + if "stages" in _peek and "stage_args" not in _peek: + deploy_config_path = stage_configs_path + stage_configs_path = None + else: + logger.warning( + "--stage-configs-path is deprecated; migrate %r and use --deploy-config.", + stage_configs_path, + ) + + if deploy_config_path is not None: + config_path = deploy_config_path + stage_configs = load_stage_configs_from_model( + model, + base_engine_args=kwargs, + deploy_config_path=deploy_config_path, + stage_overrides=stage_overrides, + cli_explicit_keys=cli_explicit_keys, + ) + if not stage_configs: + if default_stage_cfg_factory is not None: + default_stage_cfg = default_stage_cfg_factory() + stage_configs = create_config(default_stage_cfg) + else: + stage_configs = [] + elif stage_configs_path is None: config_path = resolve_model_config_path(model) - stage_configs = load_stage_configs_from_model(model, base_engine_args=kwargs) + stage_configs = load_stage_configs_from_model( + model, + base_engine_args=kwargs, + stage_overrides=stage_overrides, + cli_explicit_keys=cli_explicit_keys, + ) if not stage_configs: if default_stage_cfg_factory is not None: default_stage_cfg = default_stage_cfg_factory() diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py b/vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py new file mode 100644 index 0000000000..2a9b247a1d --- /dev/null +++ b/vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Qwen2.5-Omni pipeline topology (frozen). + +Stage 0: Thinker — multimodal understanding + text generation +Stage 1: Talker — text embeddings → speech tokens +Stage 2: Code2Wav — speech tokens → audio waveform +""" + +from vllm_omni.config.stage_config import ( + PipelineConfig, + StageExecutionType, + StagePipelineConfig, + register_pipeline, +) + +_PROC = "vllm_omni.model_executor.stage_input_processors.qwen2_5_omni" + +QWEN2_5_OMNI_PIPELINE = PipelineConfig( + model_type="qwen2_5_omni", + model_arch="Qwen2_5OmniForConditionalGeneration", + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="thinker", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + final_output=True, + final_output_type="text", + owns_tokenizer=True, + requires_multimodal_data=True, + engine_output_type="latent", + sampling_constraints={"detokenize": True}, + ), + StagePipelineConfig( + stage_id=1, + model_stage="talker", + execution_type=StageExecutionType.LLM_AR, + input_sources=(0,), + engine_output_type="latent", + custom_process_input_func=f"{_PROC}.thinker2talker", + sampling_constraints={ + "detokenize": True, + "stop_token_ids": [8294], + }, + ), + StagePipelineConfig( + stage_id=2, + model_stage="code2wav", + execution_type=StageExecutionType.LLM_GENERATION, + input_sources=(1,), + final_output=True, + final_output_type="audio", + engine_output_type="audio", + sampling_constraints={"detokenize": True}, + ), + ), +) + +register_pipeline(QWEN2_5_OMNI_PIPELINE) + + +# Single-stage thinker-only variant for the abort test. 
+QWEN2_5_OMNI_THINKER_ONLY_PIPELINE = PipelineConfig( + model_type="qwen2_5_omni_thinker_only", + model_arch="Qwen2_5OmniForConditionalGeneration", + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="thinker", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + final_output=True, + final_output_type="text", + owns_tokenizer=True, + requires_multimodal_data=True, + engine_output_type="latent", + sampling_constraints={"detokenize": True}, + ), + ), +) + +register_pipeline(QWEN2_5_OMNI_THINKER_ONLY_PIPELINE) diff --git a/vllm_omni/model_executor/models/qwen3_omni/pipeline.py b/vllm_omni/model_executor/models/qwen3_omni/pipeline.py new file mode 100644 index 0000000000..fcaa7ba028 --- /dev/null +++ b/vllm_omni/model_executor/models/qwen3_omni/pipeline.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Qwen3-Omni-MoE pipeline topology (frozen). + +Stage 0: Thinker — multimodal understanding + text generation +Stage 1: Talker — text embeddings → RVQ codec codes +Stage 2: Code2Wav — RVQ codes → audio waveform +""" + +from vllm_omni.config.stage_config import ( + PipelineConfig, + StageExecutionType, + StagePipelineConfig, + register_pipeline, +) + +_PROC = "vllm_omni.model_executor.stage_input_processors.qwen3_omni" + +QWEN3_OMNI_PIPELINE = PipelineConfig( + model_type="qwen3_omni_moe", + model_arch="Qwen3OmniMoeForConditionalGeneration", + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="thinker", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + final_output=True, + final_output_type="text", + owns_tokenizer=True, + requires_multimodal_data=True, + hf_config_name="thinker_config", + engine_output_type="latent", + custom_process_next_stage_input_func=(f"{_PROC}.thinker2talker_async_chunk"), + sampling_constraints={"detokenize": True}, + ), + StagePipelineConfig( + stage_id=1, + model_stage="talker", + execution_type=StageExecutionType.LLM_AR, + input_sources=(0,), + hf_config_name="talker_config", + engine_output_type="latent", + custom_process_input_func=f"{_PROC}.thinker2talker", + custom_process_next_stage_input_func=(f"{_PROC}.talker2code2wav_async_chunk"), + sampling_constraints={ + "detokenize": False, + "stop_token_ids": [2150], + }, + ), + StagePipelineConfig( + stage_id=2, + model_stage="code2wav", + execution_type=StageExecutionType.LLM_GENERATION, + input_sources=(1,), + final_output=True, + final_output_type="audio", + hf_config_name="thinker_config", + engine_output_type="audio", + custom_process_input_func=f"{_PROC}.talker2code2wav", + sampling_constraints={"detokenize": True}, + ), + ), +) + +register_pipeline(QWEN3_OMNI_PIPELINE) diff --git a/vllm_omni/model_executor/models/qwen3_tts/pipeline.py b/vllm_omni/model_executor/models/qwen3_tts/pipeline.py new file mode 100644 index 0000000000..6c9ed44785 --- /dev/null +++ b/vllm_omni/model_executor/models/qwen3_tts/pipeline.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Qwen3-TTS pipeline: Talker (text → RVQ codec) → Code2Wav (codec → audio). + +Chunked vs end-to-end mode is dispatched from ``deploy.async_chunk``. 
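+
+Stage 0 declares the chunked hook (``talker2code2wav_async_chunk`` via
+``async_chunk_process_next_stage_input_func``) and stage 1 the end-to-end
+hook (``talker2code2wav`` via ``sync_process_input_func``); the deploy
+flag selects which one feeds code2wav.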
+""" + +from vllm_omni.config.stage_config import ( + PipelineConfig, + StageExecutionType, + StagePipelineConfig, + register_pipeline, +) + +_PROC = "vllm_omni.model_executor.stage_input_processors.qwen3_tts" + +QWEN3_TTS_PIPELINE = PipelineConfig( + model_type="qwen3_tts", + # Pipeline-level default; the code2wav stage overrides per-stage below. + model_arch="Qwen3TTSTalkerForConditionalGeneration", + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="qwen3_tts", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + owns_tokenizer=True, + engine_output_type="latent", + async_chunk_process_next_stage_input_func=(f"{_PROC}.talker2code2wav_async_chunk"), + sampling_constraints={ + "detokenize": False, + "stop_token_ids": [2150], + }, + ), + StagePipelineConfig( + stage_id=1, + model_stage="code2wav", + execution_type=StageExecutionType.LLM_GENERATION, + input_sources=(0,), + final_output=True, + final_output_type="audio", + engine_output_type="audio", + model_arch="Qwen3TTSCode2Wav", + sync_process_input_func=f"{_PROC}.talker2code2wav", + sampling_constraints={"detokenize": True}, + extras={"tts_args": {"max_instructions_length": 500}}, + ), + ), +) + +register_pipeline(QWEN3_TTS_PIPELINE) diff --git a/vllm_omni/model_executor/models/qwen3_tts/pipeline.yaml b/vllm_omni/model_executor/models/qwen3_tts/pipeline.yaml deleted file mode 100644 index ec4c1c98b4..0000000000 --- a/vllm_omni/model_executor/models/qwen3_tts/pipeline.yaml +++ /dev/null @@ -1,92 +0,0 @@ -model_type: qwen3_tts -async_chunk: true - -stages: - - stage_id: 0 - model_stage: qwen3_tts - stage_type: llm - is_comprehension: true - input_sources: [] - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - runtime: - devices: "0" - engine_args: - max_num_seqs: 10 - model_arch: Qwen3TTSTalkerForConditionalGeneration - hf_overrides: - architectures: [Qwen3TTSTalkerForConditionalGeneration] - enforce_eager: false - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.08 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - model_stage: code2wav - stage_type: llm - input_sources: [0] - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - final_output: true - final_output_type: audio - runtime: - devices: "0" - engine_args: - max_num_seqs: 1 - model_arch: Qwen3TTSCode2Wav - hf_overrides: - architectures: [Qwen3TTSCode2Wav] - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.08 - distributed_executor_backend: "mp" - max_num_batched_tokens: 65536 - max_model_len: 65536 - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - 
codec_streaming: true - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - # Match the decoder sliding attention window to avoid chunk-boundary noise. - codec_left_context_frames: 72 - -edges: - - from: 0 - to: 1 diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml deleted file mode 100644 index 9970765ef6..0000000000 --- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# The following config has been verified on 2x H100-80G GPU. -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - max_num_batched_tokens: 32768 - mm_processor_cache_gb: 0 - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - max_num_batched_tokens: 32768 - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - max_num_batched_tokens: 32768 - async_scheduling: false - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git 
a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml deleted file mode 100644 index 6f5cc9959c..0000000000 --- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml +++ /dev/null @@ -1,135 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# The following config has been verified on 1x H100-80G GPU. -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - mm_processor_cache_gb: 0 - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - # Distributed connector configuration (optional) - output_connectors: - to_stage_1: mooncake_connector - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - # Distributed connector configuration (optional) - input_connectors: - from_stage_0: mooncake_connector - output_connectors: - to_stage_2: mooncake_connector - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "2" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.3 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - max_num_batched_tokens: 32768 - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - # Distributed connector configuration (optional) - input_connectors: - from_stage_1: mooncake_connector - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - # Distributed connectors configuration (optional) - # More connectors will be supported in the future. 
- connectors: - # Mooncake connector for cross-node/intra-node communication - mooncake_connector: - name: MooncakeStoreConnector - extra: - host: "127.0.0.1" - metadata_server: "http://10.90.67.86:8080/metadata" - master: "10.90.67.86:50051" - segment: 512000000 # 512MB - localbuf: 64000000 # 64MB - proto: "tcp" - - # Yuanrong connector for cross-node/intra-node communication - yuanrong_connector: - name: YuanrongConnector - extra: - host: "127.0.0.1" - port: "35000" - - # SharedMemory connector for intra-node communication - # Alternative SHM connector with different threshold - shared_memory_connector: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 # 64KB threshold - - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml deleted file mode 100644 index 0ce4f0c94f..0000000000 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. -async_chunk: false -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 1 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 32 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: 
vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 1000000 - hf_config_name: thinker_config - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml deleted file mode 100644 index 38626fc081..0000000000 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes) -# Stage 2: Code2Wav (16-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 1 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk - final_output: true - final_output_type: text - is_comprehension: true - # Use named connector to apply runtime.connectors.extra. 
- output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk - engine_input_source: [0] - # final_output: true - # final_output_type: text - # Distributed connector configuration - input_connectors: - from_stage_0: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 51200 # [TODO] if max_num_batch_tokens < max_num_seqs * 800, there will be precision problem. - hf_config_name: thinker_config - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -runtime: - - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - # Align with Omni: small chunks with sufficient context overlap. - codec_chunk_frames: 25 # code2wav decode chunk size - codec_left_context_frames: 25 # code2wav left context size diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml deleted file mode 100644 index 5cb95d6a35..0000000000 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml +++ /dev/null @@ -1,137 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings -> 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes -> audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. 
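A plausible reading of the `max_num_seqs * 800` floor flagged in the async-chunk config above: each streamed code2wav window covers 25 chunk frames plus 25 left-context frames, flattened across the 16 RVQ codebooks named in the file header. Back-of-envelope check (illustrative arithmetic only, not project code):

```python
# Why the deleted qwen3_omni_moe_async_chunk.yaml pins max_num_batched_tokens to 51200.
codec_chunk_frames = 25          # decode chunk size (from the connector extra above)
codec_left_context_frames = 25   # left-context overlap (ditto)
num_codebooks = 16               # 16-layer RVQ, per the file header

tokens_per_seq_per_window = (codec_chunk_frames + codec_left_context_frames) * num_codebooks
assert tokens_per_seq_per_window == 800  # the constant in the `[TODO]` comment

max_num_seqs = 64                # code2wav stage setting in that config
assert max_num_seqs * tokens_per_seq_per_window == 51_200  # == max_num_batched_tokens
```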
-stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - hf_config_name: thinker_config - tensor_parallel_size: 1 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - # Distributed connector configuration - output_connectors: - to_stage_1: mooncake_connector - - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - # tensor_parallel_size: 2 - enable_prefix_caching: false - distributed_executor_backend: "mp" - hf_config_name: talker_config - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - # Distributed connector configuration - input_connectors: - from_stage_0: mooncake_connector - output_connectors: - to_stage_2: mooncake_connector - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 1000000 - hf_config_name: thinker_config - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - # Distributed connector configuration - input_connectors: - from_stage_1: mooncake_connector - -# Top-level runtime config: default windows and stage edges -runtime: - enabled: true - # Distributed connectors configuration - connectors: - # Mooncake connector for cross-node/intra-node communication - mooncake_connector: - name: MooncakeStoreConnector - extra: - host: "127.0.0.1" - metadata_server: "http://10.90.67.86:8080/metadata" - master: "10.90.67.86:50051" - segment: 512000000 # 512MB - localbuf: 64000000 # 64MB - proto: "tcp" - - # SharedMemory connector for intra-node communication - shared_memory_connector: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 # 64KB 
threshold - - edges: - - from: 0 - to: 1 - - from: 1 - to: 2 diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml deleted file mode 100644 index 80defdfea9..0000000000 --- a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml +++ /dev/null @@ -1,94 +0,0 @@ -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - model_stage: qwen3_tts - max_num_seqs: 10 - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - # Use named connector to apply runtime.connectors.extra. - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - # Must be divisible by num_code_groups and cover (left_context + chunk). - # Prefill length is Q * num_frames (e.g. 16 * 2148 = 34368); keep headroom past 32k. - max_num_batched_tokens: 65536 - # async_chunk appends windows per step; max_model_len must cover accumulated flat codec stream. - max_model_len: 65536 - engine_input_source: [0] - final_output: true - final_output_type: audio - # Distributed connector configuration - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - # Frame-aligned codec streaming transport. - codec_streaming: true - # Connector polling / timeout (unit: loop count, sleep interval in seconds). - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - # Match the decoder sliding attention window to avoid chunk-boundary noise. - codec_chunk_frames: 25 - codec_left_context_frames: 72 - - edges: - - from: 0 - to: 1 diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml deleted file mode 100644 index d7ded0c517..0000000000 --- a/vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml +++ /dev/null @@ -1,95 +0,0 @@ -# Same as qwen3_tts.yaml with batched talker and code2wav. -# Stage 0: max_num_seqs 4, stage 1: max_num_seqs 4. 
-# max_num_seqs must be a power of two to align with CUDA graph capture sizes -# (stage 0) and must match --batch-size in end2end.py / benchmark scripts. -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - model_stage: qwen3_tts - max_num_seqs: 4 - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - # Use named connector to apply runtime.connectors.extra. - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - model_stage: code2wav - max_num_seqs: 4 - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - # Must be divisible by num_code_groups and cover (left_context + chunk). - max_num_batched_tokens: 65536 - # Flat codec prompt can exceed 32k tokens (Q * frames); align with max_tokens below. - max_model_len: 65536 - engine_input_source: [0] - final_output: true - final_output_type: audio - # Distributed connector configuration - input_connectors: - from_stage_0: connector_of_shared_memory - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - # Frame-aligned codec streaming transport. - codec_streaming: true - # Connector polling / timeout (unit: loop count, sleep interval in seconds). - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - # Match the decoder sliding attention window to avoid chunk-boundary noise. 
- codec_chunk_frames: 25 - codec_left_context_frames: 72 - - edges: - - from: 0 - to: 1 diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts_no_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts_no_async_chunk.yaml deleted file mode 100644 index 3f412fc4dc..0000000000 --- a/vllm_omni/model_executor/stage_configs/qwen3_tts_no_async_chunk.yaml +++ /dev/null @@ -1,64 +0,0 @@ -async_chunk: false -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - model_stage: qwen3_tts - max_num_seqs: 1 - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - max_num_batched_tokens: 65536 - max_model_len: 65536 - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav - final_output: true - final_output_type: audio - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 diff --git a/vllm_omni/platforms/npu/stage_configs/qwen2_5_omni.yaml b/vllm_omni/platforms/npu/stage_configs/qwen2_5_omni.yaml deleted file mode 100644 index fe2fa45ed0..0000000000 --- a/vllm_omni/platforms/npu/stage_configs/qwen2_5_omni.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. 
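The same arithmetic applies to the streamed Qwen3-TTS configs above (`codec_chunk_frames: 25`, `codec_left_context_frames: 72`): with the 16-group flat codec layout implied by the `16 * 2148 = 34368` prefill example in the deleted qwen3_tts.yaml, one decode window is a small fraction of the 65,536-token budget. A sketch (illustrative only):

```python
# Per-window token cost for the deleted qwen3_tts streaming configs.
num_code_groups = 16             # Q, per the "16 * 2148 = 34368" prefill example
window_frames = 72 + 25          # codec_left_context_frames + codec_chunk_frames
window_tokens = window_frames * num_code_groups
assert window_tokens == 1_552    # well under max_num_batched_tokens = 65_536
```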
-stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true # haven't supported talker ACL graph on NPU - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "2" # Example: use a different NPU than the previous stage; use "0" if single NPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/vllm_omni/platforms/npu/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/platforms/npu/stage_configs/qwen3_omni_moe.yaml deleted file mode 100644 index 2638c99cd4..0000000000 --- a/vllm_omni/platforms/npu/stage_configs/qwen3_omni_moe.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 5x A2/A3-64G NPUs. 
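The deleted YAMLs state the stage DAG twice: per-stage `engine_input_source` plus a top-level `edges` list. The frozen Python pipelines carry only `input_sources`, from which the same edges follow; a sketch of the assumed equivalence:

```python
# Recover the YAML-style edge list from a frozen pipeline's input_sources.
# Assumed equivalence; `pipeline` would be e.g. QWEN3_OMNI_PIPELINE from this PR.
def edges_of(pipeline):
    return [
        {"from": src, "to": stage.stage_id}
        for stage in pipeline.stages
        for src in stage.input_sources
    ]

# For the 3-stage Qwen3-Omni topology: [{"from": 0, "to": 1}, {"from": 1, "to": 2}],
# i.e. thinker -> talker -> code2wav, matching the `edges:` blocks being deleted here.
```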
-stage_args: - - stage_id: 0 - runtime: - devices: "0,1" - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - hf_config_name: thinker_config - tensor_parallel_size: 2 - # profiler_config: - # profiler: torch - # torch_profiler_dir: ./perf - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - runtime: - devices: "2" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: true # haven't supported talker ACL graph on NPU - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - # tensor_parallel_size: 2 - enable_prefix_caching: false - distributed_executor_backend: "mp" - hf_config_name: talker_config - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - runtime: - devices: "2" - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 1000000 - hf_config_name: thinker_config - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/vllm_omni/platforms/npu/stage_configs/qwen3_omni_moe_async_chunk.yaml b/vllm_omni/platforms/npu/stage_configs/qwen3_omni_moe_async_chunk.yaml deleted file mode 100644 index 9aa20baecf..0000000000 --- a/vllm_omni/platforms/npu/stage_configs/qwen3_omni_moe_async_chunk.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes) -# Stage 2: Code2Wav (16-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. 
-async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "0,1" - engine_args: - max_num_seqs: 10 - model_stage: thinker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 2 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "2" - engine_args: - max_num_seqs: 10 - model_stage: talker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk - engine_input_source: [0] - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.0 - stop_token_ids: [2150] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "2" - engine_args: - max_num_seqs: 10 - model_stage: code2wav - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 51200 # [TODO] if max_num_batched_tokens < max_num_seqs * 800, there will be precision problem. 
- hf_config_name: thinker_config - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml b/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml deleted file mode 100644 index 586937d676..0000000000 --- a/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml +++ /dev/null @@ -1,91 +0,0 @@ -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - model_stage: qwen3_tts - max_num_seqs: 1 - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - # Use named connector to apply runtime.connectors.extra. - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - max_num_batched_tokens: 65536 - max_model_len: 65536 - engine_input_source: [0] - final_output: true - final_output_type: audio - # Distributed connector configuration - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - # Frame-aligned codec streaming transport. - codec_streaming: true - # Connector polling / timeout (unit: loop count, sleep interval in seconds). - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - # Align with Omni: small chunks with sufficient context overlap. - codec_chunk_frames: 25 - codec_left_context_frames: 72 - - edges: - - from: 0 - to: 1 diff --git a/vllm_omni/platforms/rocm/stage_configs/qwen2_5_omni.yaml b/vllm_omni/platforms/rocm/stage_configs/qwen2_5_omni.yaml deleted file mode 100644 index c0860eac7f..0000000000 --- a/vllm_omni/platforms/rocm/stage_configs/qwen2_5_omni.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# The following config has been verified on 2x H100-80G GPU. 
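Per the "(unit: loop count, sleep interval in seconds)" comments in the qwen3_tts connector blocks above, the effective waits are loop counts multiplied by the sleep interval; spelled out (illustrative arithmetic):

```python
# Effective SharedMemoryConnector timeouts implied by the deleted qwen3_tts configs.
connector_get_sleep_s = 0.01               # sleep between polls
connector_get_max_wait_first_chunk = 3000  # polls allowed before the first chunk
connector_get_max_wait = 300               # polls allowed between later chunks

first_chunk_timeout_s = connector_get_max_wait_first_chunk * connector_get_sleep_s
steady_state_timeout_s = connector_get_max_wait * connector_get_sleep_s
print(f"first chunk: ~{first_chunk_timeout_s:.0f} s, later chunks: ~{steady_state_timeout_s:.0f} s")
# -> first chunk: ~30 s, later chunks: ~3 s
```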
-stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - max_num_batched_tokens: 32768 - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - - stage_id: 1 - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - max_num_batched_tokens: 32768 - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - - stage_id: 2 - runtime: - process: true - devices: "2" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - max_num_batched_tokens: 32768 - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml deleted file mode 100644 index 0ca150bee6..0000000000 --- a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml +++ /dev/null @@ -1,97 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. 
-stage_args: - - stage_id: 0 - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 1 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - runtime: - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - # tensor_parallel_size: 2 - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - runtime: - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 1000000 - hf_config_name: thinker_config - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/vllm_omni/platforms/xpu/stage_configs/qwen2_5_omni.yaml b/vllm_omni/platforms/xpu/stage_configs/qwen2_5_omni.yaml deleted file mode 100644 index 56306e364a..0000000000 --- a/vllm_omni/platforms/xpu/stage_configs/qwen2_5_omni.yaml +++ /dev/null @@ -1,95 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# The following config is verified with 2 * Intel Arc Pro B60 XPU. 
-stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 # thinker weight is around 16.74GB for Qwen2.5-Omni-7B - enforce_eager: false - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.5 # talker weight is 6.03GB for Qwen2.5-Omni-7B - enforce_eager: false - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.3 # code2wav weight is around 1.46GB for Qwen2.5-Omni-7B - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 diff --git a/vllm_omni/platforms/xpu/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/platforms/xpu/stage_configs/qwen3_omni_moe.yaml deleted file mode 100644 index 49914bebc4..0000000000 --- a/vllm_omni/platforms/xpu/stage_configs/qwen3_omni_moe.yaml +++ /dev/null @@ -1,102 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config is verified with 8 * Intel Arc Pro B60 XPU. 
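The XPU qwen2.5-omni config above sizes each stage's `gpu_memory_utilization` against the stage weight footprints noted in its comments. A rough budget check, assuming 24 GB of device memory per Intel Arc Pro B60 (the card's capacity is an assumption here, not stated in the YAML):

```python
# Rough per-stage memory budgets for the deleted XPU qwen2.5-omni config.
VRAM_GB = 24.0  # ASSUMPTION: Arc Pro B60 capacity; not stated in the config.
stages = {
    "thinker":  (0.9, 16.74),  # (gpu_memory_utilization, approx. weights in GB)
    "talker":   (0.5, 6.03),
    "code2wav": (0.3, 1.46),
}
for name, (util, weights_gb) in stages.items():
    budget_gb = util * VRAM_GB
    print(f"{name}: budget {budget_gb:.1f} GB, headroom {budget_gb - weights_gb:.1f} GB")
```

Talker and code2wav share device "1" in that config, so their utilizations sum to 0.8, while the thinker takes 0.9 of device "0".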
-stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "0,1,2,3" - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 # thinker weight is around 61.08GB for Qwen3-Omni-30B-A3B-Instruct - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 4 - max_cudagraph_capture_size: 0 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "4" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 # talker weight is around 8.5GB for Qwen3-Omni-30B-A3B-Instruct - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - max_cudagraph_capture_size: 0 - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "4" - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.3 # code2wav weight is around 0.4GB for Qwen3-Omni-30B-A3B-Instruct - distributed_executor_backend: "mp" - max_num_batched_tokens: 1000000 - hf_config_name: thinker_config - max_cudagraph_capture_size: 0 - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1
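Net effect of this change: the per-platform stage YAMLs deleted above collapse into frozen Python `PipelineConfig`s (topology, stage hooks, sampling constraints), with deployment-tunable knobs such as `max_num_seqs` and `gpu_memory_utilization` supplied at deploy time instead of being baked into each config file. A minimal sketch of inspecting a registered pipeline; the import is real per this diff, but the `describe` helper is an invented illustration, not project API:

```python
# Summarize a frozen pipeline topology (illustrative helper, not project API).
from vllm_omni.model_executor.models.qwen3_tts.pipeline import QWEN3_TTS_PIPELINE

def describe(pipeline) -> str:
    return " -> ".join(
        f"{s.model_stage}({s.engine_output_type})" for s in pipeline.stages
    )

print(describe(QWEN3_TTS_PIPELINE))  # qwen3_tts(latent) -> code2wav(audio)
```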