From f97afaf6db3ea6f19f61b2b194c019cbaf46456e Mon Sep 17 00:00:00 2001 From: princepride Date: Mon, 20 Apr 2026 07:49:55 +0000 Subject: [PATCH 01/13] Remove Bagel yaml and update examples Signed-off-by: princepride --- .../examples/offline_inference/bagel.md | 348 +++++++++--------- .../examples/online_serving/bagel.md | 299 +++++++-------- examples/offline_inference/bagel/README.md | 347 ++++++++--------- examples/offline_inference/bagel/end2end.py | 8 +- examples/online_serving/bagel/README.md | 298 +++++++-------- .../bagel/run_server_stage_cli.sh | 2 +- .../stage_configs/bagel_mooncake_ci.yaml | 79 ---- .../stage_configs/bagel_sharedmemory_ci.yaml | 76 ---- .../offline_inference/test_bagel_img2img.py | 18 +- .../e2e/offline_inference/test_bagel_lora.py | 17 +- .../offline_inference/test_bagel_text2img.py | 28 +- .../test_bagel_understanding.py | 17 +- .../test_quantization_fp8.py | 4 +- tests/e2e/online_serving/test_bagel_online.py | 6 +- tests/helpers/stage_config.py | 55 +++ vllm_omni/config/pipeline_registry.py | 8 + vllm_omni/deploy/bagel.yaml | 51 +++ vllm_omni/deploy/bagel_single_stage.yaml | 22 ++ .../model_executor/models/bagel/pipeline.py | 73 ++++ .../model_executor/stage_configs/bagel.yaml | 102 ----- .../stage_configs/bagel_multiconnector.yaml | 102 ----- .../stage_configs/bagel_single_stage.yaml | 24 -- .../stage_configs/bagel_think.yaml | 77 ---- .../stage_configs/bagel_usp2.yaml | 72 ---- .../platforms/xpu/stage_configs/bagel.yaml | 81 ---- 25 files changed, 867 insertions(+), 1347 deletions(-) delete mode 100644 tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml delete mode 100644 tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml create mode 100644 vllm_omni/deploy/bagel.yaml create mode 100644 vllm_omni/deploy/bagel_single_stage.yaml create mode 100644 vllm_omni/model_executor/models/bagel/pipeline.py delete mode 100644 vllm_omni/model_executor/stage_configs/bagel.yaml delete mode 100644 
vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/bagel_think.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/bagel_usp2.yaml delete mode 100644 vllm_omni/platforms/xpu/stage_configs/bagel.yaml diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md index 1fb4d404578..a1b41691916 100644 --- a/docs/user_guide/examples/offline_inference/bagel.md +++ b/docs/user_guide/examples/offline_inference/bagel.md @@ -2,46 +2,61 @@ Source . - -## Set up +## Setup Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. -## Run examples +## Architecture + +BAGEL-7B-MoT is a Mixture-of-Transformers (MoT) model supporting both image generation and understanding. It offers two deployment topologies: -**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices. +| Topology | Stages | Description | +| :------- | :----- | :---------- | +| **Two-stage** (default) | Stage 0 (Thinker, AR) + Stage 1 (DiT, Diffusion) | Thinker handles text/understanding via vLLM AR engine; DiT handles image generation. KV cache is transferred between stages. | +| **Single-stage** | Stage 0 (DiT, Diffusion) only | The DiT stage contains a full LLM, ViT, VAE, and tokenizer internally. All modalities are handled within a single diffusion process. | -Get into the bagel folder +Both topologies support all four modalities: `text2img`, `img2img`, `img2text`, `text2text`. 
+ +## Quick Start ```bash cd examples/offline_inference/bagel + +# Default two-stage mode (auto-detected) +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A cute cat" + +# Single-stage mode +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A cute cat" \ + --stage-configs-path vllm_omni/deploy/bagel_single_stage.yaml ``` -### Modality Control +> **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. -BAGEL-7B-MoT supports multiple modality modes. You can control the mode using the `--modality` argument: +## Modality Control -#### Text to Image (text2img) +Control the mode using the `--modality` argument: -- **Pipeline**: Text → Thinker → DiT → VAE Decode → Image -- **Stages Used**: Stage 0 (Thinker) + Stage 1 (DiT) -- **KV Transfer**: Thinker sends KV cache to DiT for conditioned generation +| Modality | Input | Output | Description | +| :------- | :---- | :----- | :---------- | +| `text2img` | Text | Image | Generate images from text prompts | +| `img2img` | Image + Text | Image | Transform images using text guidance | +| `img2text` | Image + Text | Text | Generate text descriptions from images | +| `text2text` | Text | Text | Pure text generation (language model mode) | -Generate images from text prompts: +### Text to Image (text2img) ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ - --prompts "A cute cat" + --prompts "A cute cat" \ + --steps 50 ``` -#### Image to Image (img2img) - -- **Pipeline**: Image → VAE Encode → DiT → VAE Decode → New Image -- **Stages Used**: Stage 1 (DiT) only -- **Special**: Bypasses the Thinker stage, direct image-to-image transformation - -Transform images based on text prompts: +### Image to Image (img2img) ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ @@ -50,13 +65,7 @@ python 
end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --prompts "Let the woman wear a blue dress" ``` -#### Image to Text (img2text) - -- **Pipeline**: Image → ViT + VAE Encode → Thinker → Text Output -- **Stages Used**: Stage 0 (Thinker) only -- **Special**: Uses both VAE latent encoding AND ViT semantic encoding for comprehensive image understanding - -Generate text descriptions from images: +### Image to Text (img2text) ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ @@ -65,196 +74,201 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --prompts "Describe this image in detail" ``` -#### Text to Text (text2text) - -- **Pipeline**: Text → Thinker → Text Output -- **Stages Used**: Stage 0 (Thinker) only -- **Special**: No visual components involved, operates as pure language model - -Pure text generation: +### Text to Text (text2text) ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2text \ --prompts "What is the capital of France?" -# You can load prompts from a text file (one prompt per line): +# Load prompts from a text file (one prompt per line): python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2text \ --txt-prompts /path/to/prompts.txt ``` -### Inference Steps +## Think Mode -Control the number of inference steps for image generation: +Think mode enables the model to generate `<think>...</think>` planning/reasoning tokens before producing the final output. This improves generation quality for complex prompts. + +- **Two-stage**: The Thinker (AR) stage decodes think tokens, then transfers the augmented KV cache to the DiT stage for image generation. +- **Single-stage**: The DiT's internal LLM generates think tokens in-place before proceeding to denoise. 
```bash -# You can adjust steps to 100 to improve image quality +# Think + text2img: plan before generating python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ - --steps 50 \ - --prompts "A cute cat" -``` + --prompts "A futuristic city with flying cars" \ + --think \ + --max-think-tokens 1000 + +# Think + img2img: reason about the edit +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality img2img \ + --image-path /path/to/image.jpg \ + --prompts "Make it look like a watercolor painting" \ + --think + +# Think + img2text: reason before describing +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality img2text \ + --image-path /path/to/image.jpg \ + --prompts "What is happening in this image?" \ + --think -### Key arguments - -BAGEL-7B-MoT supports **multiple modality modes** for different use cases. - -The default yaml configuration deploys Thinker and DiT on the same GPU. You can use the default configuration file: [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml) - -#### 📌 Command Line Arguments (end2end.py) - -| Argument | Type | Default | Description | -| :--------------------- | :----- | :---------------------------- | :----------------------------------------------------------- | -| `--model` | string | `ByteDance-Seed/BAGEL-7B-MoT` | Model path or name | -| `--modality` | choice | `text2img` | Modality mode: `text2img`, `img2img`, `img2text`, `text2text` | -| `--prompts` | list | `None` | Input text prompts directly | -| `--txt-prompts` | string | `None` | Path to txt file with one prompt per line | -| `--image-path` | string | `None` | Input image path (for `img2img`/`img2text`) | -| `--steps` | int | `50` | Number of inference steps | -| `--stage-configs-path` | string | `None` | Custom stage config file path | -| `--worker-backend` | choice | `process` | Worker backend: `process` or `ray` | -| `--ray-address` | string | `None` | Ray 
cluster address | -| `--enable-stats` | flag | `False` | Enable statistics logging | -| `--init-sleep-seconds` | int | `20` | Initialization sleep time | -| `--batch-timeout` | int | `5` | Batch timeout | -| `--init-timeout` | int | `300` | Initialization timeout | - ------- - -#### ⚙️ Stage Configuration Parameters (bagel.yaml) - - **Stage 0 - Thinker (LLM Stage)** - -| Parameter | Value | Description | -| :------------------------------- | :------------------------------ | :----------------------- | -| `stage_type` | `llm` | Stage type | -| `devices` | `"0"` | GPU device ID | -| `max_num_seqs` | `1` | Maximum batch size | -| `model_stage` | `thinker` | Model stage identifier | -| `model_arch` | `BagelForConditionalGeneration` | Model architecture | -| `gpu_memory_utilization` | `0.4` | GPU memory utilization | -| `tensor_parallel_size` | `1` | Tensor parallel size | -| `max_num_batched_tokens` | `32768` | Maximum batched tokens | -| `omni_kv_config.need_send_cache` | `true` | Whether to send KV cache | - ------- - -**Stage 1 - DiT (Diffusion Stage)** - -| Parameter | Value | Description | -| :------------------------------- | :---------- | :-------------------------- | -| `stage_type` | `diffusion` | Stage type | -| `devices` | `"0"` | GPU device ID | -| `max_num_seqs` | `1` | Maximum batch size | -| `model_stage` | `dit` | Model stage identifier | -| `gpu_memory_utilization` | `0.4` | GPU memory utilization | -| `omni_kv_config.need_recv_cache` | `true` | Whether to receive KV cache | -| `engine_input_source` | `[0]` | Input source from Stage 0 | - ------- - -#### Tensor Parallelism (TP) - -For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) by modifying the stage configuration (e.g., [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)). - -1. **Set `tensor_parallel_size`**: Increase this value (e.g., to `2` or `4`). -2. 
**Set `devices`**: Specify the comma-separated GPU IDs to be used for the stage (e.g., `"0,1"`). - -Example configuration for TP=2 on GPUs 0 and 1: -```yaml - engine_args: - tensor_parallel_size: 2 - ... - runtime: - devices: "0,1" +# Think + text2text: chain-of-thought reasoning +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2text \ + --prompts "Solve: 23 * 47" \ + --think ``` ------- +Think mode parameters: -#### 🔗 Runtime Configuration +| Argument | Default | Description | +| :------- | :------ | :---------- | +| `--think` | `False` | Enable thinking mode | +| `--max-think-tokens` | `1000` | Maximum tokens for think generation | +| `--do-sample` | `False` | Enable sampling (vs. greedy) for text generation | +| `--text-temperature` | `0.3` | Temperature for text generation sampling | -| Parameter | Value | Description | -| :-------------------- | :------ | :------------------------------- | -| `shm_threshold_bytes` | `65536` | Shared memory threshold (64KB) | +## Classifier-Free Guidance (CFG) -## Using Mooncake Connector +CFG controls the trade-off between prompt fidelity and diversity. These parameters apply to image generation modalities (`text2img`, `img2img`). + +```bash +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A photorealistic portrait" \ + --cfg-text-scale 6.0 \ + --cfg-img-scale 2.0 \ + --negative-prompt "blurry, low quality, distorted" \ + --cfg-interval 0.4 1.0 \ + --cfg-renorm-type global \ + --cfg-renorm-min 0.0 +``` -[Mooncake](https://github.com/kvcache-ai/Mooncake) is a high-performance distributed KV cache transfer engine that enables efficient cross-node data movement via TCP or RDMA, making it ideal for multi-node disaggregated inference. 
+| Argument | Default | Description | +| :------- | :------ | :---------- | +| `--cfg-text-scale` | `4.0` | Text CFG scale (higher = more prompt-adherent) | +| `--cfg-img-scale` | `1.5` | Image CFG scale (for img2img) | +| `--negative-prompt` | `None` | Negative prompt for CFG conditioning | +| `--cfg-interval` | pipeline default | CFG active interval `[start, end]` as fractions of total timesteps | +| `--cfg-renorm-type` | `None` | Renormalization type: `global`, `text_channel`, `channel` | +| `--cfg-renorm-min` | `None` | Minimum renormalization value | +| `--cfg-parallel-size` | `1` | CFG parallel size: `1` = batched (single GPU), `2` = 2-branch parallel, `3` = full 3-GPU parallel | -By default, BAGEL uses `SharedMemoryConnector` for inter-stage communication. You can switch to the Mooncake connector for better performance on multi-GPU setups and to enable multi-node deployment. +## Deployment Topologies -### Prerequisites +### Two-Stage (Default) -Install the Mooncake transfer engine: +The default topology auto-detected from the model. No extra flags needed. ```bash -# For CUDA-enabled systems (recommended) -pip install mooncake-transfer-engine - -# For non-CUDA systems -pip install mooncake-transfer-engine-non-cuda +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A cute cat" ``` -### Step 1: Start the Mooncake Master +The pipeline is defined in [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml). Stage 0 (Thinker) and Stage 1 (DiT) share GPU 0 by default. For dual-GPU setups, customize the deploy YAML and set `devices: "1"` for stage 1. + +### Single-Stage -On the **primary node**, start the Mooncake master service (run in a separate terminal or background with `&`): +Pass the single-stage deploy config via `--stage-configs-path`: ```bash -# Optional: enable disk-backed storage by creating a directory and passing --root_fs_dir. 
-# Without it, Mooncake runs in memory-only mode, which is sufficient for KV cache transfer. -mkdir -p ./mc_storage - -mooncake_master \ - --rpc_port=50051 \ - --enable_http_metadata_server=true \ - --http_metadata_server_host=0.0.0.0 \ - --http_metadata_server_port=8080 \ - --metrics_port=9003 \ - --root_fs_dir=./mc_storage/ \ - --cluster_id=mc-local-1 & +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A cute cat" \ + --stage-configs-path vllm_omni/deploy/bagel_single_stage.yaml ``` -### Step 2: Run Offline Inference with Mooncake +See [`bagel_single_stage.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel_single_stage.yaml) for configuration details. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. -Use the provided Mooncake stage config [`bagel_multiconnector.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml). Before launching, update the `metadata_server` and `master` addresses in the YAML to match your Mooncake master node's IP (use `127.0.0.1` for single-node testing). 
+### Tensor Parallelism (TP) -```bash -cd examples/offline_inference/bagel +For larger models or multi-GPU environments: -# Text to Image with Mooncake +```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml + --tensor-parallel-size 2 +``` -# Image to Text with Mooncake -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality img2text \ - --image-path /path/to/image.jpg \ - --prompts "Describe this image" \ - --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml +Or customize the deploy YAML (see [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml)) with per-stage `tensor_parallel_size`. + +### FP8 Quantization -# Text to Text with Mooncake +```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2text \ - --prompts "What is the capital of France?" \ - --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml + --modality text2img \ + --prompts "A cute cat" \ + --quantization fp8 ``` -For more details on the Mooncake connector and multi-node setup, see the [Mooncake Store Connector documentation](https://github.com/vllm-project/vllm-omni/tree/main/docs/design/feature/omni_connectors/mooncake_store_connector.md). 
- ------ +## Command Line Reference + +### Core Arguments + +| Argument | Type | Default | Description | +| :------- | :--- | :------ | :---------- | +| `--model` | string | `ByteDance-Seed/BAGEL-7B-MoT` | Model path or HuggingFace name | +| `--modality` | choice | `text2img` | `text2img`, `img2img`, `img2text`, `text2text` | +| `--prompts` | list | `None` | Input text prompts | +| `--txt-prompts` | string | `None` | Path to text file with one prompt per line | +| `--image-path` | string | `None` | Input image path (required for `img2img`/`img2text`) | +| `--output` | string | `.` | Output directory for saved images | +| `--steps` | int | `50` | Number of diffusion inference steps | +| `--seed` | int | `None` | Random seed for reproducibility | + +### Think Mode Arguments + +| Argument | Type | Default | Description | +| :------- | :--- | :------ | :---------- | +| `--think` | flag | `False` | Enable `<think>...</think>` planning/reasoning | +| `--max-think-tokens` | int | `1000` | Maximum tokens for think generation | +| `--do-sample` | flag | `False` | Use sampling instead of greedy decoding | +| `--text-temperature` | float | `0.3` | Sampling temperature for text generation | + +### CFG Arguments + +| Argument | Type | Default | Description | +| :------- | :--- | :------ | :---------- | +| `--cfg-text-scale` | float | `4.0` | Text CFG guidance scale | +| `--cfg-img-scale` | float | `1.5` | Image CFG guidance scale | +| `--negative-prompt` | string | `None` | Negative prompt for CFG | +| `--cfg-parallel-size` | int | `1` | CFG parallel GPU count (1, 2, or 3) | +| `--cfg-interval` | float[2] | pipeline default | CFG active window `[start, end]` | +| `--cfg-renorm-type` | string | `None` | `global`, `text_channel`, or `channel` | +| `--cfg-renorm-min` | float | `None` | Minimum renormalization value | + +### Engine Arguments + +| Argument | Type | Default | Description | +| :------- | :--- | :------ | :---------- | +| `--stage-configs-path` | string | `None` | Path to deploy 
YAML (auto-detected if omitted) | +| `--worker-backend` | choice | `process` | `process` or `ray` | +| `--ray-address` | string | `None` | Ray cluster address | +| `--quantization` | string | `None` | Quantization method (e.g. `fp8`) | +| `--log-stats` | flag | `False` | Enable statistics logging | +| `--init-timeout` | int | `300` | Initialization timeout (seconds) | +| `--batch-timeout` | int | `5` | Batch timeout (seconds) | +| `--enable-diffusion-pipeline-profiler` | flag | `False` | Profile diffusion stage durations | ## FAQ -- If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. +- If you encounter OOM errors, try decreasing `max_model_len` or `gpu_memory_utilization` in the deploy YAML. + +**Two-stage VRAM usage:** + +| Stage | VRAM | +| :---- | :--- | +| Stage 0 (Thinker) | **15.04 GiB + KV Cache** | +| Stage 1 (DiT) | **26.50 GiB** | +| Total | **~42 GiB + KV Cache** | -| Stage | VRAM | -| :------------------ | :--------------------------- | -| Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** | -| Stage-1 (DiT) | **26.50 GiB** | -| Total | **~42 GiB + KV Cache** | +**Single-stage VRAM usage:** The DiT loads the full model (~42 GiB) in one process. ## Example materials diff --git a/docs/user_guide/examples/online_serving/bagel.md b/docs/user_guide/examples/online_serving/bagel.md index 9de31926aa1..78aa4f21bec 100644 --- a/docs/user_guide/examples/online_serving/bagel.md +++ b/docs/user_guide/examples/online_serving/bagel.md @@ -2,147 +2,107 @@ Source . - -## 🛠️ Installation +## Installation Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md) -## Run examples (BAGEL-7B-MoT) +## Architecture + +BAGEL-7B-MoT is a Mixture-of-Transformers (MoT) model supporting both image generation and understanding. 
It offers two deployment topologies: + +| Topology | Stages | Description | +| :------- | :----- | :---------- | +| **Two-stage** (default) | Stage 0 (Thinker, AR) + Stage 1 (DiT, Diffusion) | Thinker handles text/understanding via vLLM AR engine; DiT handles image generation. KV cache is transferred between stages. | +| **Single-stage** | Stage 0 (DiT, Diffusion) only | The DiT stage contains a full LLM, ViT, VAE, and tokenizer internally. All modalities are handled within a single diffusion process. | + +Both topologies support all four modalities: `text2img`, `img2img`, `img2text`, `text2text`. + +> **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. + +## Launch the Server -**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices. +### Two-Stage (Default) -### Launch the Server +The default pipeline is auto-detected from the model. No extra flags needed: ```bash -# Use default configuration vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 ``` Or use the convenience script: ```bash -cd /workspace/vllm-omni/examples/online_serving/bagel +cd examples/online_serving/bagel bash run_server.sh ``` -```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/stage_configs_file -``` - -#### 🚀 Tensor Parallelism (TP) - -For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) for the server. +To use a custom deploy YAML: -1. **Modify Stage Config**: Create or modify a stage configuration yaml (e.g., [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml)). 
Set `tensor_parallel_size` to `2` (or more) and update `devices` to include multiple GPU IDs (e.g., `"0,1"`). - -```yaml - engine_args: - tensor_parallel_size: 2 - ... - runtime: - devices: "0,1" -``` - -2. **Launch Server**: ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/your/custom_bagel.yaml +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ + --deploy-config /path/to/deploy_config.yaml ``` -#### Using Mooncake Connector +See [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml) for the default two-stage deploy configuration. -By default, BAGEL uses `SharedMemoryConnector` for inter-stage communication. You can use the [Mooncake](https://github.com/kvcache-ai/Mooncake) connector to transfer KV cache between stages, which also enables multi-node deployment. +### Single-Stage -**1. Install Mooncake** +The DiT stage contains a full LLM, ViT, VAE, and tokenizer, so it can handle all modalities (text2img, img2img, img2text, text2text, think) without a separate Thinker stage: ```bash -# For CUDA-enabled systems (recommended) -pip install mooncake-transfer-engine - -# For non-CUDA systems -pip install mooncake-transfer-engine-non-cuda +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ + --deploy-config vllm_omni/deploy/bagel_single_stage.yaml ``` -**2. Start Mooncake Master** on the primary node: +See [`bagel_single_stage.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel_single_stage.yaml) for configuration. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. -```bash -# Optional: enable disk-backed storage by creating a directory and passing --root_fs_dir. -# Without it, Mooncake runs in memory-only mode, which is sufficient for KV cache transfer. 
-mkdir -p ./mc_storage - -mooncake_master \ - --rpc_port=50051 \ - --enable_http_metadata_server=true \ - --http_metadata_server_host=0.0.0.0 \ - --http_metadata_server_port=8080 \ - --metrics_port=9003 \ - --root_fs_dir=./mc_storage/ \ - --cluster_id=mc-local-1 & -``` +### Tensor Parallelism (TP) -**3. Launch the server** with the Mooncake stage config: +For larger models or multi-GPU environments, enable TP via CLI: ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ - --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --tensor-parallel-size 2 ``` -> **Note**: Before launching, edit [`bagel_multiconnector.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml) and replace the `metadata_server` and `master` addresses with your Mooncake master node's actual IP. For single-node testing, `127.0.0.1` works. - -The client-side usage is identical to the default setup -- the Mooncake connector is transparent to the API. See the requests section below. - -For more details on the Mooncake connector configuration, see the [Mooncake Store Connector documentation](https://github.com/vllm-project/vllm-omni/tree/main/docs/design/feature/omni_connectors/mooncake_store_connector.md). +Or set `tensor_parallel_size` per stage in a custom deploy YAML. -#### Multi-Node Deployment +### Multi-Node Deployment -You can deploy each stage on a **separate node** for better resource utilization. In this example, the orchestrator (Stage 0 / Thinker) and Stage 1 (DiT) run on different machines, connected via Mooncake. +Deploy each stage on a **separate node** for better resource utilization. Replace `<ORCHESTRATOR_IP>` with the actual IP address of your orchestrator node. -Replace `<ORCHESTRATOR_IP>` below with the actual IP address of your orchestrator node (e.g., `10.244.227.244`). 
- -> [!WARNING] -> **Before launching**, edit [`bagel_multiconnector.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml) and replace the `metadata_server` and `master` addresses with your Mooncake master node's actual IP. Mismatched addresses will cause silent connection failures. - -**1. Start Mooncake Master** (on the orchestrator node): - -```bash -mooncake_master \ - --rpc_port=50051 \ - --enable_http_metadata_server=true \ - --http_metadata_server_host=<ORCHESTRATOR_IP> \ - --http_metadata_server_port=8080 \ - --metrics_port=9003 -``` - -**2. Launch Stage 0 (Thinker / Orchestrator)** on the orchestrator node: +**1. Launch Stage 0 (Thinker / Orchestrator)** on the orchestrator node: ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ - --port 8000 \ # API server port for client requests - --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml \ + --port 8000 \ --stage-id 0 \ -oma <ORCHESTRATOR_IP> \ -omp 8091 ``` -**3. Launch Stage 1 (DiT)** on the remote node in headless mode: +**2. 
Launch Stage 1 (DiT)** on the remote node in headless mode: ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ - --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml \ --stage-id 1 \ --headless \ -oma <ORCHESTRATOR_IP> \ -omp 8091 ``` -**Mooncake Master arguments:** +Or use the convenience script: -| Argument | Description | -| :------- | :---------- | -| `--rpc_port` | Mooncake RPC port for control-plane coordination between stages | -| `--enable_http_metadata_server` | Enable the HTTP metadata server for service discovery | -| `--http_metadata_server_host` | IP address to bind the metadata server (use the orchestrator node's IP) | -| `--http_metadata_server_port` | Port for the HTTP metadata server | -| `--metrics_port` | Port for Prometheus-compatible metrics endpoint | +```bash +# Terminal 1: Stage 0 +bash run_server_stage_cli.sh --stage 0 + +# Terminal 2: Stage 1 +bash run_server_stage_cli.sh --stage 1 + +# With extra args +bash run_server_stage_cli.sh --stage 0 -- --tensor-parallel-size 2 +bash run_server_stage_cli.sh --stage 1 -- --gpu-memory-utilization 0.9 +``` **vllm serve arguments:** @@ -150,85 +110,31 @@ vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ | :------- | :---------- | | `--stage-id` | Which stage this process runs (0 = Thinker, 1 = DiT) | | `--headless` | Run without the API server (worker-only mode) | -| `-oma` | Orchestrator master address | -| `-omp` | Orchestrator master port for Stage 1 to connect to Stage 0 for task coordination | +| `-oma` / `--omni-master-address` | Orchestrator master address | +| `-omp` / `--omni-master-port` | Orchestrator master port | > [!IMPORTANT] > **Startup Order**: Stage 0 (orchestrator) must be launched **before** Stage 1 (headless). > Stage 0 will appear to hang on startup until Stage 1 (worker) connects — this is expected behavior. -**Network Requirements** - -All nodes must have network connectivity to each other. 
Ensure the following ports are open **between all participating nodes**: +### Inter-Stage Connectors -| Port | Protocol | Service | Direction | -| :--- | :------- | :------ | :-------- | -| 50051 | TCP | Mooncake Master RPC | Worker → Orchestrator | -| 8080 | TCP | Mooncake HTTP Metadata Server | Worker → Orchestrator | -| 8091 | TCP | Orchestrator Master (`-omp`) | Worker → Orchestrator | -| 8000 | TCP | API Server (`--port`) | Client → Orchestrator | -| 9003 | TCP | Metrics (optional) | Monitoring → Orchestrator | +When deploying stages across nodes, configure the connector type in the deploy YAML: -> **Tip**: If nodes are behind a firewall or in different VPCs/security groups, make sure the above ports are allowed in ingress/egress rules. All nodes should be reachable via their IP addresses (no NAT). Using nodes on the same subnet or VPC is recommended to minimize latency for Mooncake KV cache transfers. +- **SharedMemoryConnector** (default): Used for single-node deployments. No explicit configuration needed. +- **MooncakeTransferEngineConnector**: For multi-node setups with RDMA hardware. Defined in [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml) under `connectors.rdma_connector`. -### Send Multi-modal Request +To use Mooncake, create a custom deploy YAML that binds `output_connectors` / `input_connectors` on each stage to the `rdma_connector` defined in the `connectors` section. 
-Get into the bagel folder: +## Send Requests ```bash cd examples/online_serving/bagel ``` -Send request via Python - -```bash -python openai_chat_client.py --prompt "A cute cat" --modality text2img -``` - -The Python client supports the following command-line arguments: - -- `--prompt` (or `-p`): Text prompt for generation (default: `A cute cat`) -- `--output` (or `-o`): Output file path for image results (default: `bagel_output.png`) -- `--server` (or `-s`): Server URL (default: `http://localhost:8091`) -- `--image-url` (or `-i`): Input image URL or local file path (for img2img/img2text modes) -- `--modality` (or `-m`): Task modality (default: `text2img`). Options: `text2img`, `img2img`, `img2text`, `text2text` -- `--height`: Image height in pixels (default: 512) -- `--width`: Image width in pixels (default: 512) -- `--steps`: Number of inference steps (default: 25) -- `--seed`: Random seed (default: 42) -- `--negative`: Negative prompt for image generation - -Example with custom parameters: - -```bash -python openai_chat_client.py \ - --prompt "A futuristic city" \ - --modality text2img \ - --height 768 \ - --width 768 \ - --steps 50 \ - --seed 42 \ - --negative "blurry, low quality" -``` - -## Modality Control - -BAGEL-7B-MoT supports **multiple modality modes** for different use cases. - -The default yaml configuration deploys Thinker and DiT on the same GPU. 
You can use the default configuration file: [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/model_executor/stage_configs/bagel.yaml) - -| Modality | Input | Output | Description | -| ----------- | ------------ | ------ | -------------------------------------- | -| `text2img` | Text | Image | Generate images from text prompts | -| `img2img` | Image + Text | Image | Transform images using text guidance | -| `img2text` | Image + Text | Text | Generate text descriptions from images | -| `text2text` | Text | Text | Pure text generation | - ### Text to Image (text2img) -Generate images from text prompts: - -**Using Python client** +**Python client:** ```bash python openai_chat_client.py \ @@ -238,7 +144,7 @@ python openai_chat_client.py \ --steps 50 ``` -**Using curl** +**curl:** ```bash curl http://localhost:8091/v1/chat/completions \ @@ -253,12 +159,9 @@ curl http://localhost:8091/v1/chat/completions \ }' ``` - ### Image to Image (img2img) -Transform images based on text prompts: - -**Using Python client** +**Python client:** ```bash python openai_chat_client.py \ @@ -268,7 +171,7 @@ python openai_chat_client.py \ --output transformed.png ``` -**Using curl** +**curl:** ```bash IMAGE_BASE64=$(base64 -w 0 cat.jpg) @@ -293,14 +196,11 @@ EOF curl http://localhost:8091/v1/chat/completions \ -H "Content-Type: application/json" \ -d @payload.json - ``` ### Image to Text (img2text) -Generate text descriptions from images: - -**Using Python client** +**Python client:** ```bash python openai_chat_client.py \ @@ -309,7 +209,7 @@ python openai_chat_client.py \ --image-url /path/to/image.jpg ``` -**Using curl** +**curl:** ```bash IMAGE_BASE64=$(base64 -w 0 cat.jpg) @@ -334,9 +234,7 @@ curl http://localhost:8091/v1/chat/completions \ ### Text to Text (text2text) -Pure text generation: - -**Using Python client** +**Python client:** ```bash python openai_chat_client.py \ @@ -344,26 +242,81 @@ python openai_chat_client.py \ --modality text2text ``` -**Using 
curl** +**curl:** ```bash curl http://localhost:8091/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}] + "messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}], "modalities": ["text"] }' ``` +### Python Client Arguments + +| Argument | Default | Description | +| :------- | :------ | :---------- | +| `--prompt` / `-p` | `A cute cat` | Text prompt | +| `--output` / `-o` | `bagel_output.png` | Output file path | +| `--server` / `-s` | `http://localhost:8091` | Server URL | +| `--image-url` / `-i` | `None` | Input image URL or local path (img2img/img2text) | +| `--modality` / `-m` | `text2img` | `text2img`, `img2img`, `img2text`, `text2text` | +| `--height` | `512` | Image height in pixels | +| `--width` | `512` | Image width in pixels | +| `--steps` | `25` | Number of inference steps | +| `--seed` | `42` | Random seed | +| `--negative` | `None` | Negative prompt for CFG | + +Example with custom parameters: + +```bash +python openai_chat_client.py \ + --prompt "A futuristic city" \ + --modality text2img \ + --height 768 \ + --width 768 \ + --steps 50 \ + --seed 42 \ + --negative "blurry, low quality" +``` + +## Configuration Reference + +### Deploy YAML Files + +| File | Description | +| :--- | :---------- | +| [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml) | Two-stage default (Thinker + DiT on GPU 0) | +| [`bagel_single_stage.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel_single_stage.yaml) | Single-stage (DiT only) | + +### Key Deploy YAML Fields + +| Field | Scope | Description | +| :---- | :---- | :---------- | +| `pipeline` | top-level | Override auto-detected pipeline (e.g. 
`bagel_single_stage`) | +| `stages[].stage_id` | per-stage | Stage identifier (0, 1, ...) | +| `stages[].devices` | per-stage | GPU device IDs (e.g. `"0"`, `"0,1"`) | +| `stages[].max_num_seqs` | per-stage | Maximum concurrent sequences | +| `stages[].gpu_memory_utilization` | per-stage | Fraction of GPU memory to use | +| `stages[].enforce_eager` | per-stage | Disable CUDA graphs | +| `stages[].tensor_parallel_size` | per-stage | TP degree for this stage | +| `connectors` | top-level | Define available connector instances (SHM, Mooncake) | +| `platforms` | top-level | Platform-specific overrides (e.g. `xpu`) | + ## FAQ -- If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. +- If you encounter OOM errors, try decreasing `max_model_len` or `gpu_memory_utilization` in the deploy YAML. + +**Two-stage VRAM usage:** + +| Stage | VRAM | +| :---- | :--- | +| Stage 0 (Thinker) | **15.04 GiB + KV Cache** | +| Stage 1 (DiT) | **26.50 GiB** | +| Total | **~42 GiB + KV Cache** | -| Stage | VRAM | -| :------------------ | :--------------------------- | -| Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** | -| Stage-1 (DiT) | **26.50 GiB** | -| Total | **~42 GiB + KV Cache** | +**Single-stage VRAM usage:** The DiT loads the full model (~42 GiB) in one process. ## Example materials diff --git a/examples/offline_inference/bagel/README.md b/examples/offline_inference/bagel/README.md index 3e653d0e3ab..62d1be144a5 100644 --- a/examples/offline_inference/bagel/README.md +++ b/examples/offline_inference/bagel/README.md @@ -1,44 +1,60 @@ # BAGEL-7B-MoT -## Set up +## Setup Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. -## Run examples +## Architecture -**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. 
We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices. +BAGEL-7B-MoT is a Mixture-of-Transformers (MoT) model supporting both image generation and understanding. It offers two deployment topologies: -Get into the bagel folder +| Topology | Stages | Description | +| :------- | :----- | :---------- | +| **Two-stage** (default) | Stage 0 (Thinker, AR) + Stage 1 (DiT, Diffusion) | Thinker handles text/understanding via vLLM AR engine; DiT handles image generation. KV cache is transferred between stages. | +| **Single-stage** | Stage 0 (DiT, Diffusion) only | The DiT stage contains a full LLM, ViT, VAE, and tokenizer internally. All modalities are handled within a single diffusion process. | + +Both topologies support all four modalities: `text2img`, `img2img`, `img2text`, `text2text`. + +## Quick Start ```bash cd examples/offline_inference/bagel + +# Default two-stage mode (auto-detected) +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A cute cat" + +# Single-stage mode +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A cute cat" \ + --stage-configs-path vllm_omni/deploy/bagel_single_stage.yaml ``` -### Modality Control +> **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. -BAGEL-7B-MoT supports multiple modality modes. 
You can control the mode using the `--modality` argument: +## Modality Control -#### Text to Image (text2img) +Control the mode using the `--modality` argument: -- **Pipeline**: Text → Thinker → DiT → VAE Decode → Image -- **Stages Used**: Stage 0 (Thinker) + Stage 1 (DiT) -- **KV Transfer**: Thinker sends KV cache to DiT for conditioned generation +| Modality | Input | Output | Description | +| :------- | :---- | :----- | :---------- | +| `text2img` | Text | Image | Generate images from text prompts | +| `img2img` | Image + Text | Image | Transform images using text guidance | +| `img2text` | Image + Text | Text | Generate text descriptions from images | +| `text2text` | Text | Text | Pure text generation (language model mode) | -Generate images from text prompts: +### Text to Image (text2img) ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ - --prompts "A cute cat" + --prompts "A cute cat" \ + --steps 50 ``` -#### Image to Image (img2img) - -- **Pipeline**: Image → VAE Encode → DiT → VAE Decode → New Image -- **Stages Used**: Stage 1 (DiT) only -- **Special**: Bypasses the Thinker stage, direct image-to-image transformation - -Transform images based on text prompts: +### Image to Image (img2img) ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ @@ -47,13 +63,7 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --prompts "Let the woman wear a blue dress" ``` -#### Image to Text (img2text) - -- **Pipeline**: Image → ViT + VAE Encode → Thinker → Text Output -- **Stages Used**: Stage 0 (Thinker) only -- **Special**: Uses both VAE latent encoding AND ViT semantic encoding for comprehensive image understanding - -Generate text descriptions from images: +### Image to Text (img2text) ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ @@ -62,193 +72,198 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --prompts "Describe this image in detail" ``` -#### Text to Text (text2text) - -- 
**Pipeline**: Text → Thinker → Text Output -- **Stages Used**: Stage 0 (Thinker) only -- **Special**: No visual components involved, operates as pure language model - -Pure text generation: +### Text to Text (text2text) ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2text \ --prompts "What is the capital of France?" -# You can load prompts from a text file (one prompt per line): +# Load prompts from a text file (one prompt per line): python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2text \ --txt-prompts /path/to/prompts.txt ``` -### Inference Steps +## Think Mode + +Think mode enables the model to generate `<think>...</think>` planning/reasoning tokens before producing the final output. This improves generation quality for complex prompts. -Control the number of inference steps for image generation: +- **Two-stage**: The Thinker (AR) stage decodes think tokens, then transfers the augmented KV cache to the DiT stage for image generation. +- **Single-stage**: The DiT's internal LLM generates think tokens in-place before proceeding to denoise. ```bash -# You can adjust steps to 100 to improve image quality +# Think + text2img: plan before generating python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ - --steps 50 \ - --prompts "A cute cat" -``` + --prompts "A futuristic city with flying cars" \ + --think \ + --max-think-tokens 1000 -### Key arguments - -BAGEL-7B-MoT supports **multiple modality modes** for different use cases. - -The default yaml configuration deploys Thinker and DiT on the same GPU.
You can use the default configuration file: [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml) - -#### 📌 Command Line Arguments (end2end.py) - -| Argument | Type | Default | Description | -| :--------------------- | :----- | :---------------------------- | :----------------------------------------------------------- | -| `--model` | string | `ByteDance-Seed/BAGEL-7B-MoT` | Model path or name | -| `--modality` | choice | `text2img` | Modality mode: `text2img`, `img2img`, `img2text`, `text2text` | -| `--prompts` | list | `None` | Input text prompts directly | -| `--txt-prompts` | string | `None` | Path to txt file with one prompt per line | -| `--image-path` | string | `None` | Input image path (for `img2img`/`img2text`) | -| `--steps` | int | `50` | Number of inference steps | -| `--stage-configs-path` | string | `None` | Custom stage config file path | -| `--worker-backend` | choice | `process` | Worker backend: `process` or `ray` | -| `--ray-address` | string | `None` | Ray cluster address | -| `--enable-stats` | flag | `False` | Enable statistics logging | -| `--init-sleep-seconds` | int | `20` | Initialization sleep time | -| `--batch-timeout` | int | `5` | Batch timeout | -| `--init-timeout` | int | `300` | Initialization timeout | - ------- - -#### ⚙️ Stage Configuration Parameters (bagel.yaml) - - **Stage 0 - Thinker (LLM Stage)** - -| Parameter | Value | Description | -| :------------------------------- | :------------------------------ | :----------------------- | -| `stage_type` | `llm` | Stage type | -| `devices` | `"0"` | GPU device ID | -| `max_num_seqs` | `1` | Maximum batch size | -| `model_stage` | `thinker` | Model stage identifier | -| `model_arch` | `BagelForConditionalGeneration` | Model architecture | -| `gpu_memory_utilization` | `0.4` | GPU memory utilization | -| `tensor_parallel_size` | `1` | Tensor parallel size | -| `max_num_batched_tokens` | `32768` | Maximum batched tokens | -| `omni_kv_config.need_send_cache` | 
`true` | Whether to send KV cache | - ------- - -**Stage 1 - DiT (Diffusion Stage)** - -| Parameter | Value | Description | -| :------------------------------- | :---------- | :-------------------------- | -| `stage_type` | `diffusion` | Stage type | -| `devices` | `"0"` | GPU device ID | -| `max_num_seqs` | `1` | Maximum batch size | -| `model_stage` | `dit` | Model stage identifier | -| `gpu_memory_utilization` | `0.4` | GPU memory utilization | -| `omni_kv_config.need_recv_cache` | `true` | Whether to receive KV cache | -| `engine_input_source` | `[0]` | Input source from Stage 0 | - ------- - -#### Tensor Parallelism (TP) - -For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) by modifying the stage configuration (e.g., [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml)). - -1. **Set `tensor_parallel_size`**: Increase this value (e.g., to `2` or `4`). -2. **Set `devices`**: Specify the comma-separated GPU IDs to be used for the stage (e.g., `"0,1"`). - -Example configuration for TP=2 on GPUs 0 and 1: -```yaml - engine_args: - tensor_parallel_size: 2 - ... - runtime: - devices: "0,1" +# Think + img2img: reason about the edit +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality img2img \ + --image-path /path/to/image.jpg \ + --prompts "Make it look like a watercolor painting" \ + --think + +# Think + img2text: reason before describing +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality img2text \ + --image-path /path/to/image.jpg \ + --prompts "What is happening in this image?" 
\ + --think + +# Think + text2text: chain-of-thought reasoning +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2text \ + --prompts "Solve: 23 * 47" \ + --think ``` ------- +Think mode parameters: + +| Argument | Default | Description | +| :------- | :------ | :---------- | +| `--think` | `False` | Enable thinking mode | +| `--max-think-tokens` | `1000` | Maximum tokens for think generation | +| `--do-sample` | `False` | Enable sampling (vs. greedy) for text generation | +| `--text-temperature` | `0.3` | Temperature for text generation sampling | -#### 🔗 Runtime Configuration +## Classifier-Free Guidance (CFG) -| Parameter | Value | Description | -| :-------------------- | :------ | :------------------------------- | -| `shm_threshold_bytes` | `65536` | Shared memory threshold (64KB) | +CFG controls the trade-off between prompt fidelity and diversity. These parameters apply to image generation modalities (`text2img`, `img2img`). -## Using Mooncake Connector +```bash +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A photorealistic portrait" \ + --cfg-text-scale 6.0 \ + --cfg-img-scale 2.0 \ + --negative-prompt "blurry, low quality, distorted" \ + --cfg-interval 0.4 1.0 \ + --cfg-renorm-type global \ + --cfg-renorm-min 0.0 +``` -[Mooncake](https://github.com/kvcache-ai/Mooncake) is a high-performance distributed KV cache transfer engine that enables efficient cross-node data movement via TCP or RDMA, making it ideal for multi-node disaggregated inference. 
+| Argument | Default | Description | +| :------- | :------ | :---------- | +| `--cfg-text-scale` | `4.0` | Text CFG scale (higher = more prompt-adherent) | +| `--cfg-img-scale` | `1.5` | Image CFG scale (for img2img) | +| `--negative-prompt` | `None` | Negative prompt for CFG conditioning | +| `--cfg-interval` | pipeline default | CFG active interval `[start, end]` as fractions of total timesteps | +| `--cfg-renorm-type` | `None` | Renormalization type: `global`, `text_channel`, `channel` | +| `--cfg-renorm-min` | `None` | Minimum renormalization value | +| `--cfg-parallel-size` | `1` | CFG parallel size: `1` = batched (single GPU), `2` = 2-branch parallel, `3` = full 3-GPU parallel | -By default, BAGEL uses `SharedMemoryConnector` for inter-stage communication. You can switch to the Mooncake connector for better performance on multi-GPU setups and to enable multi-node deployment. +## Deployment Topologies -### Prerequisites +### Two-Stage (Default) -Install the Mooncake transfer engine: +The default topology auto-detected from the model. No extra flags needed. ```bash -# For CUDA-enabled systems (recommended) -pip install mooncake-transfer-engine - -# For non-CUDA systems -pip install mooncake-transfer-engine-non-cuda +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A cute cat" ``` -### Step 1: Start the Mooncake Master +The pipeline is defined in [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml). Stage 0 (Thinker) and Stage 1 (DiT) share GPU 0 by default. For dual-GPU setups, customize the deploy YAML and set `devices: "1"` for stage 1. -On the **primary node**, start the Mooncake master service (run in a separate terminal or background with `&`): +### Single-Stage + +Pass the single-stage deploy config via `--stage-configs-path`: ```bash -# Optional: enable disk-backed storage by creating a directory and passing --root_fs_dir. 
-# Without it, Mooncake runs in memory-only mode, which is sufficient for KV cache transfer. -mkdir -p ./mc_storage - -mooncake_master \ - --rpc_port=50051 \ - --enable_http_metadata_server=true \ - --http_metadata_server_host=0.0.0.0 \ - --http_metadata_server_port=8080 \ - --metrics_port=9003 \ - --root_fs_dir=./mc_storage/ \ - --cluster_id=mc-local-1 & +python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ + --modality text2img \ + --prompts "A cute cat" \ + --stage-configs-path vllm_omni/deploy/bagel_single_stage.yaml ``` -### Step 2: Run Offline Inference with Mooncake +See [`bagel_single_stage.yaml`](../../../vllm_omni/deploy/bagel_single_stage.yaml) for configuration details. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. -Use the provided Mooncake stage config [`bagel_multiconnector.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml). Before launching, update the `metadata_server` and `master` addresses in the YAML to match your Mooncake master node's IP (use `127.0.0.1` for single-node testing). +### Tensor Parallelism (TP) -```bash -cd examples/offline_inference/bagel +For larger models or multi-GPU environments: -# Text to Image with Mooncake +```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml + --tensor-parallel-size 2 +``` -# Image to Text with Mooncake -python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality img2text \ - --image-path /path/to/image.jpg \ - --prompts "Describe this image" \ - --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml +Or customize the deploy YAML (see [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml)) with per-stage `tensor_parallel_size`. 
+ +### FP8 Quantization -# Text to Text with Mooncake +```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ - --modality text2text \ - --prompts "What is the capital of France?" \ - --stage-configs-path ../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml + --modality text2img \ + --prompts "A cute cat" \ + --quantization fp8 ``` -For more details on the Mooncake connector and multi-node setup, see the [Mooncake Store Connector documentation](../../../docs/design/feature/omni_connectors/mooncake_store_connector.md). - ------- +## Command Line Reference + +### Core Arguments + +| Argument | Type | Default | Description | +| :------- | :--- | :------ | :---------- | +| `--model` | string | `ByteDance-Seed/BAGEL-7B-MoT` | Model path or HuggingFace name | +| `--modality` | choice | `text2img` | `text2img`, `img2img`, `img2text`, `text2text` | +| `--prompts` | list | `None` | Input text prompts | +| `--txt-prompts` | string | `None` | Path to text file with one prompt per line | +| `--image-path` | string | `None` | Input image path (required for `img2img`/`img2text`) | +| `--output` | string | `.` | Output directory for saved images | +| `--steps` | int | `50` | Number of diffusion inference steps | +| `--seed` | int | `None` | Random seed for reproducibility | + +### Think Mode Arguments + +| Argument | Type | Default | Description | +| :------- | :--- | :------ | :---------- | +| `--think` | flag | `False` | Enable `<think>...</think>` planning/reasoning | +| `--max-think-tokens` | int | `1000` | Maximum tokens for think generation | +| `--do-sample` | flag | `False` | Use sampling instead of greedy decoding | +| `--text-temperature` | float | `0.3` | Sampling temperature for text generation | + +### CFG Arguments + +| Argument | Type | Default | Description | +| :------- | :--- | :------ | :---------- | +| `--cfg-text-scale` | float | `4.0` | Text CFG guidance scale | +| `--cfg-img-scale` | float | `1.5` | Image CFG guidance scale | +|
`--negative-prompt` | string | `None` | Negative prompt for CFG | +| `--cfg-parallel-size` | int | `1` | CFG parallel GPU count (1, 2, or 3) | +| `--cfg-interval` | float[2] | pipeline default | CFG active window `[start, end]` | +| `--cfg-renorm-type` | string | `None` | `global`, `text_channel`, or `channel` | +| `--cfg-renorm-min` | float | `None` | Minimum renormalization value | + +### Engine Arguments + +| Argument | Type | Default | Description | +| :------- | :--- | :------ | :---------- | +| `--stage-configs-path` | string | `None` | Path to deploy YAML (auto-detected if omitted) | +| `--worker-backend` | choice | `process` | `process` or `ray` | +| `--ray-address` | string | `None` | Ray cluster address | +| `--quantization` | string | `None` | Quantization method (e.g. `fp8`) | +| `--log-stats` | flag | `False` | Enable statistics logging | +| `--init-timeout` | int | `300` | Initialization timeout (seconds) | +| `--batch-timeout` | int | `5` | Batch timeout (seconds) | +| `--enable-diffusion-pipeline-profiler` | flag | `False` | Profile diffusion stage durations | ## FAQ -- If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. +- If you encounter OOM errors, try decreasing `max_model_len` or `gpu_memory_utilization` in the deploy YAML. + +**Two-stage VRAM usage:** + +| Stage | VRAM | +| :---- | :--- | +| Stage 0 (Thinker) | **15.04 GiB + KV Cache** | +| Stage 1 (DiT) | **26.50 GiB** | +| Total | **~42 GiB + KV Cache** | -| Stage | VRAM | -| :------------------ | :--------------------------- | -| Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** | -| Stage-1 (DiT) | **26.50 GiB** | -| Total | **~42 GiB + KV Cache** | +**Single-stage VRAM usage:** The DiT loads the full model (~42 GiB) in one process. 
diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index ed5fa57e8d6..8ce09c5d617 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -147,13 +147,8 @@ def main(): omni_kwargs = {} stage_configs_path = args.stage_configs_path - is_single_stage = stage_configs_path and "single_stage" in stage_configs_path - if args.think and stage_configs_path is None: - stage_configs_path = "vllm_omni/model_executor/stage_configs/bagel_think.yaml" - print(f"[Info] Think mode enabled, using stage config: {stage_configs_path}") if stage_configs_path: omni_kwargs["stage_configs_path"] = stage_configs_path - is_single_stage = "single_stage" in stage_configs_path omni_kwargs.update( { @@ -215,9 +210,8 @@ def main(): formatted_prompts.append(prompt_dict) params_list = omni.default_sampling_params_list + is_single_stage = len(params_list) == 1 - # For single-stage DiT, think/text params go into the diffusion sampling params extra_args. - # For 2-stage, diffusion params are at index 1. diffusion_params_idx = 0 if is_single_stage else (1 if len(params_list) > 1 else 0) diffusion_params = params_list[diffusion_params_idx] diff --git a/examples/online_serving/bagel/README.md b/examples/online_serving/bagel/README.md index 0939bc5f387..7c1fa13569a 100644 --- a/examples/online_serving/bagel/README.md +++ b/examples/online_serving/bagel/README.md @@ -1,145 +1,106 @@ # BAGEL-7B-MoT -## 🛠️ Installation +## Installation Please refer to [README.md](../../../README.md) -## Run examples (BAGEL-7B-MoT) +## Architecture -**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices. +BAGEL-7B-MoT is a Mixture-of-Transformers (MoT) model supporting both image generation and understanding. 
It offers two deployment topologies: -### Launch the Server +| Topology | Stages | Description | +| :------- | :----- | :---------- | +| **Two-stage** (default) | Stage 0 (Thinker, AR) + Stage 1 (DiT, Diffusion) | Thinker handles text/understanding via vLLM AR engine; DiT handles image generation. KV cache is transferred between stages. | +| **Single-stage** | Stage 0 (DiT, Diffusion) only | The DiT stage contains a full LLM, ViT, VAE, and tokenizer internally. All modalities are handled within a single diffusion process. | + +Both topologies support all four modalities: `text2img`, `img2img`, `img2text`, `text2text`. + +> **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. + +## Launch the Server + +### Two-Stage (Default) + +The default pipeline is auto-detected from the model. No extra flags needed: ```bash -# Use default configuration vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 ``` Or use the convenience script: ```bash -cd /workspace/vllm-omni/examples/online_serving/bagel +cd examples/online_serving/bagel bash run_server.sh ``` -```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/stage_configs_file -``` - -#### 🚀 Tensor Parallelism (TP) - -For larger models or multi-GPU environments, you can enable Tensor Parallelism (TP) for the server. - -1. **Modify Stage Config**: Create or modify a stage configuration yaml (e.g., [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml)). Set `tensor_parallel_size` to `2` (or more) and update `devices` to include multiple GPU IDs (e.g., `"0,1"`). +To use a custom deploy YAML: -```yaml - engine_args: - tensor_parallel_size: 2 - ... - runtime: - devices: "0,1" -``` - -2. 
**Launch Server**: ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --stage-configs-path /path/to/your/custom_bagel.yaml +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ + --deploy-config /path/to/deploy_config.yaml ``` -#### Using Mooncake Connector +See [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml) for the default two-stage deploy configuration. -By default, BAGEL uses `SharedMemoryConnector` for inter-stage communication. You can use the [Mooncake](https://github.com/kvcache-ai/Mooncake) connector to transfer KV cache between stages, which also enables multi-node deployment. +### Single-Stage -**1. Install Mooncake** +The DiT stage contains a full LLM, ViT, VAE, and tokenizer, so it can handle all modalities (text2img, img2img, img2text, text2text, think) without a separate Thinker stage: ```bash -# For CUDA-enabled systems (recommended) -pip install mooncake-transfer-engine - -# For non-CUDA systems -pip install mooncake-transfer-engine-non-cuda +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ + --deploy-config vllm_omni/deploy/bagel_single_stage.yaml ``` -**2. Start Mooncake Master** on the primary node: +See [`bagel_single_stage.yaml`](../../../vllm_omni/deploy/bagel_single_stage.yaml) for configuration. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. -```bash -# Optional: enable disk-backed storage by creating a directory and passing --root_fs_dir. -# Without it, Mooncake runs in memory-only mode, which is sufficient for KV cache transfer. -mkdir -p ./mc_storage - -mooncake_master \ - --rpc_port=50051 \ - --enable_http_metadata_server=true \ - --http_metadata_server_host=0.0.0.0 \ - --http_metadata_server_port=8080 \ - --metrics_port=9003 \ - --root_fs_dir=./mc_storage/ \ - --cluster_id=mc-local-1 & -``` +### Tensor Parallelism (TP) -**3. 
Launch the server** with the Mooncake stage config: +For larger models or multi-GPU environments, enable TP via CLI: ```bash -vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ - --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml +vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 --tensor-parallel-size 2 ``` -> **Note**: Before launching, edit [`bagel_multiconnector.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml) and replace the `metadata_server` and `master` addresses with your Mooncake master node's actual IP. For single-node testing, `127.0.0.1` works. - -The client-side usage is identical to the default setup -- the Mooncake connector is transparent to the API. See the requests section below. - -For more details on the Mooncake connector configuration, see the [Mooncake Store Connector documentation](../../../docs/design/feature/omni_connectors/mooncake_store_connector.md). - -#### Multi-Node Deployment +Or set `tensor_parallel_size` per stage in a custom deploy YAML. -You can deploy each stage on a **separate node** for better resource utilization. In this example, the orchestrator (Stage 0 / Thinker) and Stage 1 (DiT) run on different machines, connected via Mooncake. +### Multi-Node Deployment -Replace `<ORCHESTRATOR_IP>` below with the actual IP address of your orchestrator node (e.g., `10.244.227.244`). +Deploy each stage on a **separate node** for better resource utilization. Replace `<ORCHESTRATOR_IP>` with the actual IP address of your orchestrator node. -> [!WARNING] -> **Before launching**, edit [`bagel_multiconnector.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml) and replace the `metadata_server` and `master` addresses with your Mooncake master node's actual IP. Mismatched addresses will cause silent connection failures. - -**1.
Start Mooncake Master** (on the orchestrator node): - -```bash -mooncake_master \ - --rpc_port=50051 \ - --enable_http_metadata_server=true \ - --http_metadata_server_host= \ - --http_metadata_server_port=8080 \ - --metrics_port=9003 -``` - -**2. Launch Stage 0 (Thinker / Orchestrator)** on the orchestrator node: +**1. Launch Stage 0 (Thinker / Orchestrator)** on the orchestrator node: ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ - --port 8000 \ # API server port for client requests - --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml \ + --port 8000 \ --stage-id 0 \ -oma \ -omp 8091 ``` -**3. Launch Stage 1 (DiT)** on the remote node in headless mode: +**2. Launch Stage 1 (DiT)** on the remote node in headless mode: ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ - --stage-configs-path vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml \ --stage-id 1 \ --headless \ -oma \ -omp 8091 ``` -**Mooncake Master arguments:** +Or use the convenience script: -| Argument | Description | -| :------- | :---------- | -| `--rpc_port` | Mooncake RPC port for control-plane coordination between stages | -| `--enable_http_metadata_server` | Enable the HTTP metadata server for service discovery | -| `--http_metadata_server_host` | IP address to bind the metadata server (use the orchestrator node's IP) | -| `--http_metadata_server_port` | Port for the HTTP metadata server | -| `--metrics_port` | Port for Prometheus-compatible metrics endpoint | +```bash +# Terminal 1: Stage 0 +bash run_server_stage_cli.sh --stage 0 + +# Terminal 2: Stage 1 +bash run_server_stage_cli.sh --stage 1 + +# With extra args +bash run_server_stage_cli.sh --stage 0 -- --tensor-parallel-size 2 +bash run_server_stage_cli.sh --stage 1 -- --gpu-memory-utilization 0.9 +``` **vllm serve arguments:** @@ -147,85 +108,31 @@ vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ | :------- | :---------- | | `--stage-id` | Which stage this process runs (0 = 
Thinker, 1 = DiT) | | `--headless` | Run without the API server (worker-only mode) | -| `-oma` | Orchestrator master address | -| `-omp` | Orchestrator master port for Stage 1 to connect to Stage 0 for task coordination | +| `-oma` / `--omni-master-address` | Orchestrator master address | +| `-omp` / `--omni-master-port` | Orchestrator master port | > [!IMPORTANT] > **Startup Order**: Stage 0 (orchestrator) must be launched **before** Stage 1 (headless). > Stage 0 will appear to hang on startup until Stage 1 (worker) connects — this is expected behavior. -**Network Requirements** - -All nodes must have network connectivity to each other. Ensure the following ports are open **between all participating nodes**: +### Inter-Stage Connectors -| Port | Protocol | Service | Direction | -| :--- | :------- | :------ | :-------- | -| 50051 | TCP | Mooncake Master RPC | Worker → Orchestrator | -| 8080 | TCP | Mooncake HTTP Metadata Server | Worker → Orchestrator | -| 8091 | TCP | Orchestrator Master (`-omp`) | Worker → Orchestrator | -| 8000 | TCP | API Server (`--port`) | Client → Orchestrator | -| 9003 | TCP | Metrics (optional) | Monitoring → Orchestrator | +When deploying stages across nodes, configure the connector type in the deploy YAML: -> **Tip**: If nodes are behind a firewall or in different VPCs/security groups, make sure the above ports are allowed in ingress/egress rules. All nodes should be reachable via their IP addresses (no NAT). Using nodes on the same subnet or VPC is recommended to minimize latency for Mooncake KV cache transfers. +- **SharedMemoryConnector** (default): Used for single-node deployments. No explicit configuration needed. +- **MooncakeTransferEngineConnector**: For multi-node setups with RDMA hardware. Defined in [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml) under `connectors.rdma_connector`. 
-### Send Multi-modal Request +To use Mooncake, create a custom deploy YAML that binds `output_connectors` / `input_connectors` on each stage to the `rdma_connector` defined in the `connectors` section. -Get into the bagel folder: +## Send Requests ```bash cd examples/online_serving/bagel ``` -Send request via Python - -```bash -python openai_chat_client.py --prompt "A cute cat" --modality text2img -``` - -The Python client supports the following command-line arguments: - -- `--prompt` (or `-p`): Text prompt for generation (default: `A cute cat`) -- `--output` (or `-o`): Output file path for image results (default: `bagel_output.png`) -- `--server` (or `-s`): Server URL (default: `http://localhost:8091`) -- `--image-url` (or `-i`): Input image URL or local file path (for img2img/img2text modes) -- `--modality` (or `-m`): Task modality (default: `text2img`). Options: `text2img`, `img2img`, `img2text`, `text2text` -- `--height`: Image height in pixels (default: 512) -- `--width`: Image width in pixels (default: 512) -- `--steps`: Number of inference steps (default: 25) -- `--seed`: Random seed (default: 42) -- `--negative`: Negative prompt for image generation - -Example with custom parameters: - -```bash -python openai_chat_client.py \ - --prompt "A futuristic city" \ - --modality text2img \ - --height 768 \ - --width 768 \ - --steps 50 \ - --seed 42 \ - --negative "blurry, low quality" -``` - -## Modality Control - -BAGEL-7B-MoT supports **multiple modality modes** for different use cases. - -The default yaml configuration deploys Thinker and DiT on the same GPU. 
You can use the default configuration file: [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml) - -| Modality | Input | Output | Description | -| ----------- | ------------ | ------ | -------------------------------------- | -| `text2img` | Text | Image | Generate images from text prompts | -| `img2img` | Image + Text | Image | Transform images using text guidance | -| `img2text` | Image + Text | Text | Generate text descriptions from images | -| `text2text` | Text | Text | Pure text generation | - ### Text to Image (text2img) -Generate images from text prompts: - -**Using Python client** +**Python client:** ```bash python openai_chat_client.py \ @@ -235,7 +142,7 @@ python openai_chat_client.py \ --steps 50 ``` -**Using curl** +**curl:** ```bash curl http://localhost:8091/v1/chat/completions \ @@ -250,12 +157,9 @@ curl http://localhost:8091/v1/chat/completions \ }' ``` - ### Image to Image (img2img) -Transform images based on text prompts: - -**Using Python client** +**Python client:** ```bash python openai_chat_client.py \ @@ -265,7 +169,7 @@ python openai_chat_client.py \ --output transformed.png ``` -**Using curl** +**curl:** ```bash IMAGE_BASE64=$(base64 -w 0 cat.jpg) @@ -290,14 +194,11 @@ EOF curl http://localhost:8091/v1/chat/completions \ -H "Content-Type: application/json" \ -d @payload.json - ``` ### Image to Text (img2text) -Generate text descriptions from images: - -**Using Python client** +**Python client:** ```bash python openai_chat_client.py \ @@ -306,7 +207,7 @@ python openai_chat_client.py \ --image-url /path/to/image.jpg ``` -**Using curl** +**curl:** ```bash IMAGE_BASE64=$(base64 -w 0 cat.jpg) @@ -331,9 +232,7 @@ curl http://localhost:8091/v1/chat/completions \ ### Text to Text (text2text) -Pure text generation: - -**Using Python client** +**Python client:** ```bash python openai_chat_client.py \ @@ -341,23 +240,78 @@ python openai_chat_client.py \ --modality text2text ``` -**Using curl** +**curl:** ```bash curl 
http://localhost:8091/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}] + "messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}], "modalities": ["text"] }' ``` +### Python Client Arguments + +| Argument | Default | Description | +| :------- | :------ | :---------- | +| `--prompt` / `-p` | `A cute cat` | Text prompt | +| `--output` / `-o` | `bagel_output.png` | Output file path | +| `--server` / `-s` | `http://localhost:8091` | Server URL | +| `--image-url` / `-i` | `None` | Input image URL or local path (img2img/img2text) | +| `--modality` / `-m` | `text2img` | `text2img`, `img2img`, `img2text`, `text2text` | +| `--height` | `512` | Image height in pixels | +| `--width` | `512` | Image width in pixels | +| `--steps` | `25` | Number of inference steps | +| `--seed` | `42` | Random seed | +| `--negative` | `None` | Negative prompt for CFG | + +Example with custom parameters: + +```bash +python openai_chat_client.py \ + --prompt "A futuristic city" \ + --modality text2img \ + --height 768 \ + --width 768 \ + --steps 50 \ + --seed 42 \ + --negative "blurry, low quality" +``` + +## Configuration Reference + +### Deploy YAML Files + +| File | Description | +| :--- | :---------- | +| [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml) | Two-stage default (Thinker + DiT on GPU 0) | +| [`bagel_single_stage.yaml`](../../../vllm_omni/deploy/bagel_single_stage.yaml) | Single-stage (DiT only) | + +### Key Deploy YAML Fields + +| Field | Scope | Description | +| :---- | :---- | :---------- | +| `pipeline` | top-level | Override auto-detected pipeline (e.g. `bagel_single_stage`) | +| `stages[].stage_id` | per-stage | Stage identifier (0, 1, ...) | +| `stages[].devices` | per-stage | GPU device IDs (e.g. 
`"0"`, `"0,1"`) | +| `stages[].max_num_seqs` | per-stage | Maximum concurrent sequences | +| `stages[].gpu_memory_utilization` | per-stage | Fraction of GPU memory to use | +| `stages[].enforce_eager` | per-stage | Disable CUDA graphs | +| `stages[].tensor_parallel_size` | per-stage | TP degree for this stage | +| `connectors` | top-level | Define available connector instances (SHM, Mooncake) | +| `platforms` | top-level | Platform-specific overrides (e.g. `xpu`) | + ## FAQ -- If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. +- If you encounter OOM errors, try decreasing `max_model_len` or `gpu_memory_utilization` in the deploy YAML. + +**Two-stage VRAM usage:** + +| Stage | VRAM | +| :---- | :--- | +| Stage 0 (Thinker) | **15.04 GiB + KV Cache** | +| Stage 1 (DiT) | **26.50 GiB** | +| Total | **~42 GiB + KV Cache** | -| Stage | VRAM | -| :------------------ | :--------------------------- | -| Stage-0 (Thinker) | **15.04 GiB** **+ KV Cache** | -| Stage-1 (DiT) | **26.50 GiB** | -| Total | **~42 GiB + KV Cache** | +**Single-stage VRAM usage:** The DiT loads the full model (~42 GiB) in one process. 
diff --git a/examples/online_serving/bagel/run_server_stage_cli.sh b/examples/online_serving/bagel/run_server_stage_cli.sh index 2d0b4bc369e..55f64fcb965 100644 --- a/examples/online_serving/bagel/run_server_stage_cli.sh +++ b/examples/online_serving/bagel/run_server_stage_cli.sh @@ -20,7 +20,7 @@ MASTER_ADDRESS="${MASTER_ADDRESS:-127.0.0.1}" MASTER_PORT="${MASTER_PORT:-8092}" STAGE="all" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -STAGE_CONFIGS_PATH="${STAGE_CONFIGS_PATH:-$SCRIPT_DIR/../../../vllm_omni/model_executor/stage_configs/bagel.yaml}" +STAGE_CONFIGS_PATH="${STAGE_CONFIGS_PATH:-$SCRIPT_DIR/../../../vllm_omni/deploy/bagel.yaml}" EXTRA_ARGS=() usage() { diff --git a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml deleted file mode 100644 index b7768c071f6..00000000000 --- a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# stage config for running BAGEL with Mooncake connector for CI e2e tests. -# This config is optimized for single GPU tests with Mooncake inter-stage communication. 
- -stage_args: - - stage_id: 0 - stage_type: llm - prompt_expand_func: vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: BagelForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.45 - enforce_eager: true - trust_remote_code: true - engine_output_type: text - distributed_executor_backend: mp - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - load_format: dummy - omni_kv_config: - need_send_cache: true - kv_transfer_criteria: - type: prefill_finished - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 52 - detokenize: true - repetition_penalty: 1.05 - output_connectors: - to_stage_1: mooncake_connector - - stage_id: 1 - stage_type: diffusion - cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches - runtime: - devices: "0" - engine_args: - model_stage: dit - max_num_seqs: 1 - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: mp - omni_kv_config: - need_recv_cache: true - engine_input_source: [0] - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 52 - input_connectors: - from_stage_0: mooncake_connector - -# Top-level runtime config with Mooncake connector -runtime: - enabled: true - connectors: - mooncake_connector: - name: MooncakeConnector - extra: - host: "${MOONCAKE_HOST}" - metadata_server: "http://${MOONCAKE_HOST}:${MOONCAKE_HTTP_PORT}/metadata" - master: "${MOONCAKE_HOST}:${MOONCAKE_RPC_PORT}" - segment: 64000000 - localbuf: 64000000 - proto: tcp - edges: - - from: 0 - to: 1 diff --git a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml 
b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml deleted file mode 100644 index 504f3c98e92..00000000000 --- a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# stage config for running BAGEL with SharedMemory connector for CI e2e tests. -# This config is optimized for single GPU tests with SharedMemory inter-stage communication. - -stage_args: - - stage_id: 0 - stage_type: llm - prompt_expand_func: vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: OmniBagelForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.45 - enforce_eager: true - trust_remote_code: true - engine_output_type: text - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - load_format: dummy - omni_kv_config: - need_send_cache: true - kv_transfer_criteria: - type: prefill_finished #or special token generated - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 52 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: diffusion - cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches - runtime: - devices: "0" - engine_args: - model_stage: dit - max_num_seqs: 1 - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - omni_kv_config: - need_recv_cache: true - engine_input_source: [0] - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 52 - -# Runtime edges -runtime: - enabled: true - # Distributed connectors configuration (optional) - # More connectors will be supported in the future. 
- connectors: - shared_memory_connector: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 # 64KB threshold - - - edges: - - from: 0 - to: 1 diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index 66aec80c7c4..b4de059f2d0 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -15,7 +15,6 @@ """ import socket -from pathlib import Path from typing import Any import pytest @@ -24,10 +23,12 @@ from tests.helpers.mark import hardware_test from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import modify_stage_config +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config from vllm_omni.entrypoints.omni import Omni from vllm_omni.platforms import current_omni_platform +BAGEL_CI_DEPLOY = get_deploy_config_path("ci/bagel.yaml") + # Reference pixel data extracted from the known-good output image # Generated with seed=52, num_inference_steps=15, # prompt='Change the grass color to red', @@ -183,8 +184,8 @@ def _generate_bagel_img2img( return generated_image -def _resolve_stage_config(config_path: str, run_level: str) -> str: - """Resolve stage config based on run level. +def _resolve_deploy_config(config_path: str, run_level: str) -> str: + """Resolve deploy config based on run level. For advanced_model (real weights), strip load_format: dummy so the model falls back to loading real weights from HuggingFace. 
@@ -193,9 +194,9 @@ def _resolve_stage_config(config_path: str, run_level: str) -> str: return modify_stage_config( config_path, deletes={ - "stage_args": { - 0: ["engine_args.load_format"], - 1: ["engine_args.load_format"], + "stages": { + 0: ["load_format"], + 1: ["load_format"], } }, ) @@ -209,8 +210,7 @@ def _resolve_stage_config(config_path: str, run_level: str) -> str: def test_bagel_img2img_shared_memory_connector(run_level): """Test Bagel img2img with shared memory connector.""" input_image = _load_input_image() - config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") - config_path = _resolve_stage_config(config_path, run_level) + config_path = _resolve_deploy_config(BAGEL_CI_DEPLOY, run_level) with OmniRunner( "ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py index 75f41f9beea..aa4a2cd12bd 100644 --- a/tests/e2e/offline_inference/test_bagel_lora.py +++ b/tests/e2e/offline_inference/test_bagel_lora.py @@ -17,14 +17,13 @@ import json import os +from pathlib import Path from vllm_omni.inputs.data import OmniSamplingParams from vllm_omni.outputs import OmniRequestOutput os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -from pathlib import Path - import numpy as np import pytest import torch @@ -33,13 +32,13 @@ from tests.helpers.mark import hardware_test from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import modify_stage_config +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config from vllm_omni.entrypoints.omni import Omni from vllm_omni.lora.request import LoRARequest from vllm_omni.lora.utils import stable_lora_int_id MODEL = "ByteDance-Seed/BAGEL-7B-MoT" -BAGEL_STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") +BAGEL_STAGE_CONFIG = get_deploy_config_path("ci/bagel.yaml") DEFAULT_PROMPT = 
"<|im_start|>A cute cat<|im_end|>" @@ -48,14 +47,14 @@ # --------------------------------------------------------------------------- -def _resolve_stage_config(config_path: str, run_level: str) -> str: +def _resolve_deploy_config(config_path: str, run_level: str) -> str: if run_level == "advanced_model": return modify_stage_config( config_path, deletes={ - "stage_args": { - 0: ["engine_args.load_format"], - 1: ["engine_args.load_format"], + "stages": { + 0: ["load_format"], + 1: ["load_format"], } }, ) @@ -153,7 +152,7 @@ def _make_file_lora_request(adapter_dir: Path) -> LoRARequest: @hardware_test(res={"cuda": "H100", "rocm": "MI325"}) def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): """Validate LoRA effect, bounded perturbation, and clean deactivation.""" - config_path = _resolve_stage_config(BAGEL_STAGE_CONFIG, run_level) + config_path = _resolve_deploy_config(BAGEL_STAGE_CONFIG, run_level) with OmniRunner(MODEL, stage_configs_path=config_path) as runner: omni = runner.omni lora_request = _make_file_lora_request(tmp_path / "bagel_lora") diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index 0819f103a0a..65cd8425cd0 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -21,7 +21,6 @@ import subprocess import tempfile import time -from pathlib import Path from typing import Any import pytest @@ -29,10 +28,13 @@ from tests.helpers.mark import hardware_test from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import modify_stage_config +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config from vllm_omni.entrypoints.omni import Omni from vllm_omni.platforms import current_omni_platform +BAGEL_CI_DEPLOY = get_deploy_config_path("ci/bagel.yaml") +BAGEL_MOONCAKE_CI_DEPLOY = get_deploy_config_path("ci/bagel_mooncake.yaml") + # Reference pixel data extracted from the 
known-good output image # Each entry contains (x, y) position and expected (R, G, B) values # "Generated with seed=52, num_inference_steps=15, @@ -172,8 +174,8 @@ def _generate_bagel_image(omni: Omni, prompt: str = DEFAULT_PROMPT) -> Image.Ima return generated_image -def _resolve_stage_config(config_path: str, run_level: str) -> str: - """Resolve stage config based on run level. +def _resolve_deploy_config(config_path: str, run_level: str) -> str: + """Resolve deploy config based on run level. For advanced_model (real weights), strip load_format: dummy so the model falls back to loading real weights from HuggingFace. @@ -182,9 +184,9 @@ def _resolve_stage_config(config_path: str, run_level: str) -> str: return modify_stage_config( config_path, deletes={ - "stage_args": { - 0: ["engine_args.load_format"], - 1: ["engine_args.load_format"], + "stages": { + 0: ["load_format"], + 1: ["load_format"], } }, ) @@ -197,8 +199,7 @@ def _resolve_stage_config(config_path: str, run_level: str) -> str: @hardware_test(res={"cuda": "H100", "rocm": "MI325"}) def test_bagel_text2img_shared_memory_connector(run_level): """Test Bagel text2img with shared memory connector.""" - config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") - config_path = _resolve_stage_config(config_path, run_level) + config_path = _resolve_deploy_config(BAGEL_CI_DEPLOY, run_level) with OmniRunner( "ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, @@ -277,7 +278,7 @@ def _cleanup_mooncake_processes(timeout_secs: int = 5) -> None: def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str: - """Load Mooncake config from YAML and substitute placeholders. + """Load Mooncake config from CI overlay and substitute placeholders. Args: host: Mooncake host address. @@ -287,16 +288,13 @@ def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str: Returns: Path to the temporary config file with substituted values. 
""" - config_path = str(Path(__file__).parent / "stage_configs" / "bagel_mooncake_ci.yaml") - with open(config_path) as f: + with open(BAGEL_MOONCAKE_CI_DEPLOY) as f: config_content = f.read() - # Substitute placeholders config_content = config_content.replace("${MOONCAKE_HOST}", host) config_content = config_content.replace("${MOONCAKE_RPC_PORT}", str(rpc_port)) config_content = config_content.replace("${MOONCAKE_HTTP_PORT}", str(http_port)) - # Write to temp file temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) temp_file.write(config_content) temp_file.close() @@ -346,7 +344,7 @@ def test_bagel_text2img_mooncake_connector(run_level): http_port=MOONCAKE_HTTP_PORT, ) - temp_config_file = _resolve_stage_config(temp_config_file, run_level) + temp_config_file = _resolve_deploy_config(temp_config_file, run_level) with OmniRunner( "ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, diff --git a/tests/e2e/offline_inference/test_bagel_understanding.py b/tests/e2e/offline_inference/test_bagel_understanding.py index c3ed97b42bd..e342152fc02 100644 --- a/tests/e2e/offline_inference/test_bagel_understanding.py +++ b/tests/e2e/offline_inference/test_bagel_understanding.py @@ -21,17 +21,16 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -from pathlib import Path import pytest from vllm.assets.image import ImageAsset from tests.helpers.mark import hardware_test from tests.helpers.runtime import OmniRunner -from tests.helpers.stage_config import modify_stage_config +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config MODEL_NAME = "ByteDance-Seed/BAGEL-7B-MoT" -STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") +STAGE_CONFIG = get_deploy_config_path("ci/bagel.yaml") REFERENCE_TEXT_TEXT2TEXT = "The capital of France is Paris." 
@@ -44,15 +43,15 @@ ) -def _resolve_stage_config(config_path: str, run_level: str) -> str: +def _resolve_deploy_config(config_path: str, run_level: str) -> str: """Strip load_format: dummy for advanced_model (real weights).""" if run_level == "advanced_model": return modify_stage_config( config_path, deletes={ - "stage_args": { - 0: ["engine_args.load_format"], - 1: ["engine_args.load_format"], + "stages": { + 0: ["load_format"], + 1: ["load_format"], } }, ) @@ -74,7 +73,7 @@ def _extract_text(omni_outputs: list) -> str: @hardware_test(res={"cuda": "H100", "rocm": "MI325"}) def test_bagel_text2text(run_level): """Test Bagel text2text produces correct text output.""" - config_path = _resolve_stage_config(STAGE_CONFIG, run_level) + config_path = _resolve_deploy_config(STAGE_CONFIG, run_level) with OmniRunner( MODEL_NAME, stage_configs_path=config_path, @@ -106,7 +105,7 @@ def test_bagel_text2text(run_level): def test_bagel_img2text(run_level): """Test Bagel img2text produces correct text output.""" input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") - config_path = _resolve_stage_config(STAGE_CONFIG, run_level) + config_path = _resolve_deploy_config(STAGE_CONFIG, run_level) with OmniRunner( MODEL_NAME, stage_configs_path=config_path, diff --git a/tests/e2e/offline_inference/test_quantization_fp8.py b/tests/e2e/offline_inference/test_quantization_fp8.py index 92cf351e3d3..708092fc192 100644 --- a/tests/e2e/offline_inference/test_quantization_fp8.py +++ b/tests/e2e/offline_inference/test_quantization_fp8.py @@ -30,7 +30,6 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -from pathlib import Path from typing import Any import pytest @@ -38,6 +37,7 @@ from tests.helpers.mark import hardware_test from tests.helpers.runtime import OmniRunner +from tests.helpers.stage_config import get_deploy_config_path from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from 
vllm_omni.platforms import current_omni_platform @@ -102,7 +102,7 @@ def _generate_bagel_image( Returns (generated_image, peak_memory_gib). """ - config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") + config_path = get_deploy_config_path("ci/bagel.yaml") omni_kwargs: dict[str, Any] = { "model": "ByteDance-Seed/BAGEL-7B-MoT", "stage_configs_path": config_path, diff --git a/tests/e2e/online_serving/test_bagel_online.py b/tests/e2e/online_serving/test_bagel_online.py index 745e9a1161f..a8ec6548937 100644 --- a/tests/e2e/online_serving/test_bagel_online.py +++ b/tests/e2e/online_serving/test_bagel_online.py @@ -23,21 +23,19 @@ import base64 import os from io import BytesIO -from pathlib import Path import pytest from vllm.assets.image import ImageAsset from tests.helpers.mark import hardware_test from tests.helpers.runtime import OmniServerParams +from tests.helpers.stage_config import get_deploy_config_path os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" MODEL = "ByteDance-Seed/BAGEL-7B-MoT" -STAGE_CONFIGS_PATH = str( - Path(__file__).parent.parent / "offline_inference" / "stage_configs" / "bagel_sharedmemory_ci.yaml" -) +STAGE_CONFIGS_PATH = get_deploy_config_path("ci/bagel.yaml") TEXT2IMG_PROMPT = "A cute cat" IMG2IMG_PROMPT = "Change the grass color to red" diff --git a/tests/helpers/stage_config.py b/tests/helpers/stage_config.py index 29a80372ecf..9f416351406 100644 --- a/tests/helpers/stage_config.py +++ b/tests/helpers/stage_config.py @@ -411,6 +411,61 @@ def delete_by_path(config_dict: dict, path: str) -> None: }, }, }, + "bagel": { + "base_config": "bagel.yaml", + "stages": [ + { + "stage_id": 0, + "max_num_seqs": 3, + "gpu_memory_utilization": 0.45, + "load_format": "dummy", + }, + { + "stage_id": 1, + "max_num_seqs": 1, + "load_format": "dummy", + }, + ], + }, + "bagel_single_stage": { + "base_config": "bagel_single_stage.yaml", + "stages": [ + { + "stage_id": 0, + 
"max_num_seqs": 1, + "load_format": "dummy", + }, + ], + }, + "bagel_mooncake": { + "base_config": "bagel.yaml", + "stages": [ + { + "stage_id": 0, + "max_num_seqs": 1, + "gpu_memory_utilization": 0.45, + "load_format": "dummy", + }, + { + "stage_id": 1, + "max_num_seqs": 1, + "load_format": "dummy", + }, + ], + "connectors": { + "mooncake_connector": { + "name": "MooncakeConnector", + "extra": { + "host": "${MOONCAKE_HOST}", + "metadata_server": "http://${MOONCAKE_HOST}:${MOONCAKE_HTTP_PORT}/metadata", + "master": "${MOONCAKE_HOST}:${MOONCAKE_RPC_PORT}", + "segment": 64000000, + "localbuf": 64000000, + "proto": "tcp", + }, + }, + }, + }, # Single-stage thinker-only topology for the abort test. "qwen2_5_omni_thinker_only": { "async_chunk": False, diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py index c07bc2610c3..c5cf716da38 100644 --- a/vllm_omni/config/pipeline_registry.py +++ b/vllm_omni/config/pipeline_registry.py @@ -43,6 +43,14 @@ "vllm_omni.model_executor.models.qwen3_tts.pipeline", "QWEN3_TTS_PIPELINE", ), + "bagel": ( + "vllm_omni.model_executor.models.bagel.pipeline", + "BAGEL_PIPELINE", + ), + "bagel_single_stage": ( + "vllm_omni.model_executor.models.bagel.pipeline", + "BAGEL_SINGLE_STAGE_PIPELINE", + ), } # --- Single-stage diffusion pipelines (populated in PR 3/N) --- diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml new file mode 100644 index 00000000000..13212d98fab --- /dev/null +++ b/vllm_omni/deploy/bagel.yaml @@ -0,0 +1,51 @@ +# BAGEL-7B-MoT deploy: CUDA defaults, verified on NVIDIA A100 (80GB). +# +# Stage 0 (Thinker) and Stage 1 (DiT) share a single GPU by default. +# For dual-GPU setups, set stage 1 devices to "1". +# +# Fields omitted from a stage fall back to StageDeployConfig dataclass +# defaults (see vllm_omni/config/stage_config.py). 
+ +stages: + - stage_id: 0 + max_num_seqs: 3 + gpu_memory_utilization: 0.45 + enforce_eager: true + devices: "0" + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 52 + detokenize: true + repetition_penalty: 1.05 + + - stage_id: 1 + max_num_seqs: 1 + enforce_eager: true + devices: "0" + default_sampling_params: + seed: 52 + +connectors: + shared_memory_connector: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + + rdma_connector: + name: MooncakeTransferEngineConnector + extra: + host: "auto" + zmq_port: 50051 + protocol: "rdma" + device_name: "" + memory_pool_size: 4294967296 + memory_pool_device: "cpu" + +platforms: + xpu: + stages: + - stage_id: 0 + gpu_memory_utilization: 0.9 diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml new file mode 100644 index 00000000000..d3caf808eca --- /dev/null +++ b/vllm_omni/deploy/bagel_single_stage.yaml @@ -0,0 +1,22 @@ +# BAGEL-7B-MoT single-stage deploy: all modalities handled by the DiT stage. +# +# The DiT stage contains a full LLM (Qwen2-MoT), ViT, VAE, and tokenizer, +# so it supports text2img, img2img, img2text, text2text, and think mode +# without a separate Thinker (AR) stage. 
+# +# Select this topology via: +# vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ +# --deploy-config vllm_omni/deploy/bagel_single_stage.yaml +# +# Or programmatically: +# Omni(model="...", deploy_config_path="vllm_omni/deploy/bagel_single_stage.yaml") + +pipeline: bagel_single_stage + +stages: + - stage_id: 0 + max_num_seqs: 1 + enforce_eager: true + devices: "0" + default_sampling_params: + seed: 52 diff --git a/vllm_omni/model_executor/models/bagel/pipeline.py b/vllm_omni/model_executor/models/bagel/pipeline.py new file mode 100644 index 00000000000..a9c24cc914f --- /dev/null +++ b/vllm_omni/model_executor/models/bagel/pipeline.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""BAGEL-7B-MoT pipeline topologies (frozen). + +Two-stage (default): + Stage 0: Thinker — multimodal understanding + text generation (AR) + Stage 1: DiT — diffusion image generation + +Single-stage: + Stage 0: DiT — self-contained diffusion stage that handles all modalities + (text2img, img2img, img2text, text2text, think) internally via its + own LLM, ViT, VAE, and tokenizer. 
+""" + +from vllm_omni.config.stage_config import ( + PipelineConfig, + StageExecutionType, + StagePipelineConfig, +) + +_PROC = "vllm_omni.model_executor.stage_input_processors.bagel" + +BAGEL_PIPELINE = PipelineConfig( + model_type="bagel", + model_arch="OmniBagelForConditionalGeneration", + hf_architectures=("BagelForConditionalGeneration",), + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="thinker", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + final_output=True, + final_output_type="text", + owns_tokenizer=True, + requires_multimodal_data=True, + model_arch="OmniBagelForConditionalGeneration", + engine_output_type="text", + prompt_expand_func=f"{_PROC}.expand_cfg_prompts", + omni_kv_config={ + "need_send_cache": True, + "kv_transfer_criteria": {"type": "prefill_finished"}, + }, + sampling_constraints={"detokenize": True}, + ), + StagePipelineConfig( + stage_id=1, + model_stage="dit", + execution_type=StageExecutionType.DIFFUSION, + input_sources=(0,), + final_output=True, + final_output_type="image", + cfg_kv_collect_func=f"{_PROC}.collect_cfg_kv_caches", + omni_kv_config={"need_recv_cache": True}, + ), + ), +) + +BAGEL_SINGLE_STAGE_PIPELINE = PipelineConfig( + model_type="bagel_single_stage", + model_arch="BagelForConditionalGeneration", + hf_architectures=("BagelForConditionalGeneration",), + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="dit", + execution_type=StageExecutionType.DIFFUSION, + input_sources=(), + final_output=True, + final_output_type="image", + ), + ), +) diff --git a/vllm_omni/model_executor/stage_configs/bagel.yaml b/vllm_omni/model_executor/stage_configs/bagel.yaml deleted file mode 100644 index 75f7c8a0637..00000000000 --- a/vllm_omni/model_executor/stage_configs/bagel.yaml +++ /dev/null @@ -1,102 +0,0 @@ -# Stage 0: Thinker (multimodal understanding + text generation) - -# By default this config uses the shared-memory connector for stage-0 -> stage-1 forwarding. 
-# To switch to RDMA, add output_connectors/input_connectors that point to -# rdma_connector and keep the rest of the pipeline unchanged. - -stage_args: - - stage_id: 0 - stage_type: llm - prompt_expand_func: vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts - runtime: - devices: "0" - # 3 = 1 user prompt + 2 CFG companions (text-unconditional + image-unconditional). - engine_args: - model_stage: thinker - max_num_seqs: 3 - model_arch: OmniBagelForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.45 - enforce_eager: true - trust_remote_code: true - engine_output_type: text - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - omni_kv_config: - need_send_cache: true - kv_transfer_criteria: - type: prefill_finished #or special token generated - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 52 - detokenize: True - repetition_penalty: 1.05 - # Optional RDMA override: - # output_connectors: - # to_stage_1: rdma_connector - - - stage_id: 1 - stage_type: diffusion - cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches - runtime: - devices: "0" - engine_args: - model_stage: dit - max_num_seqs: 1 - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - omni_kv_config: - need_recv_cache: true - engine_input_source: [0] - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 52 - # Optional RDMA override: - # input_connectors: - # from_stage_0: rdma_connector - -# Runtime edges -runtime: - enabled: true - # Distributed connectors configuration (optional) - # More connectors will be supported in the future. 
- connectors: - shared_memory_connector: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 # 64KB threshold - - # Optional RDMA connector template for Bagel. To enable it, point - # stage-0 output_connectors/to_stage_1 and stage-1 input_connectors/from_stage_0 - # to rdma_connector instead of relying on the default shared-memory path. - rdma_connector: - name: MooncakeTransferEngineConnector - extra: - host: "auto" - zmq_port: 50051 - protocol: "rdma" - device_name: "" - # Memory pool for RDMA-registered buffers. - # Supports both CPU pinned memory ("cpu") and GPU VRAM ("cuda"). - # CPU mode works on all topologies; GPU mode (GPUDirect RDMA) requires - # NIC-GPU direct PCIe connectivity (PIX topology). - # Recommended: 4 GB for CPU, 2 GB for GPU (to conserve VRAM). - memory_pool_size: 4294967296 # 4 GB - memory_pool_device: "cpu" - - edges: - - from: 0 - to: 1 diff --git a/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml deleted file mode 100644 index 7a0d851f0fd..00000000000 --- a/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml +++ /dev/null @@ -1,102 +0,0 @@ -# Stage 0: Thinker (multimodal understanding + text generation) - -stage_args: - - stage_id: 0 - stage_type: llm - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: BagelForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.45 - enforce_eager: true - trust_remote_code: true - engine_output_type: text - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - omni_kv_config: - need_send_cache: true - kv_transfer_criteria: - type: prefill_finished #or special token generated - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - 
top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 52 - detokenize: True - repetition_penalty: 1.05 - output_connectors: - to_stage_1: mooncake_connector - - - - stage_id: 1 - stage_type: diffusion - runtime: - devices: "0" - engine_args: - model_stage: dit - max_num_seqs: 1 - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - omni_kv_config: - need_recv_cache: true - engine_input_source: [0] - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 52 - input_connectors: - from_stage_0: mooncake_connector - - -# Runtime edges -runtime: - enabled: true - # Distributed connectors configuration (optional) - # More connectors will be supported in the future. - connectors: - # Mooncake connector for cross-node/intra-node communication - mooncake_connector: - name: MooncakeStoreConnector - extra: - host: "127.0.0.1" - metadata_server: "http://10.90.67.86:8080/metadata" - master: "10.90.67.86:50051" - segment: 512000000 # 512MB - localbuf: 64000000 # 64MB - proto: "tcp" - - # PR1 (#1019) note: - # - Keep this transfer-engine connector config as a ready-to-use template. - # - Bagel does NOT consume this connector in PR1(#1019). - # - output_connectors/input_connectors above still point to mooncake_connector. - # - We will switch Bagel to this connector in the next PR. - rdma_connector: - name: MooncakeTransferEngineConnector - extra: - # NOTE: - # - role/sender_host/sender_zmq_port are internal fields resolved by - # orchestration logic and should not be set in user YAML. - host: "auto" # Auto-detect local IP for RDMA - zmq_port: 50051 # ZMQ base port (actual port uses runtime offsets) - protocol: "rdma" - device_name: "" # e.g. 
"mlx5_0"; empty for auto-detect - memory_pool_size: 2147483648 # 2GB - memory_pool_device: "cpu" # "cuda" for GPUDirect RDMA, "cpu" for pinned memory - - - edges: - - from: 0 - to: 1 diff --git a/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml b/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml deleted file mode 100644 index b2d4b07b13b..00000000000 --- a/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Stage 0: Thinker (multimodal understanding + text generation) - -stage_args: - - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0" - engine_args: - model_stage: dit - max_num_seqs: 1 - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 52 - -# Runtime edges -runtime: - enabled: true diff --git a/vllm_omni/model_executor/stage_configs/bagel_think.yaml b/vllm_omni/model_executor/stage_configs/bagel_think.yaml deleted file mode 100644 index 2575e6736dd..00000000000 --- a/vllm_omni/model_executor/stage_configs/bagel_think.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# BAGEL Think Model: AR stage decodes thinking tokens before KV transfer to DiT. -# -# Differences from bagel.yaml: -# - No kv_transfer_criteria: AR stage decodes until EOS, then transfers full -# KV cache (including thinking tokens) via _free_request path. -# - prompt_expand_func: uses expand_cfg_prompts_think which sets max_tokens=1 -# on companion requests so they stop immediately after prefill. -# - max_tokens: 2048 for thinking text generation. 
- -stage_args: - - stage_id: 0 - stage_type: llm - prompt_expand_func: vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts_think - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 3 - model_arch: OmniBagelForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.45 - enforce_eager: true - trust_remote_code: true - engine_output_type: text - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - omni_kv_config: - need_send_cache: true - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.3 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 52 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: diffusion - cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches - runtime: - devices: "0" - engine_args: - model_stage: dit - max_num_seqs: 1 - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - omni_kv_config: - need_recv_cache: true - engine_input_source: [0] - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 52 - -# Runtime edges -runtime: - enabled: true - - connectors: - shared_memory_connector: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - - edges: - - from: 0 - to: 1 diff --git a/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml b/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml deleted file mode 100644 index 4599f8b059c..00000000000 --- a/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml +++ /dev/null @@ -1,72 +0,0 @@ -# Stage config for BAGEL SP: ulysses=2 (2 GPUs) - -stage_args: - - stage_id: 0 - stage_type: llm - prompt_expand_func: 
vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts - runtime: - devices: "0" - max_batch_size: 1 - engine_args: - model_stage: thinker - model_arch: OmniBagelForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.45 - enforce_eager: true - trust_remote_code: true - engine_output_type: text - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - omni_kv_config: - need_send_cache: true - kv_transfer_criteria: - type: prefill_finished - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 52 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: diffusion - cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches - runtime: - # devices: "0,1,2,3" - devices: "0,1" - max_batch_size: 1 - engine_args: - model_stage: dit - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - parallel_config: - ulysses_degree: 2 - # ring_degree: 2 - omni_kv_config: - need_recv_cache: true - engine_input_source: [0] - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 52 - -runtime: - enabled: true - connectors: - shared_memory_connector: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - edges: - - from: 0 - to: 1 diff --git a/vllm_omni/platforms/xpu/stage_configs/bagel.yaml b/vllm_omni/platforms/xpu/stage_configs/bagel.yaml deleted file mode 100644 index 7b27f6a443a..00000000000 --- a/vllm_omni/platforms/xpu/stage_configs/bagel.yaml +++ /dev/null @@ -1,81 +0,0 @@ -# stage config for running bagel-7b-mot with architecture of OmniLLM. 
- -stage_args: - - stage_id: 0 - stage_type: llm - prompt_expand_func: vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts - runtime: - devices: "0" - engine_args: - # 3 = 1 user prompt + 2 CFG companions (text-unconditional + image-unconditional). - max_num_seqs: 3 - model_stage: thinker - model_arch: OmniBagelForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: true - trust_remote_code: true - engine_output_type: text - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 16384 - tensor_parallel_size: 1 - quantization: fp8 - omni_kv_config: - need_send_cache: true - kv_transfer_criteria: - type: prefill_finished #or special token generated - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 52 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: diffusion - cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches - runtime: - devices: "1" - engine_args: - max_num_seqs: 1 - model_stage: dit - gpu_memory_utilization: 0.9 - enforce_eager: true - trust_remote_code: true - engine_output_type: image - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - omni_kv_config: - need_recv_cache: true - engine_input_source: [0] - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 52 - -# Runtime edges -runtime: - enabled: true - # Distributed connectors configuration (optional) - # More connectors will be supported in the future. 
- connectors: - shared_memory_connector: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 # 64KB threshold - - - edges: - - from: 0 - to: 1 From cc4667c93a729f86f9686b221552f980c66f5c8d Mon Sep 17 00:00:00 2001 From: princepride Date: Mon, 20 Apr 2026 08:07:57 +0000 Subject: [PATCH 02/13] fix some bug Signed-off-by: princepride --- vllm_omni/deploy/bagel.yaml | 2 ++ vllm_omni/deploy/bagel_single_stage.yaml | 1 + 2 files changed, 3 insertions(+) diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 13212d98fab..0f5577dc961 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -6,6 +6,8 @@ # Fields omitted from a stage fall back to StageDeployConfig dataclass # defaults (see vllm_omni/config/stage_config.py). +async_chunk: false + stages: - stage_id: 0 max_num_seqs: 3 diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml index d3caf808eca..055ad1ec5f9 100644 --- a/vllm_omni/deploy/bagel_single_stage.yaml +++ b/vllm_omni/deploy/bagel_single_stage.yaml @@ -12,6 +12,7 @@ # Omni(model="...", deploy_config_path="vllm_omni/deploy/bagel_single_stage.yaml") pipeline: bagel_single_stage +async_chunk: false stages: - stage_id: 0 From f79f2c1a7ea96dbe4c5c7a55971865d765ccf57e Mon Sep 17 00:00:00 2001 From: princepride Date: Mon, 20 Apr 2026 08:25:35 +0000 Subject: [PATCH 03/13] fix some bug Signed-off-by: princepride --- tests/e2e/offline_inference/test_bagel_lora.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py index aa4a2cd12bd..45c238a116e 100644 --- a/tests/e2e/offline_inference/test_bagel_lora.py +++ b/tests/e2e/offline_inference/test_bagel_lora.py @@ -187,9 +187,9 @@ def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): f"LoRA scale has no effect: diff_1x={diff_1x:.2f}, diff_2x={diff_2x:.2f}" ) - # (c) Output is not corrupted + # (c) 
Output is not corrupted (scale=2.0 can produce ~2x the diff of scale=1.0) assert diff_1x < 80, f"LoRA output looks corrupted: diff_1x={diff_1x}" - assert diff_2x < 80, f"LoRA output looks corrupted: diff_2x={diff_2x}" + assert diff_2x < 160, f"LoRA output looks corrupted: diff_2x={diff_2x}" # (d) Deactivation fully restores base model assert diff_restored == 0.0, f"Base model not restored after LoRA deactivation: diff={diff_restored}" From 2f652a34a87ce62b64ef63bd64ce6c9c49bf1d67 Mon Sep 17 00:00:00 2001 From: princepride Date: Mon, 20 Apr 2026 13:06:29 +0000 Subject: [PATCH 04/13] Address review findings: think pipeline, mooncake bindings, XPU overrides - Add BAGEL_THINK_PIPELINE with expand_cfg_prompts_think and no kv_transfer_criteria so Thinker decodes think tokens before KV transfer - Create bagel_think.yaml (inherits bagel.yaml, sets pipeline: bagel_think) - Restore --think auto-select in end2end.py - Fix Mooncake CI overlay: add output_connectors/input_connectors on stages - Complete XPU platform overrides (fp8, max_num_batched_tokens, stage 1 GPU) - Set hf_architectures=() on single-stage and think pipelines - Remove non-existent --tensor-parallel-size from offline READMEs - Clarify --deploy-config vs deprecated --stage-configs-path in online READMEs - Remove enforce_eager from all stage 0 in bagel deploy YAMLs - Tighten LoRA diff_2x threshold from 160 to 120 - Add explanatory comment for single-stage detection heuristic Signed-off-by: princepride Made-with: Cursor --- .../examples/offline_inference/bagel.md | 16 ++++++-- .../examples/online_serving/bagel.md | 2 +- examples/offline_inference/bagel/README.md | 16 ++++++-- examples/offline_inference/bagel/end2end.py | 6 +++ examples/online_serving/bagel/README.md | 2 +- .../e2e/offline_inference/test_bagel_lora.py | 2 +- tests/helpers/stage_config.py | 18 +++++++++ vllm_omni/config/pipeline_registry.py | 4 ++ vllm_omni/deploy/bagel.yaml | 6 ++- vllm_omni/deploy/bagel_single_stage.yaml | 1 - 
vllm_omni/deploy/bagel_think.yaml | 16 ++++++++ .../model_executor/models/bagel/pipeline.py | 40 ++++++++++++++++++- 12 files changed, 115 insertions(+), 14 deletions(-) create mode 100644 vllm_omni/deploy/bagel_think.yaml diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md index a1b41691916..0c3730e93b0 100644 --- a/docs/user_guide/examples/offline_inference/bagel.md +++ b/docs/user_guide/examples/offline_inference/bagel.md @@ -187,17 +187,25 @@ See [`bagel_single_stage.yaml`](https://github.com/vllm-project/vllm-omni/tree/m ### Tensor Parallelism (TP) -For larger models or multi-GPU environments: +For larger models or multi-GPU environments, customize the deploy YAML (see [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml)) and set per-stage `tensor_parallel_size` and `devices`: + +```yaml +# Example: TP=2 on GPUs 0,1 for the Thinker stage +stages: + - stage_id: 0 + tensor_parallel_size: 2 + devices: "0,1" +``` + +Then pass the custom deploy YAML: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --tensor-parallel-size 2 + --stage-configs-path /path/to/custom_bagel.yaml ``` -Or customize the deploy YAML (see [`bagel.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel.yaml)) with per-stage `tensor_parallel_size`. 
- ### FP8 Quantization ```bash diff --git a/docs/user_guide/examples/online_serving/bagel.md b/docs/user_guide/examples/online_serving/bagel.md index 78aa4f21bec..0b713e428c4 100644 --- a/docs/user_guide/examples/online_serving/bagel.md +++ b/docs/user_guide/examples/online_serving/bagel.md @@ -36,7 +36,7 @@ cd examples/online_serving/bagel bash run_server.sh ``` -To use a custom deploy YAML: +To use a custom deploy YAML (note: `--stage-configs-path` is deprecated in favor of `--deploy-config`): ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ diff --git a/examples/offline_inference/bagel/README.md b/examples/offline_inference/bagel/README.md index 62d1be144a5..b8410a5bca8 100644 --- a/examples/offline_inference/bagel/README.md +++ b/examples/offline_inference/bagel/README.md @@ -185,17 +185,25 @@ See [`bagel_single_stage.yaml`](../../../vllm_omni/deploy/bagel_single_stage.yam ### Tensor Parallelism (TP) -For larger models or multi-GPU environments: +For larger models or multi-GPU environments, customize the deploy YAML (see [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml)) and set per-stage `tensor_parallel_size` and `devices`: + +```yaml +# Example: TP=2 on GPUs 0,1 for the Thinker stage +stages: + - stage_id: 0 + tensor_parallel_size: 2 + devices: "0,1" +``` + +Then pass the custom deploy YAML: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --tensor-parallel-size 2 + --stage-configs-path /path/to/custom_bagel.yaml ``` -Or customize the deploy YAML (see [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml)) with per-stage `tensor_parallel_size`. 
- ### FP8 Quantization ```bash diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 8ce09c5d617..4c84d2356bb 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -147,6 +147,9 @@ def main(): omni_kwargs = {} stage_configs_path = args.stage_configs_path + if args.think and stage_configs_path is None: + stage_configs_path = "vllm_omni/deploy/bagel_think.yaml" + print(f"[Info] Think mode enabled, using deploy config: {stage_configs_path}") if stage_configs_path: omni_kwargs["stage_configs_path"] = stage_configs_path @@ -210,6 +213,9 @@ def main(): formatted_prompts.append(prompt_dict) params_list = omni.default_sampling_params_list + # Bagel exposes 1 sampling param set for single-stage (DiT-only) and + # 2 for two-stage (Thinker + DiT). This heuristic may need updating + # if future pipelines break that 1:1 mapping. is_single_stage = len(params_list) == 1 diffusion_params_idx = 0 if is_single_stage else (1 if len(params_list) > 1 else 0) diff --git a/examples/online_serving/bagel/README.md b/examples/online_serving/bagel/README.md index 7c1fa13569a..d134072ca24 100644 --- a/examples/online_serving/bagel/README.md +++ b/examples/online_serving/bagel/README.md @@ -34,7 +34,7 @@ cd examples/online_serving/bagel bash run_server.sh ``` -To use a custom deploy YAML: +To use a custom deploy YAML (note: `--stage-configs-path` is deprecated in favor of `--deploy-config`): ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py index 45c238a116e..785d0c7fb8f 100644 --- a/tests/e2e/offline_inference/test_bagel_lora.py +++ b/tests/e2e/offline_inference/test_bagel_lora.py @@ -189,7 +189,7 @@ def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): # (c) Output is not corrupted (scale=2.0 can produce ~2x the diff of scale=1.0) assert diff_1x < 
80, f"LoRA output looks corrupted: diff_1x={diff_1x}" - assert diff_2x < 160, f"LoRA output looks corrupted: diff_2x={diff_2x}" + assert diff_2x < 120, f"LoRA output looks corrupted: diff_2x={diff_2x}" # (d) Deactivation fully restores base model assert diff_restored == 0.0, f"Base model not restored after LoRA deactivation: diff={diff_restored}" diff --git a/tests/helpers/stage_config.py b/tests/helpers/stage_config.py index 9f416351406..2bb017b811f 100644 --- a/tests/helpers/stage_config.py +++ b/tests/helpers/stage_config.py @@ -427,6 +427,22 @@ def delete_by_path(config_dict: dict, path: str) -> None: }, ], }, + "bagel_think": { + "base_config": "bagel_think.yaml", + "stages": [ + { + "stage_id": 0, + "max_num_seqs": 3, + "gpu_memory_utilization": 0.45, + "load_format": "dummy", + }, + { + "stage_id": 1, + "max_num_seqs": 1, + "load_format": "dummy", + }, + ], + }, "bagel_single_stage": { "base_config": "bagel_single_stage.yaml", "stages": [ @@ -445,11 +461,13 @@ def delete_by_path(config_dict: dict, path: str) -> None: "max_num_seqs": 1, "gpu_memory_utilization": 0.45, "load_format": "dummy", + "output_connectors": {"to_stage_1": "mooncake_connector"}, }, { "stage_id": 1, "max_num_seqs": 1, "load_format": "dummy", + "input_connectors": {"from_stage_0": "mooncake_connector"}, }, ], "connectors": { diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py index c5cf716da38..e70cb17fcaf 100644 --- a/vllm_omni/config/pipeline_registry.py +++ b/vllm_omni/config/pipeline_registry.py @@ -47,6 +47,10 @@ "vllm_omni.model_executor.models.bagel.pipeline", "BAGEL_PIPELINE", ), + "bagel_think": ( + "vllm_omni.model_executor.models.bagel.pipeline", + "BAGEL_THINK_PIPELINE", + ), "bagel_single_stage": ( "vllm_omni.model_executor.models.bagel.pipeline", "BAGEL_SINGLE_STAGE_PIPELINE", diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 0f5577dc961..1a772b03498 100644 --- a/vllm_omni/deploy/bagel.yaml +++ 
b/vllm_omni/deploy/bagel.yaml @@ -12,7 +12,6 @@ stages: - stage_id: 0 max_num_seqs: 3 gpu_memory_utilization: 0.45 - enforce_eager: true devices: "0" default_sampling_params: temperature: 0.4 @@ -51,3 +50,8 @@ platforms: stages: - stage_id: 0 gpu_memory_utilization: 0.9 + max_num_batched_tokens: 16384 + quantization: fp8 + - stage_id: 1 + gpu_memory_utilization: 0.9 + devices: "1" diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml index 055ad1ec5f9..8470124ec78 100644 --- a/vllm_omni/deploy/bagel_single_stage.yaml +++ b/vllm_omni/deploy/bagel_single_stage.yaml @@ -17,7 +17,6 @@ async_chunk: false stages: - stage_id: 0 max_num_seqs: 1 - enforce_eager: true devices: "0" default_sampling_params: seed: 52 diff --git a/vllm_omni/deploy/bagel_think.yaml b/vllm_omni/deploy/bagel_think.yaml new file mode 100644 index 00000000000..d7adf7f66b3 --- /dev/null +++ b/vllm_omni/deploy/bagel_think.yaml @@ -0,0 +1,16 @@ +# BAGEL-7B-MoT think-mode deploy. +# +# Inherits all settings from bagel.yaml; only overrides the pipeline to +# bagel_think which uses expand_cfg_prompts_think and omits +# kv_transfer_criteria so the Thinker decodes tokens before +# transferring KV to DiT. +# +# Select this topology via: +# python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT --think +# +# Or explicitly: +# vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni \ +# --deploy-config vllm_omni/deploy/bagel_think.yaml + +base_config: bagel.yaml +pipeline: bagel_think diff --git a/vllm_omni/model_executor/models/bagel/pipeline.py b/vllm_omni/model_executor/models/bagel/pipeline.py index a9c24cc914f..c68a531c294 100644 --- a/vllm_omni/model_executor/models/bagel/pipeline.py +++ b/vllm_omni/model_executor/models/bagel/pipeline.py @@ -6,6 +6,11 @@ Stage 0: Thinker — multimodal understanding + text generation (AR) Stage 1: DiT — diffusion image generation +Two-stage think: + Same as two-stage but the Thinker decodes ... tokens before + KV transfer. 
Uses expand_cfg_prompts_think (companion max_tokens=1) and + omits kv_transfer_criteria so transfer happens after EOS, not after prefill. + Single-stage: Stage 0: DiT — self-contained diffusion stage that handles all modalities (text2img, img2img, img2text, text2text, think) internally via its @@ -56,10 +61,43 @@ ), ) +BAGEL_THINK_PIPELINE = PipelineConfig( + model_type="bagel_think", + model_arch="OmniBagelForConditionalGeneration", + hf_architectures=(), + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="thinker", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + final_output=True, + final_output_type="text", + owns_tokenizer=True, + requires_multimodal_data=True, + model_arch="OmniBagelForConditionalGeneration", + engine_output_type="text", + prompt_expand_func=f"{_PROC}.expand_cfg_prompts_think", + omni_kv_config={"need_send_cache": True}, + sampling_constraints={"detokenize": True}, + ), + StagePipelineConfig( + stage_id=1, + model_stage="dit", + execution_type=StageExecutionType.DIFFUSION, + input_sources=(0,), + final_output=True, + final_output_type="image", + cfg_kv_collect_func=f"{_PROC}.collect_cfg_kv_caches", + omni_kv_config={"need_recv_cache": True}, + ), + ), +) + BAGEL_SINGLE_STAGE_PIPELINE = PipelineConfig( model_type="bagel_single_stage", model_arch="BagelForConditionalGeneration", - hf_architectures=("BagelForConditionalGeneration",), + hf_architectures=(), stages=( StagePipelineConfig( stage_id=0, From 01ebc5d78b13d72dce27256b5f0a46c7379c25e0 Mon Sep 17 00:00:00 2001 From: princepride Date: Mon, 20 Apr 2026 13:44:51 +0000 Subject: [PATCH 05/13] fix some bug Signed-off-by: princepride --- vllm_omni/model_executor/models/bagel/bagel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py index cbb775680cc..c9c986e4208 100644 --- a/vllm_omni/model_executor/models/bagel/bagel.py +++ 
b/vllm_omni/model_executor/models/bagel/bagel.py @@ -581,6 +581,9 @@ def flush_pending_metadata(self, req_ids: list[str]) -> None: existing = self._ropes_metadata.get(rid) if existing and "image_shape" in existing and "image_shape" not in meta: continue + ropes = meta.get("ropes") + if ropes: + meta["ropes"] = [int(r.item()) if isinstance(r, torch.Tensor) else r for r in ropes] self._ropes_metadata[rid] = meta def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: @@ -724,7 +727,7 @@ def forward( positions = self._adjust_positions_for_img2img(positions, input_ids) use_mot = True else: - rope = int(positions[seq_len - 1].item()) + 1 + rope = positions[seq_len - 1] + 1 self._ropes_pending.append({"ropes": [rope]}) if use_mot: From e3aa65e920c3c59135b16d6419e77622ceec03a4 Mon Sep 17 00:00:00 2001 From: princepride Date: Tue, 21 Apr 2026 06:14:50 +0000 Subject: [PATCH 06/13] =?UTF-8?q?fix:=20restore=20KV=20transfer=20fields?= =?UTF-8?q?=20lost=20during=20PipelineConfig=E2=86=92StageConfig=20merge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit merge_pipeline_deploy silently dropped omni_kv_config, prompt_expand_func, and cfg_kv_collect_func when building StageConfig from the pipeline registry, breaking multi-stage KV transfer for Bagel img2img. 
- stage_config: propagate omni_kv_config/prompt_expand_func/cfg_kv_collect_func - bagel.yaml: declare input_connectors so transfer config discovers the edge - stage_init_utils: resolve base_config inheritance before parsing connectors - utils: add deploy/ as fallback in resolve_model_config_path Made-with: Cursor Signed-off-by: princepride --- vllm_omni/config/stage_config.py | 6 ++++++ vllm_omni/deploy/bagel.yaml | 2 ++ vllm_omni/engine/stage_init_utils.py | 14 ++++++++++++-- vllm_omni/entrypoints/utils.py | 4 ++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 392a550be68..0a53ea42f8c 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -717,6 +717,8 @@ def _build_engine_args( engine_args.update(ds.engine_extras) if deploy.async_chunk: engine_args["async_chunk"] = True + if ps.omni_kv_config: + engine_args["omni_kv_config"] = dict(ps.omni_kv_config) return engine_args @@ -736,6 +738,10 @@ def _build_extras( extras["output_connectors"] = dict(ds.output_connectors) if ds is not None and ds.input_connectors: extras["input_connectors"] = dict(ds.input_connectors) + if ps.prompt_expand_func: + extras["prompt_expand_func"] = ps.prompt_expand_func + if ps.cfg_kv_collect_func: + extras["cfg_kv_collect_func"] = ps.cfg_kv_collect_func if ps.extras: extras.update(ps.extras) return extras diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 1a772b03498..a10dfc2e680 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -26,6 +26,8 @@ stages: max_num_seqs: 1 enforce_eager: true devices: "0" + input_connectors: + from_stage_0: shared_memory_connector default_sampling_params: seed: 52 diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 89dfdc163cc..3b2348d8ee2 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -590,12 
+590,22 @@ def release_device_locks(lock_fds: list[int]) -> None: def load_omni_transfer_config_for_model(model: str, config_path: str | None) -> Any: - """Load omni transfer config from an explicit path or resolved model config.""" + """Load omni transfer config from an explicit path or resolved model config. + + Resolves ``base_config`` inheritance (CI overlay → base deploy YAML) so + that connectors defined in the base config are visible to the transfer + config parser. + """ from vllm_omni.distributed.omni_connectors import load_omni_transfer_config try: resolved_config_path = config_path or resolve_model_config_path(model) - return load_omni_transfer_config(resolved_config_path) + if resolved_config_path is None: + return None + from vllm_omni.config.stage_config import resolve_deploy_yaml + + resolved_dict = resolve_deploy_yaml(resolved_config_path) + return load_omni_transfer_config(config_dict=resolved_dict) except Exception as e: logger.warning("[stage_init] Failed to load transfer config: %s", e) return None diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index 5757d389900..dc35c86f64e 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -328,6 +328,10 @@ def resolve_model_config_path(model: str) -> str: if os.path.exists(complete_config_path): return str(complete_config_path) + deploy_config_path = PROJECT_ROOT / "vllm_omni" / "deploy" / model_type_str + if os.path.exists(deploy_config_path): + return str(deploy_config_path) + stage_config_file = f"vllm_omni/model_executor/stage_configs/{normalized_model_type}.yaml" stage_config_path = PROJECT_ROOT / stage_config_file if not os.path.exists(stage_config_path): From d3ef6af810b14efb64fd53f2b40f7a55542a9710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 21 Apr 2026 22:37:10 +0800 Subject: [PATCH 07/13] Add bagel_single_stage pipeline to registry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- vllm_omni/config/pipeline_registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py index 4636a11c6b9..699f146cae0 100644 --- a/vllm_omni/config/pipeline_registry.py +++ b/vllm_omni/config/pipeline_registry.py @@ -54,6 +54,7 @@ "bagel_single_stage": ( "vllm_omni.model_executor.models.bagel.pipeline", "BAGEL_SINGLE_STAGE_PIPELINE", + ), "voxcpm2": ( "vllm_omni.model_executor.models.voxcpm2.pipeline", "VOXCPM2_PIPELINE", From 5629b7c90bfcf6ba2cc93a72f660eff42aef0e9c Mon Sep 17 00:00:00 2001 From: princepride Date: Wed, 22 Apr 2026 02:03:02 +0000 Subject: [PATCH 08/13] change --stage-configs-path to --deploy-config Signed-off-by: princepride --- .../examples/offline_inference/bagel.md | 11 ++++---- examples/offline_inference/bagel/end2end.py | 27 +++++++++++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md index 0c3730e93b0..0d3498b28d9 100644 --- a/docs/user_guide/examples/offline_inference/bagel.md +++ b/docs/user_guide/examples/offline_inference/bagel.md @@ -31,7 +31,7 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --stage-configs-path vllm_omni/deploy/bagel_single_stage.yaml + --deploy-config vllm_omni/deploy/bagel_single_stage.yaml ``` > **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. 
@@ -174,13 +174,13 @@ The pipeline is defined in [`bagel.yaml`](https://github.com/vllm-project/vllm-o ### Single-Stage -Pass the single-stage deploy config via `--stage-configs-path`: +Pass the single-stage deploy config via `--deploy-config`: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --stage-configs-path vllm_omni/deploy/bagel_single_stage.yaml + --deploy-config vllm_omni/deploy/bagel_single_stage.yaml ``` See [`bagel_single_stage.yaml`](https://github.com/vllm-project/vllm-omni/tree/main/vllm_omni/deploy/bagel_single_stage.yaml) for configuration details. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. @@ -203,7 +203,7 @@ Then pass the custom deploy YAML: python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --stage-configs-path /path/to/custom_bagel.yaml + --deploy-config /path/to/custom_bagel.yaml ``` ### FP8 Quantization @@ -255,7 +255,8 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ | Argument | Type | Default | Description | | :------- | :--- | :------ | :---------- | -| `--stage-configs-path` | string | `None` | Path to deploy YAML (auto-detected if omitted) | +| `--deploy-config` | string | `None` | Path to deploy YAML (auto-detected if omitted) | +| `--stage-configs-path` | string | `None` | [Deprecated] Legacy path to `stage_args` YAML; prefer `--deploy-config` | | `--worker-backend` | choice | `process` | `process` or `ray` | | `--ray-address` | string | `None` | Ray cluster address | | `--quantization` | string | `None` | Quantization method (e.g. 
`fp8`) | diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 4c84d2356bb..4d298a63e9d 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -53,7 +53,19 @@ def parse_args(): parser.add_argument("--shm-threshold-bytes", type=int, default=65536) parser.add_argument("--worker-backend", type=str, default="process", choices=["process", "ray"]) parser.add_argument("--ray-address", type=str, default=None) - parser.add_argument("--stage-configs-path", type=str, default=None) + parser.add_argument( + "--deploy-config", + type=str, + default=None, + help="Path to deploy YAML (new format). If unset, auto-loads " + "vllm_omni/deploy/bagel.yaml based on the HF model_type.", + ) + parser.add_argument( + "--stage-configs-path", + type=str, + default=None, + help="[Deprecated] Legacy path to stage_args YAML. Prefer --deploy-config for new-format YAMLs.", + ) parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.") parser.add_argument("--cfg-text-scale", type=float, default=4.0, help="Text CFG scale (default: 4.0)") @@ -146,11 +158,16 @@ def main(): from vllm_omni.entrypoints.omni import Omni omni_kwargs = {} + if args.deploy_config is not None and args.stage_configs_path is not None: + raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive; prefer --deploy-config.") + deploy_config = args.deploy_config stage_configs_path = args.stage_configs_path - if args.think and stage_configs_path is None: - stage_configs_path = "vllm_omni/deploy/bagel_think.yaml" - print(f"[Info] Think mode enabled, using deploy config: {stage_configs_path}") - if stage_configs_path: + if args.think and deploy_config is None and stage_configs_path is None: + deploy_config = "vllm_omni/deploy/bagel_think.yaml" + print(f"[Info] Think mode enabled, using deploy config: {deploy_config}") + if deploy_config: + omni_kwargs["deploy_config"] = 
deploy_config + elif stage_configs_path: omni_kwargs["stage_configs_path"] = stage_configs_path omni_kwargs.update( From d4bd9cd5f94105613bddd547855ccefb38b9d32b Mon Sep 17 00:00:00 2001 From: princepride Date: Wed, 22 Apr 2026 07:21:39 +0000 Subject: [PATCH 09/13] fix some bug Signed-off-by: princepride --- tests/e2e/offline_inference/test_quantization_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/e2e/offline_inference/test_quantization_fp8.py b/tests/e2e/offline_inference/test_quantization_fp8.py index f92281e780c..9801e0ae797 100644 --- a/tests/e2e/offline_inference/test_quantization_fp8.py +++ b/tests/e2e/offline_inference/test_quantization_fp8.py @@ -30,7 +30,6 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -from pathlib import Path from typing import Any import pytest From ed271e14046ecf6774e67d163c6cd46eb5932c2f Mon Sep 17 00:00:00 2001 From: princepride Date: Wed, 22 Apr 2026 09:33:26 +0000 Subject: [PATCH 10/13] fix some bug Signed-off-by: princepride --- tests/e2e/offline_inference/test_omni_sleep_mode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/offline_inference/test_omni_sleep_mode.py b/tests/e2e/offline_inference/test_omni_sleep_mode.py index fc4001ff596..5a3ae9ab728 100644 --- a/tests/e2e/offline_inference/test_omni_sleep_mode.py +++ b/tests/e2e/offline_inference/test_omni_sleep_mode.py @@ -38,8 +38,8 @@ def get_dynamic_devices(stage_idx, num_stages, tp_size): # Test 1: Diffusion Model (2-Stage BAGEL) @pytest.mark.advanced_model @pytest.mark.omni -@pytest.mark.parametrize("tp_size", [1, 2]) -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("tp_size", [1]) +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=1) @pytest.mark.asyncio async def test_diffusion_model_sleep_tp(tp_size): num_gpus = torch.cuda.device_count() From 463fe2c4ebeeef9f5432607e6b5ab39a927df0fc Mon Sep 17 00:00:00 2001 From: princepride Date: Wed, 22 Apr 2026 
09:42:39 +0000 Subject: [PATCH 11/13] change to use from_cli_args Signed-off-by: princepride --- examples/offline_inference/bagel/end2end.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 4d298a63e9d..aa6a2fbd76c 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -185,7 +185,7 @@ def main(): if args.quantization: omni_kwargs["quantization_config"] = args.quantization - omni = Omni(model=model_name, **omni_kwargs) + omni = Omni.from_cli_args(args, model=model_name, **omni_kwargs) formatted_prompts = [] for p in prompts: From 349ec52cc052055fbd2ff4280d4688790d95ac11 Mon Sep 17 00:00:00 2001 From: princepride Date: Wed, 22 Apr 2026 13:05:59 +0000 Subject: [PATCH 12/13] fix doc bug Signed-off-by: princepride --- docs/user_guide/examples/online_serving/glm_image.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user_guide/examples/online_serving/glm_image.md b/docs/user_guide/examples/online_serving/glm_image.md index f7027b906db..b67eed1e036 100644 --- a/docs/user_guide/examples/online_serving/glm_image.md +++ b/docs/user_guide/examples/online_serving/glm_image.md @@ -104,9 +104,9 @@ guides. When using `/v1/chat/completions`, pass these inside `extra_body` in the curl JSON, or via the `extra_body` keyword argument in the OpenAI Python SDK (see the -[Diffusion Chat API guide](../../../../serving/diffusion_chat_api.md)). -When using the dedicated [`/v1/images/generations`](../../../../serving/image_generation_api.md) -or [`/v1/images/edits`](../../../../serving/image_edit_api.md) endpoints, pass +[Diffusion Chat API guide](../../../serving/diffusion_chat_api.md)). 
+When using the dedicated [`/v1/images/generations`](../../../serving/image_generation_api.md) +or [`/v1/images/edits`](../../../serving/image_edit_api.md) endpoints, pass the supported generation controls as top-level fields directly. For image dimensions and count, use `size` and `n` rather than `height` or `width`. From 6951940fb1915aa6c5b2080eab242dedae860c73 Mon Sep 17 00:00:00 2001 From: princepride Date: Thu, 23 Apr 2026 07:52:43 +0000 Subject: [PATCH 13/13] remove --stage-configs-path Signed-off-by: princepride --- examples/offline_inference/bagel/README.md | 10 +++++----- examples/offline_inference/bagel/end2end.py | 16 ++-------------- examples/online_serving/bagel/README.md | 2 +- .../online_serving/bagel/run_server_stage_cli.sh | 14 +++++++------- vllm_omni/deploy/bagel.yaml | 11 ----------- 5 files changed, 15 insertions(+), 38 deletions(-) diff --git a/examples/offline_inference/bagel/README.md b/examples/offline_inference/bagel/README.md index b8410a5bca8..9955fd90db9 100644 --- a/examples/offline_inference/bagel/README.md +++ b/examples/offline_inference/bagel/README.md @@ -29,7 +29,7 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --stage-configs-path vllm_omni/deploy/bagel_single_stage.yaml + --deploy-config vllm_omni/deploy/bagel_single_stage.yaml ``` > **Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. For dual-GPU setups, modify the deploy YAML to distribute stages across devices. @@ -172,13 +172,13 @@ The pipeline is defined in [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml). 
### Single-Stage -Pass the single-stage deploy config via `--stage-configs-path`: +Pass the single-stage deploy config via `--deploy-config`: ```bash python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --stage-configs-path vllm_omni/deploy/bagel_single_stage.yaml + --deploy-config vllm_omni/deploy/bagel_single_stage.yaml ``` See [`bagel_single_stage.yaml`](../../../vllm_omni/deploy/bagel_single_stage.yaml) for configuration details. The `pipeline: bagel_single_stage` field selects the single-stage topology from the pipeline registry. @@ -201,7 +201,7 @@ Then pass the custom deploy YAML: python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ --modality text2img \ --prompts "A cute cat" \ - --stage-configs-path /path/to/custom_bagel.yaml + --deploy-config /path/to/custom_bagel.yaml ``` ### FP8 Quantization @@ -253,7 +253,7 @@ python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \ | Argument | Type | Default | Description | | :------- | :--- | :------ | :---------- | -| `--stage-configs-path` | string | `None` | Path to deploy YAML (auto-detected if omitted) | +| `--deploy-config` | string | `None` | Path to deploy YAML (auto-detected if omitted) | | `--worker-backend` | choice | `process` | `process` or `ray` | | `--ray-address` | string | `None` | Ray cluster address | | `--quantization` | string | `None` | Quantization method (e.g. `fp8`) | diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index aa6a2fbd76c..dfb07c3d376 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -57,14 +57,7 @@ def parse_args(): "--deploy-config", type=str, default=None, - help="Path to deploy YAML (new format). If unset, auto-loads " - "vllm_omni/deploy/bagel.yaml based on the HF model_type.", - ) - parser.add_argument( - "--stage-configs-path", - type=str, - default=None, - help="[Deprecated] Legacy path to stage_args YAML. 
Prefer --deploy-config for new-format YAMLs.", + help="Path to deploy YAML. If unset, auto-loads vllm_omni/deploy/bagel.yaml based on the HF model_type.", ) parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.") @@ -158,17 +151,12 @@ def main(): from vllm_omni.entrypoints.omni import Omni omni_kwargs = {} - if args.deploy_config is not None and args.stage_configs_path is not None: - raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive; prefer --deploy-config.") deploy_config = args.deploy_config - stage_configs_path = args.stage_configs_path - if args.think and deploy_config is None and stage_configs_path is None: + if args.think and deploy_config is None: deploy_config = "vllm_omni/deploy/bagel_think.yaml" print(f"[Info] Think mode enabled, using deploy config: {deploy_config}") if deploy_config: omni_kwargs["deploy_config"] = deploy_config - elif stage_configs_path: - omni_kwargs["stage_configs_path"] = stage_configs_path omni_kwargs.update( { diff --git a/examples/online_serving/bagel/README.md b/examples/online_serving/bagel/README.md index d134072ca24..2660e78f89f 100644 --- a/examples/online_serving/bagel/README.md +++ b/examples/online_serving/bagel/README.md @@ -34,7 +34,7 @@ cd examples/online_serving/bagel bash run_server.sh ``` -To use a custom deploy YAML (note: `--stage-configs-path` is deprecated in favor of `--deploy-config`): +To use a custom deploy YAML, pass it via `--deploy-config`: ```bash vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8091 \ diff --git a/examples/online_serving/bagel/run_server_stage_cli.sh b/examples/online_serving/bagel/run_server_stage_cli.sh index 55f64fcb965..536e2131c9b 100644 --- a/examples/online_serving/bagel/run_server_stage_cli.sh +++ b/examples/online_serving/bagel/run_server_stage_cli.sh @@ -20,7 +20,7 @@ MASTER_ADDRESS="${MASTER_ADDRESS:-127.0.0.1}" MASTER_PORT="${MASTER_PORT:-8092}" STAGE="all" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && 
pwd)" -STAGE_CONFIGS_PATH="${STAGE_CONFIGS_PATH:-$SCRIPT_DIR/../../../vllm_omni/deploy/bagel.yaml}" +DEPLOY_CONFIG="${DEPLOY_CONFIG:-$SCRIPT_DIR/../../../vllm_omni/deploy/bagel.yaml}" EXTRA_ARGS=() usage() { @@ -33,7 +33,7 @@ Options: --port PORT API port for stage 0 (default: $PORT) --master-address ADDRESS Master/orchestrator address (default: $MASTER_ADDRESS) --master-port PORT Master/orchestrator port (default: $MASTER_PORT) - --stage-configs-path PATH Stage config YAML path (default: $STAGE_CONFIGS_PATH) + --deploy-config PATH Deploy config YAML path (default: $DEPLOY_CONFIG) --help Show this help message Examples: @@ -71,8 +71,8 @@ while [[ $# -gt 0 ]]; do MASTER_PORT="$2" shift 2 ;; - --stage-configs-path) - STAGE_CONFIGS_PATH="$2" + --deploy-config) + DEPLOY_CONFIG="$2" shift 2 ;; --help|-h) @@ -103,7 +103,7 @@ print_config() { echo "API Port: $PORT" echo "Master Address: $MASTER_ADDRESS" echo "Master Port: $MASTER_PORT" - echo "Stage Configs: $STAGE_CONFIGS_PATH" + echo "Deploy Config: $DEPLOY_CONFIG" echo "Selected Stage: $STAGE" if [[ ${#EXTRA_ARGS[@]} -gt 0 ]]; then echo "Extra Args: ${EXTRA_ARGS[*]}" @@ -114,7 +114,7 @@ run_stage_0() { echo "Starting Stage 0 (Thinker) as master..." vllm serve "$MODEL" --omni \ --port "$PORT" \ - --stage-configs-path "$STAGE_CONFIGS_PATH" \ + --deploy-config "$DEPLOY_CONFIG" \ --stage-id 0 \ -oma "$MASTER_ADDRESS" \ -omp "$MASTER_PORT" \ @@ -124,7 +124,7 @@ run_stage_0() { run_stage_1() { echo "Starting Stage 1 (DiT) in headless mode..." 
vllm serve "$MODEL" --omni \ - --stage-configs-path "$STAGE_CONFIGS_PATH" \ + --deploy-config "$DEPLOY_CONFIG" \ --stage-id 1 \ --headless \ -oma "$MASTER_ADDRESS" \ diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index a10dfc2e680..9d2f1f8fffa 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -46,14 +46,3 @@ connectors: device_name: "" memory_pool_size: 4294967296 memory_pool_device: "cpu" - -platforms: - xpu: - stages: - - stage_id: 0 - gpu_memory_utilization: 0.9 - max_num_batched_tokens: 16384 - quantization: fp8 - - stage_id: 1 - gpu_memory_utilization: 0.9 - devices: "1"