From f3d2fd12abf12b7a9db620c04f175a9d7759ea7e Mon Sep 17 00:00:00 2001 From: lishunyang Date: Wed, 11 Feb 2026 03:26:29 +0800 Subject: [PATCH 1/6] feat: add unified profiler with online serving and stage-aware endpoints Signed-off-by: lishunyang --- docs/contributing/profiling.md | 296 ++++++++++++------ examples/offline_inference/bagel/end2end.py | 31 +- .../image_to_image/image_edit.py | 43 +-- .../image_to_video/image_to_video.py | 39 +-- .../lora_inference/lora_inference.py | 31 +- .../offline_inference/qwen2_5_omni/end2end.py | 34 +- .../offline_inference/qwen3_omni/end2end.py | 38 ++- .../offline_inference/qwen3_tts/end2end.py | 39 ++- .../text_to_audio/text_to_audio.py | 26 +- .../text_to_image/text_to_image.py | 38 +-- .../text_to_video/text_to_video.py | 38 +-- tests/e2e/online_serving/test_async_omni.py | 2 +- .../openai_api/test_image_server.py | 2 +- tests/entrypoints/test_omni_llm.py | 2 +- tests/profiler/__init__.py | 2 + tests/profiler/test_api_router.py | 118 +++++++ tests/profiler/test_config.py | 160 ++++++++++ tests/profiler/test_torch_profiler.py | 196 ++++++++++++ vllm_omni/benchmarks/patch/patch.py | 5 +- vllm_omni/config/__init__.py | 2 + vllm_omni/diffusion/diffusion_engine.py | 131 +------- vllm_omni/diffusion/profiler/base.py | 58 ---- .../diffusion/profiler/torch_profiler.py | 126 -------- .../diffusion/worker/diffusion_worker.py | 31 +- vllm_omni/entrypoints/async_omni_llm.py | 24 -- vllm_omni/entrypoints/omni.py | 102 +++--- vllm_omni/entrypoints/omni_diffusion.py | 45 +-- vllm_omni/entrypoints/omni_llm.py | 42 +++ vllm_omni/entrypoints/omni_stage.py | 141 +++++---- vllm_omni/entrypoints/openai/api_server.py | 19 +- vllm_omni/entrypoints/serve/__init__.py | 0 .../entrypoints/serve/profile/__init__.py | 0 .../entrypoints/serve/profile/api_router.py | 70 +++++ vllm_omni/inputs/data.py | 1 - .../{diffusion => }/profiler/__init__.py | 6 +- vllm_omni/profiler/config.py | 84 +++++ vllm_omni/profiler/torch_profiler.py | 117 +++++++ 37 files changed, 1447 insertions(+), 692 deletions(-) create mode 100644 tests/profiler/__init__.py create mode 100644 tests/profiler/test_api_router.py create mode 100644 tests/profiler/test_config.py create mode 100644 tests/profiler/test_torch_profiler.py delete mode 100644 vllm_omni/diffusion/profiler/base.py delete mode 100644 vllm_omni/diffusion/profiler/torch_profiler.py create mode 100644 vllm_omni/entrypoints/serve/__init__.py create mode 100644 vllm_omni/entrypoints/serve/profile/__init__.py create mode 100644 vllm_omni/entrypoints/serve/profile/api_router.py rename vllm_omni/{diffusion => }/profiler/__init__.py (52%) create mode 100644 vllm_omni/profiler/config.py create mode 100644 vllm_omni/profiler/torch_profiler.py diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index a7df8c32297..cc4d931572d 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -2,148 +2,256 @@ > **Warning:** Profiling incurs significant overhead. Use only for development and debugging, never in production. -vLLM-Omni uses the PyTorch Profiler to analyze performance across both **multi-stage omni-modality models** and **diffusion models**. +vLLM-Omni provides a profiling module (`vllm_omni/profiler/`) aligned with upstream vLLM 0.16.0 semantics. It captures **performance traces** (TensorBoard/Chrome traces) using `tensorboard_trace_handler` and supports delay/max iteration control. -### 1. Set the Output Directory -Before running any script, set this environment variable. The system detects this and automatically saves traces here. +## Quick Start -```bash -export VLLM_TORCH_PROFILER_DIR=./profiles +```python +from vllm_omni import Omni +from vllm_omni.profiler import ProfilerConfig + +# Configure profiler at initialization +omni = Omni( + model="Tongyi-MAI/Z-Image-Turbo", + profiler_config=ProfilerConfig( + profiler="torch", + torch_profiler_dir="./profiles", + ) +) + +# Profile your workload +omni.start_profile() +outputs = omni.generate({"prompt": "a cat"}, sampling_params) +omni.stop_profile() + +# Trace files are written to ./profiles/ by each worker ``` -### 2. Profiling Omni-Modality Models +## Command Line Usage -It is best to limit profiling to one iteration to keep trace files manageable. +All offline inference examples support profiling via CLI arguments: ```bash -export VLLM_PROFILER_MAX_ITERS=1 +# Enable profiling +python text_to_image.py --model MODEL --profile-dir ./profiles ``` -**Selective Stage Profiling** -The profiler is default to function across all stages. But It is highly recommended to profile specific stages by passing the stages list, preventing from producing too large trace files: +## ProfilerConfig + +```python +from vllm_omni.profiler import ProfilerConfig + +ProfilerConfig( + profiler="torch", # Required: "torch" or "cuda" + torch_profiler_dir="./profiles", # Required when profiler="torch" + torch_profiler_with_stack=True, # Enable stack tracing + torch_profiler_with_flops=False, # Enable FLOPS counting + torch_profiler_use_gzip=True, # Save traces in gzip format + torch_profiler_dump_cuda_time_total=True, # Dump CUDA time stats on stop + torch_profiler_record_shapes=False, # Record tensor shapes + torch_profiler_with_memory=False, # Enable memory profiling + delay_iterations=0, # Skip N iterations before starting + max_iterations=0, # Stop after N iterations (0=unlimited) +) +``` + +### Serialization + +`ProfilerConfig` supports `to_dict()` / `from_dict()` for cross-process RPC serialization. + +## Output Files + +| File | Format | How to View | +|------|--------|-------------| +| `*.trace.json.gz` | TensorBoard trace | TensorBoard, chrome://tracing, or ui.perfetto.dev | +| `profiler_out_*.txt` | CUDA time stats | Any text editor | + +--- + +## Profiling Omni-Modality Models + +### Selective Stage Profiling + +Profile specific stages to keep trace files manageable: + ```python # Profile all stages -omni_llm.start_profile() +omni.start_profile() # Only profile Stage 1 -omni_llm.start_profile(stages=[1]) +omni.start_profile(stages=[1]) + +# Stage 0 (Thinker) and Stage 2 (Audio Decoder) for Qwen Omni +omni.start_profile(stages=[0, 2]) ``` -```python -# Stage 0 (Thinker) and Stage 2 (Audio Decoder) for qwen omni -omni_llm.start_profile(stages=[0, 2]) +### Examples + +- **Qwen2.5-Omni**: [examples/offline_inference/qwen2_5_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py) +- **Qwen3-Omni**: [examples/offline_inference/qwen3_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py) + +--- + +## Profiling Diffusion Models + +Diffusion profiling is end-to-end, capturing encoding, denoising loops, and decoding. + +### Minimizing Trace Size + +For profiling, minimize dimensions to keep trace files manageable: + +```bash +python image_to_video.py \ + --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \ + --image input.png \ + --prompt "A cat playing with yarn" \ + --profile-dir ./profiles \ + \ + # Minimize dimensions for profiling: + --height 48 \ + --width 64 \ + --num_frames 2 \ + --num_inference_steps 2 ``` -**Python Usage**: Wrap your generation logic with `start_profile()` and `stop_profile()`. +### Examples -```python -from vllm_omni import omni_llm +- **Image Edit**: [examples/offline_inference/image_to_image/image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) +- **Image to Video**: [examples/offline_inference/image_to_video/](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video) +- **Text to Image**: [examples/offline_inference/text_to_image/text_to_image.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/text_to_image/text_to_image.py) -profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) +--- -# 1. Start profiling if enabled -if profiler_enabled: - omni_llm.start_profile(stages=[0]) +## Viewing Traces -# Initialize generator -omni_generator = omni_llm.generate(prompts, sampling_params_list, py_generator=args.py_generator) +### Performance Traces (`.trace.json.gz`) -total_requests = len(prompts) -processed_count = 0 +- [TensorBoard](https://www.tensorflow.org/tensorboard) (recommended) +- [Perfetto UI](https://ui.perfetto.dev/) +- `chrome://tracing` (Chrome only) -# Main Processing Loop -for stage_outputs in omni_generator: +--- - # ... [Output processing logic for text/audio would go here] ... +## API Reference - # Update count to track when to stop profiling - processed_count += len(stage_outputs.request_output) +### ProfilerConfig - # 2. Check if all requests are done to stop the profiler safely - if profiler_enabled and processed_count >= total_requests: - print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") +```python +@dataclass +class ProfilerConfig: + profiler: Literal["torch", "cuda"] | None = None + torch_profiler_dir: str = "" + torch_profiler_with_stack: bool = True + torch_profiler_with_flops: bool = False + torch_profiler_use_gzip: bool = True + torch_profiler_dump_cuda_time_total: bool = True + torch_profiler_record_shapes: bool = False + torch_profiler_with_memory: bool = False + delay_iterations: int = 0 + max_iterations: int = 0 +``` + +### TorchProfiler + +```python +class TorchProfiler: + def __init__(self, config: ProfilerConfig, worker_name: str = "", local_rank: int = 0): ... + def start(self) -> None: ... + def stop(self) -> None: ... + def step(self) -> None: ... + def shutdown(self) -> None: ... + @property + def is_running(self) -> bool: ... +``` - # Stop the profiler while workers are still active - omni_llm.stop_profile() +### Omni Methods - # Wait for traces to flush to disk - print("[Info] Waiting 30s for workers to write trace files to disk...") - time.sleep(30) - print("[Info] Trace export wait time finished.") +```python +# Start profiling for specified stages (None = all) +omni.start_profile(stages: list[int] | None = None) -> None -omni_llm.close() +# Stop profiling for specified stages (None = all) +omni.stop_profile(stages: list[int] | None = None) -> None ``` +--- -**Examples**: +## Best Practices -1. **Qwen2.5-Omni**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py) +1. **Profile specific stages**: Use `omni.start_profile(stages=[0])` to reduce overhead and file size -2. **Qwen3-Omni**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py) +2. **Minimize dimensions for diffusion**: Use small height/width/frames/steps when profiling +3. **Compare before/after**: Profile before and after optimizations to measure impact -### 3. Profiling diffusion models +4. **Use during development only**: Disable profiling in production for performance -Diffusion profiling is End-to-End, capturing encoding, denoising loops, and decoding. +--- -**CLI Usage:** -```python +## Troubleshooting -python image_to_video.py \ - --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \ - --image qwen-bear.png \ - --prompt "A cat playing with yarn, smooth motion" \ - \ - # Minimize Spatial Dimensions (Optional but helpful): - # Drastically reduces memory usage so the profiler doesn't - # crash due to overhead, though for accurate performance - # tuning you often want target resolutions. - --height 48 \ - --width 64 \ - \ - # Minimize Temporal Dimension (Frames): - # Video models process 3D tensors (Time, Height, Width). - # Reducing frames to the absolute minimum (2) keeps the - # tensor size small, ensuring the trace file doesn't become - # multi-gigabytes in size. - --num_frames 2 \ - \ - # Minimize Iteration Loop (Steps): - # This is the most critical setting for profiling. - # Diffusion models run the same loop X times. - # Profiling 2 steps gives you the exact same performance - # data as 50 steps, but saves minutes of runtime and - # prevents the trace viewer from freezing. - --num_inference_steps 2 \ - \ - --guidance_scale 5.0 \ - --guidance_scale_high 6.0 \ - --boundary_ratio 0.875 \ - --flow_shift 12.0 \ - --fps 16 \ - --output i2v_output.mp4 +| Issue | Cause | Solution | +|-------|-------|----------| +| Import error | Missing module | Check `vllm_omni/profiler/__init__.py` | +| OOM during profiling | Profiler overhead | Reduce model dimensions | +| Huge trace files | Too many steps/frames | Reduce `num_inference_steps`, `num_frames` | + +--- + +## Online Serving Profiling +When running the vLLM-Omni API server, profiling can be enabled via CLI +and controlled via HTTP endpoints at runtime. + +### Starting the Server with Profiling Enabled + +```bash +python -m vllm_omni.entrypoints.openai.api_server \ + --model Qwen/Qwen2.5-Omni-7B \ + --profiler-config profiler=torch,torch_profiler_dir=./profiles ``` -**Examples**: +### HTTP Endpoints -1. **Qwen image edit**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) +| Method | Endpoint | Body | Description | +|--------|----------|------|-------------| +| POST | `/start_profile` | `{"stages": [0, 1, 2]}` (optional) | Start profiling | +| POST | `/stop_profile` | `{"stages": [0, 1, 2]}` (optional) | Stop profiling | -2. **Wan-AI/Wan2.2-I2V-A14B-Diffusers**: [https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video) +If `stages` is omitted or null, all stages are profiled. -> **Note:** -As of now, asynchronous (online) profiling is not fully supported in vLLM-Omni. While start_profile() and stop_profile() methods exist, they are only reliable in offline inference scripts (e.g., the provided end2end.py examples). Do not use them in server-mode or streaming scenarios—traces may be incomplete or fail to flush. +### Stage IDs for Qwen Omni Models -### 4. Analyzing Omni Traces +| Stage | Qwen2.5-Omni | Qwen3-Omni | +|-------|-------------|------------| +| 0 | Thinker (understanding) | Thinker (MoE understanding) | +| 1 | Talker (text → RVQ codes) | Talker (code predictor) | +| 2 | Code2Wav (codes → audio) | Code2Wav (codes → audio) | -Output files are saved to your configured ```VLLM_TORCH_PROFILER_DIR```. +### Examples -**Output** -**Chrome Trace** (```.json.gz```): Visual timeline of kernels and stages. Open in Perfetto UI. +```bash +# Profile all stages (default) +curl -X POST http://localhost:8000/start_profile -**Viewing Tools:** +# Profile only the Thinker stage +curl -X POST http://localhost:8000/start_profile \ + -H "Content-Type: application/json" \ + -d '{"stages": [0]}' + +# Profile Thinker and Talker stages +curl -X POST http://localhost:8000/start_profile \ + -H "Content-Type: application/json" \ + -d '{"stages": [0, 1]}' + +# Stop profiling (traces written to torch_profiler_dir) +curl -X POST http://localhost:8000/stop_profile +``` -- [Perfetto](https://ui.perfetto.dev/)(recommended) -- ```chrome://tracing```(Chrome only) +### Tips -**Note**: vLLM-Omni reuses the PyTorch Profiler infrastructure from vLLM. See the official vLLM profiler documentation: [vLLM Profiling Guide](https://docs.vllm.ai/en/stable/contributing/profiling/) +1. **Profile one stage at a time** for smaller, more focused traces +2. **Profile the Thinker** (stage 0) to analyze LLM bottlenecks +3. **Profile the Talker** (stage 1) to analyze codec generation +4. **Profile Code2Wav** (stage 2) to analyze audio synthesis +5. Trace files are named per-stage (e.g., `stage-0_*.trace.json.gz`) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 8fa412065d8..e4c31506fb5 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -1,8 +1,10 @@ import argparse import os +import time from typing import cast from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType +from vllm_omni.profiler import ProfilerConfig def parse_args(): @@ -46,6 +48,14 @@ def parse_args(): parser.add_argument("--stage-configs-path", type=str, default=None) parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.") + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) + args = parser.parse_args() return args @@ -114,6 +124,12 @@ def main(): else: from vllm_omni.entrypoints.omni import Omni + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") + omni_kwargs = {} if args.stage_configs_path: omni_kwargs["stage_configs_path"] = args.stage_configs_path @@ -130,7 +146,7 @@ def main(): } ) - omni = Omni(model=model_name, **omni_kwargs) + omni = Omni(model=model_name, profiler_config=profiler_config, **omni_kwargs) formatted_prompts = [] for p in args.prompts: @@ -160,7 +176,20 @@ def main(): if len(params_list) > 1: params_list[1].num_inference_steps = args.steps # type: ignore # The second stage is an OmniDiffusionSamplingParam + if profiler_config: + print("[Profiler] Starting profiling...") + omni.start_profile() + + generation_start = time.perf_counter() omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list)) + generation_end = time.perf_counter() + generation_time = generation_end - generation_start + print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)") + + if profiler_config: + print("\n[Profiler] Stopping profiler and collecting results...") + omni.stop_profile() + print("[Profiler] Profiling stopped.") for i, req_output in enumerate(omni_outputs): images = getattr(req_output, "images", None) diff --git a/examples/offline_inference/image_to_image/image_edit.py b/examples/offline_inference/image_to_image/image_edit.py index 8f330e09d20..2562799e23e 100644 --- a/examples/offline_inference/image_to_image/image_edit.py +++ b/examples/offline_inference/image_to_image/image_edit.py @@ -82,6 +82,7 @@ from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform +from vllm_omni.profiler import ProfilerConfig def parse_args() -> argparse.Namespace: @@ -306,6 +307,14 @@ def parse_args() -> argparse.Namespace: default=1, help="Number of ready layers (blocks) to keep on GPU during generation.", ) + + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) return parser.parse_args() @@ -358,9 +367,16 @@ def main(): # Note: coefficients will use model-specific defaults based on model_type } + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") + # Initialize Omni with appropriate pipeline omni = Omni( model=args.model, + profiler_config=profiler_config, enable_layerwise_offload=args.enable_layerwise_offload, layerwise_num_gpu_layers=args.layerwise_num_gpu_layers, vae_use_slicing=args.vae_use_slicing, @@ -373,9 +389,6 @@ def main(): ) print("Pipeline loaded") - # Check if profiling is requested via environment variable - profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) - # Time profiling for generation print(f"\n{'=' * 60}") print("Generation Configuration:") @@ -393,12 +406,12 @@ def main(): ) print(f"{'=' * 60}\n") - generation_start = time.perf_counter() - - if profiler_enabled: + if profiler_config: print("[Profiler] Starting profiling...") omni.start_profile() + generation_start = time.perf_counter() + # Generate edited image outputs = omni.generate( { @@ -422,22 +435,10 @@ def main(): # Print profiling results print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)") - if profiler_enabled: + if profiler_config: print("\n[Profiler] Stopping profiler and collecting results...") - profile_results = omni.stop_profile() - if profile_results and isinstance(profile_results, dict): - traces = profile_results.get("traces", []) - print("\n" + "=" * 60) - print("PROFILING RESULTS:") - for rank, trace in enumerate(traces): - print(f"\nRank {rank}:") - if trace: - print(f" • Trace: {trace}") - if not traces: - print(" No traces collected.") - print("=" * 60) - else: - print("[Profiler] No valid profiling data returned.") + omni.stop_profile() + print("[Profiler] Profiling stopped.") if not outputs: raise ValueError("No output generated from omni.generate()") diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py index 8e8d3991559..e57831a9264 100644 --- a/examples/offline_inference/image_to_video/image_to_video.py +++ b/examples/offline_inference/image_to_video/image_to_video.py @@ -19,7 +19,6 @@ """ import argparse -import os from pathlib import Path import numpy as np @@ -31,6 +30,7 @@ from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform +from vllm_omni.profiler import ProfilerConfig def parse_args() -> argparse.Namespace: @@ -98,6 +98,14 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Disable torch.compile and force eager execution.", ) + + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) return parser.parse_args() @@ -131,13 +139,18 @@ def main(): # Resize image to target dimensions image = image.resize((width, height), PIL.Image.Resampling.LANCZOS) - # Check if profiling is requested via environment variable - profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") + parallel_config = DiffusionParallelConfig( cfg_parallel_size=args.cfg_parallel_size, ) omni = Omni( model=args.model, + profiler_config=profiler_config, enable_layerwise_offload=args.enable_layerwise_offload, layerwise_num_gpu_layers=args.layerwise_num_gpu_layers, vae_use_slicing=args.vae_use_slicing, @@ -149,7 +162,7 @@ def main(): enforce_eager=args.enforce_eager, ) - if profiler_enabled: + if profiler_config: print("[Profiler] Starting profiling...") omni.start_profile() @@ -242,22 +255,10 @@ def main(): export_to_video(video_array, str(output_path), fps=args.fps) print(f"Saved generated video to {output_path}") - if profiler_enabled: + if profiler_config: print("\n[Profiler] Stopping profiler and collecting results...") - profile_results = omni.stop_profile() - if profile_results and isinstance(profile_results, dict): - traces = profile_results.get("traces", []) - print("\n" + "=" * 60) - print("PROFILING RESULTS:") - for rank, trace in enumerate(traces): - print(f"\nRank {rank}:") - if trace: - print(f" • Trace: {trace}") - if not traces: - print(" No traces collected.") - print("=" * 60) - else: - print("[Profiler] No valid profiling data returned.") + omni.stop_profile() + print("[Profiler] Profiling stopped.") if __name__ == "__main__": diff --git a/examples/offline_inference/lora_inference/lora_inference.py b/examples/offline_inference/lora_inference/lora_inference.py index 5e4299edb84..79938b150a7 100644 --- a/examples/offline_inference/lora_inference/lora_inference.py +++ b/examples/offline_inference/lora_inference/lora_inference.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse +import time from pathlib import Path from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.lora.request import LoRARequest from vllm_omni.lora.utils import stable_lora_int_id +from vllm_omni.profiler import ProfilerConfig def parse_args() -> argparse.Namespace: @@ -56,6 +58,14 @@ def parse_args() -> argparse.Namespace: default=1.0, help="Scale factor for LoRA weights (default: 1.0).", ) + + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) return parser.parse_args() @@ -64,13 +74,19 @@ def main(): model = args.model + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") + omni_kwargs = {} if args.lora_path: omni_kwargs["lora_path"] = args.lora_path print(f"Using static LoRA from: {args.lora_path}") - omni = Omni(model=model, **omni_kwargs) + omni = Omni(model=model, profiler_config=profiler_config, **omni_kwargs) lora_request = None if args.lora_request_path: @@ -106,7 +122,15 @@ def main(): sampling_params.lora_request = lora_request sampling_params.lora_scale = args.lora_scale + if profiler_config: + print("[Profiler] Starting profiling...") + omni.start_profile() + + generation_start = time.perf_counter() outputs = omni.generate(args.prompt, sampling_params) + generation_end = time.perf_counter() + generation_time = generation_end - generation_start + print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)") if not outputs or len(outputs) == 0: raise ValueError("No output generated from omni.generate()") @@ -142,6 +166,11 @@ def main(): img.save(save_path) print(f"Saved generated image to {save_path}") + if profiler_config: + print("\n[Profiler] Stopping profiler and collecting results...") + omni.stop_profile() + print("[Profiler] Profiling stopped.") + if __name__ == "__main__": main() diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index 7cd8e18737e..0ab7fbb2f7c 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -6,7 +6,6 @@ """ import os -import time from typing import NamedTuple import librosa @@ -21,6 +20,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni.entrypoints.omni import Omni +from vllm_omni.profiler import ProfilerConfig SEED = 42 @@ -320,8 +320,15 @@ def main(args): query_result = query_func(audio_path=audio_path, sampling_rate=sampling_rate) else: query_result = query_func() + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") + omni_llm = Omni( model=model_name, + profiler_config=profiler_config, log_stats=args.log_stats, stage_init_timeout=args.stage_init_timeout, batch_timeout=args.batch_timeout, @@ -377,8 +384,8 @@ def main(args): for i, prompt in enumerate(prompts): prompt["modalities"] = output_modalities - profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) - if profiler_enabled: + if profiler_config: + print("[Profiler] Starting profiling...") omni_llm.start_profile(stages=[0]) omni_generator = omni_llm.generate(prompts, sampling_params_list, py_generator=args.py_generator) @@ -416,14 +423,11 @@ def main(args): print(f"Request ID: {request_id}, Saved audio to {output_wav}") processed_count += len(stage_outputs.request_output) - if profiler_enabled and processed_count >= total_requests: + if profiler_config and processed_count >= total_requests: print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") # Stop the profiler while workers are still alive omni_llm.stop_profile() - - print("[Info] Waiting 30s for workers to write massive trace files to disk...") - time.sleep(30) - print("[Info] Trace export wait finished.") + print("[Profiler] Profiling stopped.") omni_llm.close() @@ -539,6 +543,20 @@ def parse_args(): default=False, help="Use py_generator mode. The returned type of Omni.generate() is a Python Generator object.", ) + + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Output directory for generated files (preferred over --output-wav).", + ) return parser.parse_args() diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 263377762aa..f00248a696e 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -6,22 +6,22 @@ """ import os -import time from typing import NamedTuple import librosa import numpy as np import soundfile as sf -import vllm from PIL import Image -from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode from vllm.utils.argparse_utils import FlexibleArgumentParser +import vllm +from vllm import SamplingParams from vllm_omni.entrypoints.omni import Omni +from vllm_omni.profiler import ProfilerConfig SEED = 42 @@ -325,8 +325,15 @@ def main(args): else: query_result = query_func() + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") + omni_llm = Omni( model=model_name, + profiler_config=profiler_config, stage_configs_path=args.stage_configs_path, log_stats=args.log_stats, stage_init_timeout=args.stage_init_timeout, @@ -383,8 +390,8 @@ def main(args): for i, prompt in enumerate(prompts): prompt["modalities"] = output_modalities - profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) - if profiler_enabled: + if profiler_config: + print("[Profiler] Starting profiling...") omni_llm.start_profile(stages=[0]) omni_generator = omni_llm.generate(prompts, sampling_params_list, py_generator=args.py_generator) # Determine output directory: prefer --output-dir; fallback to --output-wav @@ -433,14 +440,11 @@ def main(args): print(f"Request ID: {request_id}, Saved audio to {output_wav}") processed_count += len(stage_outputs.request_output) - if profiler_enabled and processed_count >= total_requests: + if profiler_config and processed_count >= total_requests: print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") # Stop the profiler while workers are still alive omni_llm.stop_profile() - - print("[Info] Waiting 30s for workers to write trace files to disk...") - time.sleep(30) - print("[Info] Trace export wait time finished.") + print("[Profiler] Profiling stopped.") omni_llm.close() @@ -559,6 +563,20 @@ def parse_args(): help="Use py_generator mode. The returned type of Omni.generate() is a Python Generator object.", ) + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Output directory for generated files (preferred over --output-wav).", + ) + return parser.parse_args() diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py index 93aeba3ca5f..94442d1cae9 100644 --- a/examples/offline_inference/qwen3_tts/end2end.py +++ b/examples/offline_inference/qwen3_tts/end2end.py @@ -5,16 +5,18 @@ """ import os +import time from typing import NamedTuple import soundfile as sf os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -from vllm import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm import SamplingParams from vllm_omni import Omni +from vllm_omni.profiler import ProfilerConfig class QueryResult(NamedTuple): @@ -215,9 +217,16 @@ def main(args): else: query_result = query_func() + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") + model_name = query_result.model_name omni = Omni( model=model_name, + profiler_config=profiler_config, stage_configs_path=args.stage_configs_path, log_stats=args.log_stats, stage_init_timeout=args.stage_init_timeout, @@ -240,6 +249,11 @@ def main(args): output_dir = args.output_dir if getattr(args, "output_dir", None) else args.output_wav os.makedirs(output_dir, exist_ok=True) + if profiler_config: + print("[Profiler] Starting profiling...") + omni.start_profile() + + generation_start = time.perf_counter() omni_generator = omni.generate(query_result.inputs, sampling_params_list) for stage_outputs in omni_generator: for output in stage_outputs.request_output: @@ -258,6 +272,15 @@ def main(args): sf.write(output_wav, audio_numpy, samplerate=audio_samplerate, format="WAV") print(f"Request ID: {request_id}, Saved audio to {output_wav}") + generation_end = time.perf_counter() + generation_time = generation_end - generation_start + print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)") + + if profiler_config: + print("\n[Profiler] Stopping profiler and collecting results...") + omni.stop_profile() + print("[Profiler] Profiling stopped.") + def parse_args(): """Parse CLI arguments for offline TTS inference. @@ -366,6 +389,20 @@ def parse_args(): help="Mode tag for Base query x_vector_only_mode (default: icl).", ) + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Output directory for generated files (preferred over --output-wav).", + ) + return parser.parse_args() diff --git a/examples/offline_inference/text_to_audio/text_to_audio.py b/examples/offline_inference/text_to_audio/text_to_audio.py index 0a9efcca5ff..c7e17eb7d62 100644 --- a/examples/offline_inference/text_to_audio/text_to_audio.py +++ b/examples/offline_inference/text_to_audio/text_to_audio.py @@ -23,6 +23,7 @@ from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform +from vllm_omni.profiler import ProfilerConfig def parse_args() -> argparse.Namespace: @@ -90,6 +91,14 @@ def parse_args() -> argparse.Namespace: default=44100, help="Sample rate for output audio (Stable Audio uses 44100 Hz).", ) + + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) return parser.parse_args() @@ -120,6 +129,12 @@ def main(): args = parse_args() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed) + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") + print(f"\n{'=' * 60}") print("Stable Audio Open - Text-to-Audio Generation") print(f"{'=' * 60}") @@ -133,11 +148,15 @@ def main(): print(f"{'=' * 60}\n") # Initialize Omni with Stable Audio model - omni = Omni(model=args.model) + omni = Omni(model=args.model, profiler_config=profiler_config) # Calculate audio end time audio_end_in_s = args.audio_start + args.audio_length + if profiler_config: + print("[Profiler] Starting profiling...") + omni.start_profile() + # Time profiling for generation generation_start = time.perf_counter() @@ -214,6 +233,11 @@ def main(): print(f"\nGenerated {args.audio_length}s of audio at {args.sample_rate} Hz") + if profiler_config: + print("\n[Profiler] Stopping profiler and collecting results...") + omni.stop_profile() + print("[Profiler] Profiling stopped.") + if __name__ == "__main__": main() diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 4b2817f8dee..03e8687a078 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse -import os import time from pathlib import Path @@ -13,6 +12,7 @@ from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform +from vllm_omni.profiler import ProfilerConfig def parse_args() -> argparse.Namespace: @@ -140,6 +140,14 @@ def parse_args() -> argparse.Namespace: default=1, help="Number of ranks used for VAE patch/tile parallelism (decode/encode).", ) + + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) return parser.parse_args() @@ -185,11 +193,15 @@ def main(): vae_patch_parallel_size=args.vae_patch_parallel_size, ) - # Check if profiling is requested via environment variable - profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") omni = Omni( model=args.model, + profiler_config=profiler_config, enable_layerwise_offload=args.enable_layerwise_offload, layerwise_num_gpu_layers=args.layerwise_num_gpu_layers, vae_use_slicing=args.vae_use_slicing, @@ -202,7 +214,7 @@ def main(): enable_cpu_offload=args.enable_cpu_offload, ) - if profiler_enabled: + if profiler_config: print("[Profiler] Starting profiling...") omni.start_profile() @@ -243,22 +255,10 @@ def main(): # Print profiling results print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)") - if profiler_enabled: + if profiler_config: print("\n[Profiler] Stopping profiler and collecting results...") - profile_results = omni.stop_profile() - if profile_results and isinstance(profile_results, dict): - traces = profile_results.get("traces", []) - print("\n" + "=" * 60) - print("PROFILING RESULTS:") - for rank, trace in enumerate(traces): - print(f"\nRank {rank}:") - if trace: - print(f" • Trace: {trace}") - if not traces: - print(" No traces collected.") - print("=" * 60) - else: - print("[Profiler] No valid profiling data returned.") + omni.stop_profile() + print("[Profiler] Profiling stopped.") # Extract images from OmniRequestOutput # omni.generate() returns list[OmniRequestOutput], extract images from the first output diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py index 903fc001f33..29090a3f47f 100644 --- a/examples/offline_inference/text_to_video/text_to_video.py +++ b/examples/offline_inference/text_to_video/text_to_video.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse -import os import time from pathlib import Path @@ -14,6 +13,7 @@ from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform +from vllm_omni.profiler import ProfilerConfig def parse_args() -> argparse.Namespace: @@ -115,6 +115,14 @@ def parse_args() -> argparse.Namespace: default=1, help="Number of GPUs used for tensor parallelism (TP) inside the DiT.", ) + + # Profiler arguments + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help="Directory to save profiling outputs. Enables profiling when set.", + ) return parser.parse_args() @@ -149,11 +157,15 @@ def main(): tensor_parallel_size=args.tensor_parallel_size, ) - # Check if profiling is requested via environment variable - profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR")) + # Build profiler config from arguments + profiler_config = None + if args.profile_dir: + profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir) + print(f"[Profiler] Output dir: {args.profile_dir}") omni = Omni( model=args.model, + profiler_config=profiler_config, enable_layerwise_offload=args.enable_layerwise_offload, layerwise_num_gpu_layers=args.layerwise_num_gpu_layers, vae_use_slicing=args.vae_use_slicing, @@ -168,7 +180,7 @@ def main(): enforce_eager=args.enforce_eager, ) - if profiler_enabled: + if profiler_config: print("[Profiler] Starting profiling...") omni.start_profile() @@ -266,22 +278,10 @@ def main(): export_to_video(video_array, str(output_path), fps=args.fps) print(f"Saved generated video to {output_path}") - if profiler_enabled: + if profiler_config: print("\n[Profiler] Stopping profiler and collecting results...") - profile_results = omni.stop_profile() - if profile_results and isinstance(profile_results, dict): - traces = profile_results.get("traces", []) - print("\n" + "=" * 60) - print("PROFILING RESULTS:") - for rank, trace in enumerate(traces): - print(f"\nRank {rank}:") - if trace: - print(f" • Trace: {trace}") - if not traces: - print(" No traces collected.") - print("=" * 60) - else: - print("[Profiler] No valid profiling data returned.") + omni.stop_profile() + print("[Profiler] Profiling stopped.") if __name__ == "__main__": diff --git a/tests/e2e/online_serving/test_async_omni.py b/tests/e2e/online_serving/test_async_omni.py index cab3e6e2286..55ccfc3359c 100644 --- a/tests/e2e/online_serving/test_async_omni.py +++ b/tests/e2e/online_serving/test_async_omni.py @@ -5,10 +5,10 @@ from pathlib import Path import pytest -from vllm import SamplingParams from vllm.inputs import PromptType from tests.utils import hardware_test +from vllm import SamplingParams from vllm_omni.entrypoints.async_omni import AsyncOmni os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index 0c6479ccea7..49031ee97f9 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -15,8 +15,8 @@ import pytest from fastapi.testclient import TestClient from PIL import Image -from vllm import SamplingParams +from vllm import SamplingParams from vllm_omni.entrypoints.openai.image_api_utils import ( encode_image_base64, parse_size, diff --git a/tests/entrypoints/test_omni_llm.py b/tests/entrypoints/test_omni_llm.py index 4f05575ca59..33fd002e73a 100644 --- a/tests/entrypoints/test_omni_llm.py +++ b/tests/entrypoints/test_omni_llm.py @@ -5,8 +5,8 @@ from unittest.mock import MagicMock import pytest -from vllm import SamplingParams +from vllm import SamplingParams from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK pytestmark = [pytest.mark.core_model, pytest.mark.cpu] diff --git a/tests/profiler/__init__.py b/tests/profiler/__init__.py new file mode 100644 index 00000000000..208f01a7cb5 --- /dev/null +++ b/tests/profiler/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/tests/profiler/test_api_router.py b/tests/profiler/test_api_router.py new file mode 100644 index 00000000000..496acc2ffdc --- /dev/null +++ b/tests/profiler/test_api_router.py @@ -0,0 +1,118 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Unit tests for the stage-aware profiler HTTP endpoints. + +These tests use FastAPI TestClient with a mocked engine_client, +so they run without GPU or model weights. +""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from vllm_omni.entrypoints.serve.profile.api_router import attach_router, router + + +def _make_app(profiler_config=None) -> FastAPI: + """Create a minimal FastAPI app with mocked state for testing.""" + app = FastAPI() + + mock_engine = AsyncMock() + mock_engine.start_profile = AsyncMock() + mock_engine.stop_profile = AsyncMock() + + app.state.engine_client = mock_engine + app.state.args = SimpleNamespace(profiler_config=profiler_config) + return app + + +class TestAttachRouter: + """Tests for attach_router conditional logic.""" + + def test_attaches_when_profiler_set(self): + """Routes are registered when profiler_config.profiler is set.""" + config = SimpleNamespace(profiler="torch", torch_profiler_dir="/tmp") + app = _make_app(profiler_config=config) + attach_router(app) + + paths = {r.path for r in app.routes if hasattr(r, "path")} + assert "/start_profile" in paths + assert "/stop_profile" in paths + + def test_not_attached_when_no_config(self): + """Routes are NOT registered when profiler_config is None.""" + app = _make_app(profiler_config=None) + attach_router(app) + + paths = {r.path for r in app.routes if hasattr(r, "path")} + assert "/start_profile" not in paths + assert "/stop_profile" not in paths + + def test_not_attached_when_profiler_none(self): + """Routes are NOT registered when profiler_config.profiler is None.""" + config = SimpleNamespace(profiler=None) + app = _make_app(profiler_config=config) + attach_router(app) + + paths = {r.path for r in app.routes if hasattr(r, "path")} + assert "/start_profile" not in paths + assert "/stop_profile" not in paths + + +class TestStartProfileEndpoint: + """Tests for POST /start_profile.""" + + @pytest.fixture() + def client(self): + app = _make_app() + app.include_router(router) + return TestClient(app) + + def test_start_profile_no_body(self, client): + """Empty body profiles all stages.""" + resp = client.post("/start_profile") + assert resp.status_code == 200 + engine = client.app.state.engine_client + engine.start_profile.assert_awaited_once_with(stages=None) + + def test_start_profile_with_stages(self, client): + """Body with stages=[0] profiles only stage 0.""" + resp = client.post("/start_profile", json={"stages": [0]}) + assert resp.status_code == 200 + engine = client.app.state.engine_client + engine.start_profile.assert_awaited_once_with(stages=[0]) + + def test_start_profile_multiple_stages(self, client): + """Body with stages=[0,2] profiles stages 0 and 2.""" + resp = client.post("/start_profile", json={"stages": [0, 2]}) + assert resp.status_code == 200 + engine = client.app.state.engine_client + engine.start_profile.assert_awaited_once_with(stages=[0, 2]) + + +class TestStopProfileEndpoint: + """Tests for POST /stop_profile.""" + + @pytest.fixture() + def client(self): + app = _make_app() + app.include_router(router) + return TestClient(app) + + def test_stop_profile_no_body(self, client): + """Empty body stops all stages.""" + resp = client.post("/stop_profile") + assert resp.status_code == 200 + engine = client.app.state.engine_client + engine.stop_profile.assert_awaited_once_with(stages=None) + + def test_stop_profile_with_stages(self, client): + """Body with stages=[1] stops only stage 1.""" + resp = client.post("/stop_profile", json={"stages": [1]}) + assert resp.status_code == 200 + engine = client.app.state.engine_client + engine.stop_profile.assert_awaited_once_with(stages=[1]) diff --git a/tests/profiler/test_config.py b/tests/profiler/test_config.py new file mode 100644 index 00000000000..71c05c826fb --- /dev/null +++ b/tests/profiler/test_config.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import tempfile + +import pytest + +from vllm_omni.profiler import ProfilerConfig + + +class TestProfilerConfig: + def test_default_config(self): + """Test default configuration values.""" + config = ProfilerConfig() + assert config.profiler is None + assert config.torch_profiler_dir == "" + assert config.torch_profiler_with_stack is True + assert config.torch_profiler_with_flops is False + assert config.torch_profiler_use_gzip is True + assert config.torch_profiler_dump_cuda_time_total is True + assert config.torch_profiler_record_shapes is False + assert config.torch_profiler_with_memory is False + assert config.delay_iterations == 0 + assert config.max_iterations == 0 + + def test_torch_profiler_config(self): + """Test creating a torch profiler config.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = ProfilerConfig(profiler="torch", torch_profiler_dir=tmpdir) + assert config.profiler == "torch" + assert config.torch_profiler_dir == os.path.abspath(tmpdir) + + def test_dir_without_profiler_raises(self): + """Test that setting torch_profiler_dir without profiler='torch' raises.""" + with tempfile.TemporaryDirectory() as tmpdir: + with pytest.raises(ValueError, match="only applicable"): + ProfilerConfig(torch_profiler_dir=tmpdir) + + def test_torch_without_dir_raises(self): + """Test that profiler='torch' without dir raises.""" + with pytest.raises(ValueError, match="must be set"): + ProfilerConfig(profiler="torch") + + def test_to_dict(self): + """Test to_dict serialization.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = ProfilerConfig(profiler="torch", torch_profiler_dir=tmpdir) + d = config.to_dict() + assert isinstance(d, dict) + assert d["profiler"] == "torch" + assert d["torch_profiler_dir"] == os.path.abspath(tmpdir) + + def test_from_dict(self): + """Test from_dict deserialization.""" + with tempfile.TemporaryDirectory() as tmpdir: + d = { + "profiler": "torch", + "torch_profiler_dir": tmpdir, + "torch_profiler_with_stack": False, + } + config = ProfilerConfig.from_dict(d) + assert config.profiler == "torch" + assert config.torch_profiler_with_stack is False + + def test_from_dict_ignores_unknown_fields(self): + """Test from_dict ignores unknown fields.""" + with tempfile.TemporaryDirectory() as tmpdir: + d = { + "profiler": "torch", + "torch_profiler_dir": tmpdir, + "unknown_field": "value", + } + config = ProfilerConfig.from_dict(d) + assert config.profiler == "torch" + assert not hasattr(config, "unknown_field") + + def test_roundtrip(self): + """Test to_dict -> from_dict roundtrip.""" + with tempfile.TemporaryDirectory() as tmpdir: + original = ProfilerConfig( + profiler="torch", + torch_profiler_dir=tmpdir, + delay_iterations=5, + max_iterations=100, + ) + restored = ProfilerConfig.from_dict(original.to_dict()) + assert restored.profiler == original.profiler + assert restored.torch_profiler_dir == original.torch_profiler_dir + assert restored.delay_iterations == original.delay_iterations + assert restored.max_iterations == original.max_iterations + + def test_dir_expanded(self): + """Test that ~ in dir is expanded.""" + config = ProfilerConfig(profiler="torch", torch_profiler_dir="~/profiles") + assert "~" not in config.torch_profiler_dir + assert os.path.isabs(config.torch_profiler_dir) + + +class TestProfilerConfigFromAny: + """Tests for ProfilerConfig.from_any() — the online serving conversion path.""" + + def test_from_any_none(self): + """None input returns None.""" + assert ProfilerConfig.from_any(None) is None + + def test_from_any_own_instance(self): + """Our own ProfilerConfig is returned as-is.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = ProfilerConfig(profiler="torch", torch_profiler_dir=tmpdir) + result = ProfilerConfig.from_any(config) + assert result is config + + def test_from_any_dict(self): + """Dict input uses from_dict.""" + with tempfile.TemporaryDirectory() as tmpdir: + d = {"profiler": "torch", "torch_profiler_dir": tmpdir} + result = ProfilerConfig.from_any(d) + assert isinstance(result, ProfilerConfig) + assert result.profiler == "torch" + + def test_from_any_upstream_like_object(self): + """Object with .profiler and .torch_profiler_dir attributes (upstream-like).""" + + class UpstreamConfig: + profiler = "torch" + torch_profiler_dir = "/tmp/test_profiles" + + with tempfile.TemporaryDirectory() as tmpdir: + obj = UpstreamConfig() + obj.torch_profiler_dir = tmpdir + result = ProfilerConfig.from_any(obj) + assert isinstance(result, ProfilerConfig) + assert result.profiler == "torch" + assert result.torch_profiler_dir == os.path.abspath(tmpdir) + + def test_from_any_object_profiler_none(self): + """Object with profiler=None returns None.""" + + class NullConfig: + profiler = None + + assert ProfilerConfig.from_any(NullConfig()) is None + + def test_from_any_object_no_profiler_attr(self): + """Object without .profiler attribute returns None.""" + + class RandomObj: + foo = "bar" + + assert ProfilerConfig.from_any(RandomObj()) is None + + +class TestProfilerConfigReExport: + """Test that ProfilerConfig is accessible from vllm_omni.config.""" + + def test_import_from_config(self): + from vllm_omni.config import ProfilerConfig as ReExported + + assert ReExported is ProfilerConfig diff --git a/tests/profiler/test_torch_profiler.py b/tests/profiler/test_torch_profiler.py new file mode 100644 index 00000000000..eb6fd8baff9 --- /dev/null +++ b/tests/profiler/test_torch_profiler.py @@ -0,0 +1,196 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import glob +import os +import tempfile + +import pytest +import torch + +from vllm_omni.profiler import ProfilerConfig, TorchProfiler + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +class TestTorchProfiler: + """Tests for TorchProfiler (require CUDA).""" + + def _make_config(self, tmpdir: str) -> ProfilerConfig: + return ProfilerConfig(profiler="torch", torch_profiler_dir=tmpdir) + + def test_start_stop_lifecycle(self): + """Test basic start/stop lifecycle.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = self._make_config(tmpdir) + profiler = TorchProfiler(config, worker_name="test") + + assert not profiler.is_running + profiler.start() + assert profiler.is_running + + # Do some work + x = torch.randn(100, 100, device="cuda") + y = x @ x.T + del x, y + + profiler.stop() + assert not profiler.is_running + + def test_stop_without_start_is_noop(self): + """Test stop without start is safe.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = self._make_config(tmpdir) + profiler = TorchProfiler(config, worker_name="test") + profiler.stop() # Should not raise + assert not profiler.is_running + + def test_double_start_is_noop(self): + """Test that calling start twice does not error.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = self._make_config(tmpdir) + profiler = TorchProfiler(config, worker_name="test") + profiler.start() + profiler.start() # Second call should be no-op + assert profiler.is_running + profiler.stop() + assert not profiler.is_running + + def test_shutdown(self): + """Test shutdown stops a running profiler.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = self._make_config(tmpdir) + profiler = TorchProfiler(config, worker_name="test") + profiler.start() + assert profiler.is_running + profiler.shutdown() + assert not profiler.is_running + + def test_step_basic(self): + """Test step method doesn't crash.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = self._make_config(tmpdir) + profiler = TorchProfiler(config, worker_name="test") + profiler.start() + for _ in range(5): + profiler.step() + profiler.stop() + + def test_delay_iterations(self): + """Test that profiler doesn't start until delay iterations pass.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=tmpdir, + delay_iterations=3, + ) + profiler = TorchProfiler(config, worker_name="test") + profiler.start() + # Active but not yet running due to delay + assert not profiler.is_running + for _ in range(2): + profiler.step() + assert not profiler.is_running + # Third step triggers actual start + profiler.step() + assert profiler.is_running + profiler.stop() + + def test_max_iterations(self): + """Test that profiler stops after max iterations.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=tmpdir, + max_iterations=2, + ) + profiler = TorchProfiler(config, worker_name="test") + profiler.start() + assert profiler.is_running + profiler.step() # iter 1 + assert profiler.is_running + profiler.step() # iter 2 + assert profiler.is_running + profiler.step() # iter 3 -> exceeds max, stops + assert not profiler.is_running + + def test_config_driven_settings(self): + """Test that config fields are passed to the profiler.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=tmpdir, + torch_profiler_record_shapes=True, + torch_profiler_with_memory=True, + torch_profiler_with_flops=True, + ) + profiler = TorchProfiler(config, worker_name="test") + profiler.start() + x = torch.randn(10, 10, device="cuda") + _ = x @ x.T + profiler.stop() + + def test_trace_files_written(self): + """Test that stop() produces trace files in torch_profiler_dir.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = self._make_config(tmpdir) + profiler = TorchProfiler(config, worker_name="test-trace") + profiler.start() + x = torch.randn(64, 64, device="cuda") + _ = x @ x.T + del x + profiler.stop() + + # Verify trace file exists and is non-empty + trace_files = glob.glob(os.path.join(tmpdir, "*.trace.json.gz")) + assert len(trace_files) >= 1, f"Expected trace files in {tmpdir}, found: {os.listdir(tmpdir)}" + for tf in trace_files: + assert os.path.getsize(tf) > 0, f"Trace file {tf} is empty" + + def test_cuda_time_stats_written(self): + """Test that stop() writes profiler_out_*.txt when dump_cuda_time_total=True.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=tmpdir, + torch_profiler_dump_cuda_time_total=True, + ) + profiler = TorchProfiler(config, worker_name="test-stats", local_rank=0) + profiler.start() + x = torch.randn(64, 64, device="cuda") + _ = x @ x.T + del x + profiler.stop() + + stats_file = os.path.join(tmpdir, "profiler_out_0.txt") + assert os.path.exists(stats_file), f"Expected {stats_file}" + assert os.path.getsize(stats_file) > 0 + + def test_worker_name_in_trace_filename(self): + """Test that worker_name appears in the trace filename.""" + with tempfile.TemporaryDirectory() as tmpdir: + config = self._make_config(tmpdir) + profiler = TorchProfiler(config, worker_name="stage-0") + profiler.start() + x = torch.randn(32, 32, device="cuda") + _ = x @ x.T + del x + profiler.stop() + + trace_files = glob.glob(os.path.join(tmpdir, "*stage-0*")) + assert len(trace_files) >= 1, f"Expected trace with 'stage-0' in name, found: {os.listdir(tmpdir)}" + + +class TestTorchProfilerCPU: + """Tests that don't require CUDA.""" + + def test_import(self): + """Test that TorchProfiler class can be imported.""" + from vllm_omni.profiler import TorchProfiler + + assert TorchProfiler is not None + + def test_profiler_config_import(self): + """Test that ProfilerConfig can be imported.""" + from vllm_omni.profiler import ProfilerConfig + + assert ProfilerConfig is not None diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index 775568a726f..faf923ec198 100644 --- a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -17,7 +17,6 @@ from pydub import AudioSegment from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm.benchmarks import datasets from vllm.benchmarks.datasets import SampleRequest from vllm.benchmarks.lib.endpoint_request_func import ( ASYNC_REQUEST_FUNCS, @@ -32,6 +31,8 @@ ) from vllm.logger import init_logger +from vllm.benchmarks import datasets + logger = init_logger(__name__) from vllm_omni.benchmarks.data_modules.random_multi_modal_dataset import OmniRandomMultiModalDataset @@ -201,9 +202,9 @@ async def async_request_openai_chat_omni_completions( # ruff: noqa: E402 # Prevent import order from causing patch failures -from vllm.benchmarks import serve from vllm.benchmarks.serve import TaskType, calculate_metrics_for_embeddings, get_request, wait_for_endpoint +from vllm.benchmarks import serve from vllm_omni.benchmarks.metrics.metrics import MultiModalsBenchmarkMetrics, calculate_metrics # ruff: noqa: E402 diff --git a/vllm_omni/config/__init__.py b/vllm_omni/config/__init__.py index e2db6f4273c..22d6add7aad 100644 --- a/vllm_omni/config/__init__.py +++ b/vllm_omni/config/__init__.py @@ -4,8 +4,10 @@ from vllm_omni.config.lora import LoRAConfig from vllm_omni.config.model import OmniModelConfig +from vllm_omni.profiler import ProfilerConfig __all__ = [ "OmniModelConfig", "LoRAConfig", + "ProfilerConfig", ] diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 2ab94d5b775..6bd7cc57f04 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -4,11 +4,14 @@ import os import time from collections.abc import Iterable -from typing import Any +from typing import TYPE_CHECKING, Any import PIL.Image from vllm.logger import init_logger +if TYPE_CHECKING: + from vllm_omni.profiler import ProfilerConfig + from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.diffusion.executor.abstract import DiffusionExecutor from vllm_omni.diffusion.registry import ( @@ -194,124 +197,20 @@ def make_engine(config: OmniDiffusionConfig) -> "DiffusionEngine": def add_req_and_wait_for_response(self, request: OmniDiffusionRequest): return self.executor.add_req(request) - def start_profile(self, trace_filename: str | None = None) -> None: - """ - Start torch profiling on all diffusion workers. - - Creates a directory (if needed) and sets up a base filename template - for per-rank profiler traces (typically saved as