From 8723b37ac956636e2be655a2edb7e386f74af0b6 Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: Fri, 30 Jan 2026 15:37:45 +0800 Subject: [PATCH 01/13] [Profiler] Fix 404 by registering profile routes on app directly Move /start_profile and /stop_profile from the module-level router to direct app registration via _register_profiling_routes(), called after build_app() returns. This ensures the routes exist on the app regardless of how vllm's build_app() handles router inclusion. Co-Authored-By: Claude Opus 4.5 Signed-off-by: Jinheng Li --- vllm_omni/entrypoints/openai/api_server.py | 50 ++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index d15dc90fe5d..df2f6f1a206 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -246,6 +246,52 @@ async def show_available_models(self) -> ModelList: # Server entry points +def _register_profiling_routes(app) -> None: + """Register /start_profile and /stop_profile directly on the app. + + These are registered on the app (not the module-level router) to + guarantee availability regardless of how vllm's build_app() handles + router inclusion. + """ + + @app.post("/start_profile") + async def start_profile(raw_request: Request) -> JSONResponse: + """Start profiling on all stages. + + When the server is running under nsys with + ``--capture-range=cudaProfilerApi``, this also opens the CUDA + profiler capture region. + """ + engine_client = raw_request.app.state.engine_client + try: + await engine_client.start_profile() + except Exception as e: + logger.exception("Failed to start profile: %s", e) + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e), + ) from e + return JSONResponse(content={"status": "ok"}) + + @app.post("/stop_profile") + async def stop_profile(raw_request: Request) -> JSONResponse: + """Stop profiling on all stages. 
+ + When running under nsys, this closes the CUDA profiler capture + region so nsys finalises the current capture. + """ + engine_client = raw_request.app.state.engine_client + try: + await engine_client.stop_profile() + except Exception as e: + logger.exception("Failed to stop profile: %s", e) + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e), + ) from e + return JSONResponse(content={"status": "ok"}) + + async def omni_run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server. @@ -305,6 +351,10 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None, _remove_route_from_app(app, "/v1/models", {"GET"}) # Remove upstream /v1/models to use omni's handler app.include_router(router) + # Register profiling endpoints directly on the app so they are + # available regardless of how vllm's build_app handles routers. + _register_profiling_routes(app) + await omni_init_app_state(engine_client, app.state, args) # Conditionally register profiler endpoints based on stage YAML configs From d90941cb32eda6ce54677a8363c6c8bbddfda3a5 Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: Fri, 30 Jan 2026 16:19:59 +0800 Subject: [PATCH 02/13] [Profiler] Fix 404: unconditionally include vllm's profile router vllm's build_app() only registers /start_profile and /stop_profile when profiler_config is explicitly set via CLI. For the omni server we always want these endpoints available so nsys profiling can be triggered via HTTP. Replace custom route handlers with a simple unconditional include of vllm's existing profile router. 
Co-Authored-By: Claude Opus 4.5 Signed-off-by: Jinheng Li --- vllm_omni/entrypoints/openai/api_server.py | 47 ++++------------------ 1 file changed, 7 insertions(+), 40 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index df2f6f1a206..972fee3b58d 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -247,49 +247,16 @@ async def show_available_models(self) -> ModelList: def _register_profiling_routes(app) -> None: - """Register /start_profile and /stop_profile directly on the app. + """Unconditionally register /start_profile and /stop_profile on the app. - These are registered on the app (not the module-level router) to - guarantee availability regardless of how vllm's build_app() handles - router inclusion. + vllm's build_app() only registers these routes when a profiler_config + is explicitly provided (e.g. --profiler-config). For omni we always + want them available so that nsys profiling can be triggered via HTTP + without extra CLI flags. """ + from vllm.entrypoints.serve.profile.api_router import router as profile_router - @app.post("/start_profile") - async def start_profile(raw_request: Request) -> JSONResponse: - """Start profiling on all stages. - - When the server is running under nsys with - ``--capture-range=cudaProfilerApi``, this also opens the CUDA - profiler capture region. - """ - engine_client = raw_request.app.state.engine_client - try: - await engine_client.start_profile() - except Exception as e: - logger.exception("Failed to start profile: %s", e) - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, - detail=str(e), - ) from e - return JSONResponse(content={"status": "ok"}) - - @app.post("/stop_profile") - async def stop_profile(raw_request: Request) -> JSONResponse: - """Stop profiling on all stages. 
- - When running under nsys, this closes the CUDA profiler capture - region so nsys finalises the current capture. - """ - engine_client = raw_request.app.state.engine_client - try: - await engine_client.stop_profile() - except Exception as e: - logger.exception("Failed to stop profile: %s", e) - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, - detail=str(e), - ) from e - return JSONResponse(content={"status": "ok"}) + app.include_router(profile_router) async def omni_run_server(args, **uvicorn_kwargs) -> None: From e0100fa0c0d5c941bca19201d7bd82f29e6c1b5c Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: Tue, 3 Feb 2026 10:03:52 +0800 Subject: [PATCH 03/13] [Profiler] Address PR review feedback - Guard torch.cuda.profiler calls with torch.cuda.is_available() so non-CUDA platforms (ROCm, NPU, XPU) get no-ops instead of crashes - Add torch.cuda.profiler.start()/stop() inside DiffusionWorker.start_profile/stop_profile so nsys captures GPU activity in the actual diffusion worker subprocesses - Restructure profiling docs: move nsys online serving section to the top as the primary workflow, remove duplicate section Co-Authored-By: Claude Opus 4.5 Signed-off-by: Jinheng Li --- docs/contributing/profiling.md | 268 ++++++++---------- .../diffusion/worker/diffusion_worker.py | 72 +++-- 2 files changed, 176 insertions(+), 164 deletions(-) diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 7a2e64f1312..418fb707ae9 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -1,216 +1,192 @@ # Profiling vLLM-Omni -> **Warning:** Profiling incurs significant overhead. Use only for development and debugging, never in production. +> **Warning:** Profiling is for development and debugging only. It adds significant overhead and should not be enabled in production. -vLLM-Omni uses the PyTorch Profiler to analyze performance across both **multi-stage omni-modality models** and **diffusion models**. 
+vLLM-Omni supports two profiler backends through `profiler_config`: -### 1. Configure Profiling in the Stage YAML +- `torch`: detailed CPU/CUDA traces written to `torch_profiler_dir` +- `cuda`: low-overhead CUDA range control for NVIDIA Nsight Systems (`nsys`) -Enable profiling by adding `profiler_config` under `engine_args` for the stage(s) you want to profile in your stage config YAML: +## 1. Configure Profiling + +Use the same `profiler_config` shape everywhere: + +```yaml +profiler_config: + profiler: torch + torch_profiler_dir: ./perf +``` + +Supported fields: + +| Field | Description | +|---|---| +| `profiler` | Profiler backend. Supported values: `torch`, `cuda`. | +| `torch_profiler_dir` | Output directory for torch traces. Required when `profiler: torch`. | +| `delay_iterations` | Number of worker iterations to skip before profiling starts. | +| `max_iterations` | Maximum number of worker iterations to capture before auto-stop. | +| `warmup_iterations` | Torch-profiler warmup iterations. | +| `active_iterations` | Torch-profiler active iterations. | +| `wait_iterations` | Torch-profiler wait iterations before warmup. | + +For multi-stage omni pipelines, put `profiler_config` under the target stage's `engine_args`. ```yaml stage_args: - stage_id: 0 stage_type: llm engine_args: - # ... other engine args ... profiler_config: profiler: torch torch_profiler_dir: ./perf ``` -| Field | Description | -|---|---| -| `profiler` | Profiler backend to use. Currently supports `torch`. | -| `torch_profiler_dir` | Directory where trace files are saved. Created automatically if it doesn't exist. | - -> **Tip:** Only enable `profiler_config` on stages you actually need to profile. Stages without it will not start a profiler, keeping overhead minimal. - -### 2. Profiling Omni-Modality Models +For single-stage diffusion usage, pass `profiler_config` directly to `Omni(...)` or `vllm serve`. -**Selective Stage Profiling** +## 2. 
Profiling Omni Pipelines -It is highly recommended to profile specific stages to prevent producing overly large trace files: +It is usually best to profile only the stages you need. ```python -# Profile all stages -omni_llm.start_profile() +# Profile all stages. +omni.start_profile() -# Only profile Stage 1 -omni_llm.start_profile(stages=[1]) - -# Stage 0 (Thinker) and Stage 2 (Audio Decoder) for qwen omni -omni_llm.start_profile(stages=[0, 2]) +# Profile selected stages only. +omni.start_profile(stages=[0, 2]) +... +omni.stop_profile(stages=[0, 2]) ``` -> **Important:** Always pass the same `stages` list to both `start_profile()` and `stop_profile()`. If you omit `stages` from `stop_profile()`, it defaults to stopping all stages — including ones that were never started — which will produce errors. - -**Python Usage**: Wrap your generation logic with `start_profile()` and `stop_profile()`. +Always stop the same stage set that you started. If only some stages have `profiler_config`, pass an explicit `stages=[...]` list instead of relying on the default "all stages" behavior. -```python -profiler_stages = [0] # Only profile the stages you need +Examples: -# 1. Start profiling -omni.start_profile(stages=profiler_stages) +1. [Qwen2.5-Omni end2end](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py) +2. [Qwen3-Omni end2end](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py) -# Initialize generator -omni_generator = omni.generate(prompts, sampling_params_list, py_generator=args.py_generator) +## 3. Profiling Single-Stage Diffusion -total_requests = len(prompts) -processed_count = 0 +Single-stage diffusion models use the same `start_profile()` / `stop_profile()` controls, but you must provide `profiler_config` explicitly. -# Main Processing Loop -for stage_outputs in omni_generator: +### PyTorch profiler - # ... [Output processing logic for text/audio would go here] ... 
+```python +from vllm_omni import Omni + +omni = Omni( + model="Wan-AI/Wan2.2-I2V-A14B-Diffusers", + profiler_config={ + "profiler": "torch", + "torch_profiler_dir": "./perf", + }, +) + +omni.start_profile() +... +omni.stop_profile() +``` - # Update count to track when to stop profiling - processed_count += len(stage_outputs.request_output) +### Nsight Systems (`nsys`) - # 2. Check if all requests are done to stop the profiler safely - if profiler_enabled and processed_count >= total_requests: - print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") +For Nsight Systems, use `profiler: cuda` and wrap the process with `nsys profile`. - # Stop the profiler while workers are still active - # Pass the same stages list used in start_profile() - omni_llm.stop_profile(stages=profiler_stages) +```bash +nsys profile \ + --trace-fork-before-exec=true \ + --cuda-graph-trace=node \ + --capture-range=cudaProfilerApi \ + --capture-range-end=repeat \ + -o diffusion_trace \ + python image_to_video.py ... +``` - # Wait for traces to flush to disk - print("[Info] Waiting 30s for workers to write trace files to disk...") - time.sleep(30) - print("[Info] Trace export wait time finished.") +The Python process being profiled must create the diffusion engine with: -omni_llm.close() +```python +profiler_config={"profiler": "cuda"} ``` +Then call `start_profile()` before the requests you want to capture and `stop_profile()` after them. The diffusion worker processes open and close the CUDA capture range themselves, so `nsys` sees the actual GPU work instead of only the parent process. -**CLI Usage** (using `end2end.py`): -```bash -# Profile only Stage 0 (Thinker) -python end2end.py --output-wav output_audio \ - --query-type text --enable-profiler --profiler-stages 0 +Examples: -# Profile Stage 0 and Stage 2 -python end2end.py --output-wav output_audio \ - --query-type text --enable-profiler --profiler-stages 0 2 +1. 
[Image edit example](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) +2. [Image to video example](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video) -# Profile all stages (omit --profiler-stages) -python end2end.py --output-wav output_audio \ - --query-type text --enable-profiler -``` +## 4. Profiling Online Serving -**Examples**: +When any stage has `profiler_config.profiler` set, the server exposes: -1. **Qwen2.5-Omni**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py) +- `POST /start_profile` +- `POST /stop_profile` -2. **Qwen3-Omni**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py) +### Start the server -### 3. Profiling diffusion models +Multi-stage omni serving: -Diffusion profiling is End-to-End, capturing encoding, denoising loops, and decoding. Standalone diffusion scripts use `--profiler-dir` to enable profiling. - -**CLI Usage:** ```bash -python image_to_video.py \ - --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \ - --image qwen-bear.png \ - --prompt "A cat playing with yarn, smooth motion" \ - --profiler-dir \ - \ - # Minimize Spatial Dimensions (Optional but helpful): - # Drastically reduces memory usage so the profiler doesn't - # crash due to overhead, though for accurate performance - # tuning you often want target resolutions. - --height 48 \ - --width 64 \ - \ - # Minimize Temporal Dimension (Frames): - # Video models process 3D tensors (Time, Height, Width). - # Reducing frames to the absolute minimum (2) keeps the - # tensor size small, ensuring the trace file doesn't become - # multi-gigabytes in size. 
- --num-frames 2 \ - \ - # Minimize Iteration Loop (Steps): - # This is the most critical setting for profiling. - # Diffusion models run the same loop X times. - # Profiling 2 steps gives you the exact same performance - # data as 50 steps, but saves minutes of runtime and - # prevents the trace viewer from freezing. - --num-inference-steps 2 \ - \ - --guidance-scale 5.0 \ - --guidance-scale-high 6.0 \ - --boundary-ratio 0.875 \ - --flow-shift 12.0 \ - --fps 16 \ - --output i2v_output.mp4 +vllm serve Qwen/Qwen2.5-Omni-7B \ + --omni \ + --stage-configs-path qwen2_5_omni.yaml \ + --port 8091 ``` -> **Note:** For diffusion stages within a multi-stage omni pipeline, use `profiler_config` in the stage YAML instead (see Section 1). - -**Examples**: - -1. **Qwen image edit**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) - -2. **Wan-AI/Wan2.2-I2V-A14B-Diffusers**: [https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video) - -### 4. Profiling Online Serving - -When `profiler_config` is set in the stage YAML, the server automatically exposes `/start_profile` and `/stop_profile` HTTP endpoints. +Single-stage diffusion serving with torch profiler: -**1. 
Start the server** with a stage YAML that has `profiler_config` enabled: ```bash -vllm serve Qwen/Qwen2.5-Omni-7B \ - --omni \ - --stage-configs-path qwen2_5_omni.yaml \ - --port 8091 +vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \ + --omni \ + --port 8091 \ + --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}' ``` -Or for one stage diffusion models: +Single-stage diffusion serving with Nsight Systems: ```bash -vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers --omni --port 8091 --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}' +nsys profile \ + --trace-fork-before-exec=true \ + --cuda-graph-trace=node \ + --capture-range=cudaProfilerApi \ + --capture-range-end=repeat \ + -o serving_trace \ + vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \ + --omni \ + --port 8091 \ + --profiler-config '{"profiler": "cuda"}' ``` -**2. Start profiling** by sending a POST request: +### Control capture + ```bash -# Profile all stages that have profiler_config set +# Start profiling on all profiled stages. curl -X POST http://localhost:8091/start_profile -# Profile specific stages only +# Start profiling on selected stages. curl -X POST http://localhost:8091/start_profile \ - -H "Content-Type: application/json" \ - -d '{"stages": [0]}' -``` + -H "Content-Type: application/json" \ + -d '{"stages": [0]}' -**3. Send your inference requests** as normal while the profiler is running. - -**4. Stop profiling** and collect traces: -```bash -# Stop all stages +# Stop profiling. curl -X POST http://localhost:8091/stop_profile - -# Stop specific stages (must match the stages you started) -curl -X POST http://localhost:8091/stop_profile \ - -H "Content-Type: application/json" \ - -d '{"stages": [0]}' ``` -Trace files are written to the `torch_profiler_dir` specified in your stage YAML. +For mixed-stage pipelines, use explicit `stages` and pass the same stage list to both endpoints. + +## 5. 
Analyze Results -> **Important:** Always stop the same stages you started. Stopping a stage that was never started will produce errors. +Torch profiler output: -### 5. Analyzing Traces +- Chrome/Perfetto traces under `torch_profiler_dir` +- Optional aggregated CUDA-time tables under the same directory -Output files are saved to the `torch_profiler_dir` specified in your stage YAML config. +CUDA profiler / Nsight Systems output: -**Output** -**Chrome Trace** (`.json.gz`): Visual timeline of kernels and stages. Open in Perfetto UI. +- `.nsys-rep` report files written by `nsys -o ...` -**Viewing Tools:** +Recommended viewers: -- [Perfetto](https://ui.perfetto.dev/) (recommended) -- `chrome://tracing` (Chrome only) +- [Perfetto](https://ui.perfetto.dev/) for torch traces +- `nsys stats <report>.nsys-rep` for CLI summaries +- Nsight Systems GUI for CUDA kernel timelines -**Note**: vLLM-Omni reuses the PyTorch Profiler infrastructure from vLLM. See the official vLLM profiler documentation: [vLLM Profiling Guide](https://docs.vllm.ai/en/stable/contributing/profiling/) +vLLM-Omni reuses the vLLM profiling infrastructure where possible. For the upstream reference, see the [vLLM profiling guide](https://docs.vllm.ai/en/stable/contributing/profiling/). 
diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index ea4b9d96f71..260682135e4 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -20,6 +20,7 @@ from vllm.config import CompilationConfig, DeviceConfig, VllmConfig, set_current_vllm_config from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.logger import init_logger +from vllm.profiler.wrapper import CudaProfilerWrapper, WorkerProfiler from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.mem_utils import GiB_bytes from vllm.v1.worker.workspace import init_workspace_manager @@ -83,15 +84,7 @@ def __init__( od_config=self.od_config, device=self.device, ) - # Initialize profiler if configured - self.profiler: OmniTorchProfilerWrapper | None = None - profiler_config = self.od_config.profiler_config - if profiler_config and profiler_config.profiler == "torch": - self.profiler = create_omni_profiler( - profiler_config=profiler_config, - worker_name=f"diffusion_worker_{self.rank}", - local_rank=self.local_rank, - ) + self.profiler: WorkerProfiler | None = self._create_profiler() if not skip_load_model: self.load_model(load_format=self.od_config.diffusion_load_format) self.init_lora_manager() @@ -122,6 +115,7 @@ def init_device(self) -> None: vllm_config.parallel_config.tensor_parallel_size = self.od_config.parallel_config.tensor_parallel_size vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size vllm_config.parallel_config.enable_expert_parallel = self.od_config.parallel_config.enable_expert_parallel + vllm_config.profiler_config = self.od_config.profiler_config self.vllm_config = vllm_config # Initialize distributed environment @@ -147,6 +141,41 @@ def init_device(self) -> None: ) init_workspace_manager(self.device) + def _create_profiler(self) -> WorkerProfiler | None: + profiler_config = 
getattr(self.od_config, "profiler_config", None) + if self.vllm_config is not None: + self.vllm_config.profiler_config = profiler_config + + profiler_type = getattr(profiler_config, "profiler", None) + if profiler_type == "torch": + return create_omni_profiler( + profiler_config=profiler_config, + worker_name=f"diffusion-rank-{self.rank}", + local_rank=self.local_rank, + ) + if profiler_type == "cuda": + try: + return CudaProfilerWrapper(profiler_config) + except Exception as exc: + logger.warning( + "Failed to initialize CUDA profiler on diffusion worker %s: %s", + self.rank, + exc, + ) + return None + if profiler_type is not None: + logger.warning("Unknown profiler backend %r on diffusion worker %s", profiler_type, self.rank) + return None + + def _profiler_context(self, name: str) -> AbstractContextManager: + if self.profiler is None: + return nullcontext() + return self.profiler.annotate_context_manager(name) + + def _step_profiler(self) -> None: + if self.profiler is not None: + self.profiler.step() + def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None: """Load the diffusion model using DiffusionModelRunner.""" with ( @@ -187,7 +216,7 @@ def generate(self, request: OmniDiffusionRequest) -> DiffusionOutput: """Generate output for the given requests.""" return self.execute_model(request, self.od_config) - def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: + def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> dict[str, Any] | None: """Start or stop profiling for this GPU worker. 
Args: @@ -203,16 +232,18 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N return if is_start: - from vllm_omni.profiler import OmniTorchProfilerWrapper - if isinstance(self.profiler, OmniTorchProfilerWrapper): import time - filename = profile_prefix or f"diffusion_{int(time.time())}" + filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}" self.profiler.set_trace_filename(filename) self.profiler.start() - else: - self.profiler.stop() + return None + + self.profiler.stop() + if isinstance(self.profiler, OmniTorchProfilerWrapper): + return self.profiler.get_results() + return None def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput: """Execute a forward pass by delegating to the model runner.""" @@ -224,7 +255,10 @@ def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfi if req.sampling_params.lora_request is not None: raise logger.warning("LoRA activation skipped: %s", exc) - return self.model_runner.execute_model(req) + with self._profiler_context("diffusion_forward"): + output = self.model_runner.execute_model(req) + self._step_profiler() + return output def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput: """Execute one diffusion step by delegating to the model runner.""" @@ -236,8 +270,10 @@ def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> Runner if any(new_req.req.sampling_params.lora_request is not None for new_req in scheduler_output.scheduled_new_reqs): raise ValueError("Step mode does not support LoRA yet.") - - return self.model_runner.execute_stepwise(scheduler_output) + with self._profiler_context("diffusion_step"): + output = self.model_runner.execute_stepwise(scheduler_output) + self._step_profiler() + return output def load_weights(self, weights) -> set[str]: """Load weights by delegating to the model runner.""" From 26602fe0996118dbd9c44deebc46e385c12b68ed Mon 
Sep 17 00:00:00 2001 From: Jinheng Li Date: Tue, 3 Feb 2026 14:28:59 +0800 Subject: [PATCH 04/13] [Profiler] Remove online profiling endpoints, focus on nsys integration Remove HTTP /start_profile and /stop_profile endpoint registration from api_server.py as someone else is handling online profiling. This PR now focuses purely on nsys integration for diffusion workers: - CudaProfiler class with platform guards - torch.cuda.profiler calls in DiffusionWorker.start_profile/stop_profile - Updated docs for nsys usage with offline diffusion scripts Co-Authored-By: Claude Opus 4.5 Signed-off-by: Jinheng Li --- vllm_omni/entrypoints/openai/api_server.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 972fee3b58d..d15dc90fe5d 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -246,19 +246,6 @@ async def show_available_models(self) -> ModelList: # Server entry points -def _register_profiling_routes(app) -> None: - """Unconditionally register /start_profile and /stop_profile on the app. - - vllm's build_app() only registers these routes when a profiler_config - is explicitly provided (e.g. --profiler-config). For omni we always - want them available so that nsys profiling can be triggered via HTTP - without extra CLI flags. - """ - from vllm.entrypoints.serve.profile.api_router import router as profile_router - - app.include_router(profile_router) - - async def omni_run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server. @@ -318,10 +305,6 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None, _remove_route_from_app(app, "/v1/models", {"GET"}) # Remove upstream /v1/models to use omni's handler app.include_router(router) - # Register profiling endpoints directly on the app so they are - # available regardless of how vllm's build_app handles routers. 
- _register_profiling_routes(app) - await omni_init_app_state(engine_client, app.state, args) # Conditionally register profiler endpoints based on stage YAML configs From 8771777ccaaa85e36c584ef0ffa2183ef39438c7 Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: Tue, 3 Feb 2026 16:02:20 +0800 Subject: [PATCH 05/13] Align diffusion profiling with vLLM Signed-off-by: Jinheng Li --- vllm_omni/diffusion/diffusion_engine.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 784da617529..1cfb36fdc7b 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -332,8 +332,8 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus missing_result_error="Diffusion execution finished without a final output.", ) - def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: - """Start or stop torch profiling on all diffusion workers. + def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> list[Any] | None: + """Start or stop profiling on all diffusion workers. Args: is_start: True to start profiling, False to stop. 
@@ -351,12 +351,13 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N logger.info("Stopping diffusion profiling...") try: - self.collective_rpc(method="profile", args=(is_start, profile_prefix)) + return self.collective_rpc(method="profile", args=(is_start, profile_prefix)) except Exception as e: action = "start" if is_start else "stop" logger.error(f"Failed to {action} profiling on workers", exc_info=True) if is_start: raise RuntimeError(f"Could not {action} profiler: {e}") from e + return None def _dummy_run(self): """A dummy run to warm up the model.""" From d54e6e3e422f8a2e624874f4d91033290e6afcce Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: Tue, 7 Apr 2026 14:46:59 +0800 Subject: [PATCH 06/13] Add CUDA profiler coverage for diffusion worker Signed-off-by: Jinheng Li --- .../test_diffusion_worker_cuda_profiler.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 tests/diffusion/test_diffusion_worker_cuda_profiler.py diff --git a/tests/diffusion/test_diffusion_worker_cuda_profiler.py b/tests/diffusion/test_diffusion_worker_cuda_profiler.py new file mode 100644 index 00000000000..ddc2aed2fc2 --- /dev/null +++ b/tests/diffusion/test_diffusion_worker_cuda_profiler.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock + +import pytest +from pytest_mock import MockerFixture + +from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + + +@pytest.fixture +def mock_od_config(mocker: MockerFixture): + """Create a mock OmniDiffusionConfig with a CUDA profiler backend.""" + config = mocker.Mock() + config.profiler_config = mocker.Mock() + config.profiler_config.profiler = "cuda" + config.diffusion_load_format = "default" + return config + + +@pytest.fixture +def 
mock_diffusion_worker_dependencies(mocker: MockerFixture): + """Patch heavy worker dependencies for focused profiler tests.""" + mocker.patch.object(DiffusionWorker, "init_device") + mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.DiffusionModelRunner") + + +class TestDiffusionWorkerCudaProfiler: + def test_creates_cuda_profiler_wrapper( + self, + mocker: MockerFixture, + mock_od_config, + mock_diffusion_worker_dependencies, + ): + fake_profiler = mocker.Mock() + cuda_profiler = mocker.patch( + "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", + return_value=fake_profiler, + ) + create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") + + worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) + + cuda_profiler.assert_called_once_with(mock_od_config.profiler_config) + create_omni_profiler.assert_not_called() + assert worker.profiler is fake_profiler + + def test_profile_start_stop_delegates_to_cuda_profiler( + self, + mocker: MockerFixture, + mock_od_config, + mock_diffusion_worker_dependencies, + ): + fake_profiler = mocker.Mock() + fake_profiler.start = MagicMock() + fake_profiler.stop = MagicMock() + mocker.patch( + "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", + return_value=fake_profiler, + ) + + worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) + + assert worker.profile(is_start=True) is None + assert worker.profile(is_start=False) is None + + fake_profiler.start.assert_called_once_with() + fake_profiler.stop.assert_called_once_with() + + def test_returns_none_when_profiler_config_is_missing( + self, + mocker: MockerFixture, + mock_od_config, + mock_diffusion_worker_dependencies, + ): + mock_od_config.profiler_config = None + cuda_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper") + create_omni_profiler = 
mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") + + worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) + + cuda_profiler.assert_not_called() + create_omni_profiler.assert_not_called() + assert worker.profiler is None + + def test_cuda_backend_does_not_use_torch_profiler_factory( + self, + mocker: MockerFixture, + mock_od_config, + mock_diffusion_worker_dependencies, + ): + mocker.patch( + "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", + return_value=mocker.Mock(), + ) + create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") + + DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) + + create_omni_profiler.assert_not_called() From 46180c138699b6c591fe75afb4e41a1127f01466 Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: Wed, 8 Apr 2026 15:26:52 +0800 Subject: [PATCH 07/13] chore: trigger CI rerun Signed-off-by: Jinheng Li From 9871942ba4f1cd8b87f6d7cc9e0923744c37132b Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: Wed, 8 Apr 2026 17:27:06 +0800 Subject: [PATCH 08/13] fix: make diffusion worker profiler helpers defensive Signed-off-by: Jinheng Li --- vllm_omni/diffusion/worker/diffusion_worker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index 260682135e4..c0d54589558 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -168,13 +168,15 @@ def _create_profiler(self) -> WorkerProfiler | None: return None def _profiler_context(self, name: str) -> AbstractContextManager: - if self.profiler is None: + profiler = getattr(self, "profiler", None) + if profiler is None: return nullcontext() - return self.profiler.annotate_context_manager(name) + return profiler.annotate_context_manager(name) def 
_step_profiler(self) -> None: - if self.profiler is not None: - self.profiler.step() + profiler = getattr(self, "profiler", None) + if profiler is not None: + profiler.step() def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None: """Load the diffusion model using DiffusionModelRunner.""" From baf2cf7923bbc99b3be20cbb804d7156bd8c2346 Mon Sep 17 00:00:00 2001 From: Canlin Guo <961750412@qq.com> Date: Thu, 9 Apr 2026 10:00:59 +0800 Subject: [PATCH 09/13] clean code Signed-off-by: Canlin Guo <961750412@qq.com> --- vllm_omni/diffusion/diffusion_engine.py | 11 +--- .../diffusion/worker/diffusion_worker.py | 58 +++++-------------- 2 files changed, 18 insertions(+), 51 deletions(-) diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 1cfb36fdc7b..17391ac191a 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -332,16 +332,12 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus missing_result_error="Diffusion execution finished without a final output.", ) - def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> list[Any] | None: + def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: """Start or stop profiling on all diffusion workers. Args: is_start: True to start profiling, False to stop. - profile_prefix: Optional prefix for trace filename (vLLM compat). - - Note: - Matches vLLM's worker.profile() signature for consistency. - Traces are saved automatically via on_trace_ready callback. + profile_prefix: Optional prefix for trace filename. 
""" if is_start: if profile_prefix is None: @@ -351,13 +347,12 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> l logger.info("Stopping diffusion profiling...") try: - return self.collective_rpc(method="profile", args=(is_start, profile_prefix)) + self.collective_rpc(method="profile", args=(is_start, profile_prefix)) except Exception as e: action = "start" if is_start else "stop" logger.error(f"Failed to {action} profiling on workers", exc_info=True) if is_start: raise RuntimeError(f"Could not {action} profiler: {e}") from e - return None def _dummy_run(self): """A dummy run to warm up the model.""" diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index c0d54589558..09d46f09206 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -142,42 +142,20 @@ def init_device(self) -> None: init_workspace_manager(self.device) def _create_profiler(self) -> WorkerProfiler | None: - profiler_config = getattr(self.od_config, "profiler_config", None) - if self.vllm_config is not None: - self.vllm_config.profiler_config = profiler_config - + profiler_config = self.od_config.profiler_config profiler_type = getattr(profiler_config, "profiler", None) if profiler_type == "torch": return create_omni_profiler( profiler_config=profiler_config, - worker_name=f"diffusion-rank-{self.rank}", + worker_name=f"diffusion_rank{self.rank}", local_rank=self.local_rank, ) if profiler_type == "cuda": - try: - return CudaProfilerWrapper(profiler_config) - except Exception as exc: - logger.warning( - "Failed to initialize CUDA profiler on diffusion worker %s: %s", - self.rank, - exc, - ) - return None + return CudaProfilerWrapper(profiler_config) if profiler_type is not None: logger.warning("Unknown profiler backend %r on diffusion worker %s", profiler_type, self.rank) return None - def _profiler_context(self, name: str) -> AbstractContextManager: - 
profiler = getattr(self, "profiler", None) - if profiler is None: - return nullcontext() - return profiler.annotate_context_manager(name) - - def _step_profiler(self) -> None: - profiler = getattr(self, "profiler", None) - if profiler is not None: - profiler.step() - def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None: """Load the diffusion model using DiffusionModelRunner.""" with ( @@ -218,34 +196,24 @@ def generate(self, request: OmniDiffusionRequest) -> DiffusionOutput: """Generate output for the given requests.""" return self.execute_model(request, self.od_config) - def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> dict[str, Any] | None: + def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: """Start or stop profiling for this GPU worker. Args: is_start: True to start profiling, False to stop. - profile_prefix: Optional prefix for trace filename (vLLM compat). - - Note: - Matches vLLM's worker.profile() signature for consistency. - Traces are saved automatically via on_trace_ready callback. + profile_prefix: Optional prefix for trace filename. 
""" if self.profiler is None: - logger.warning("Profiler not initialized, skipping profile(%s)", is_start) return if is_start: if isinstance(self.profiler, OmniTorchProfilerWrapper): import time - filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}" self.profiler.set_trace_filename(filename) self.profiler.start() - return None - - self.profiler.stop() - if isinstance(self.profiler, OmniTorchProfilerWrapper): - return self.profiler.get_results() - return None + else: + self.profiler.stop() def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput: """Execute a forward pass by delegating to the model runner.""" @@ -257,9 +225,11 @@ def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfi if req.sampling_params.lora_request is not None: raise logger.warning("LoRA activation skipped: %s", exc) - with self._profiler_context("diffusion_forward"): + ctx = self.profiler.annotate_context_manager("diffusion_forward") if self.profiler else nullcontext() + with ctx: output = self.model_runner.execute_model(req) - self._step_profiler() + if self.profiler: + self.profiler.step() return output def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput: @@ -272,9 +242,11 @@ def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> Runner if any(new_req.req.sampling_params.lora_request is not None for new_req in scheduler_output.scheduled_new_reqs): raise ValueError("Step mode does not support LoRA yet.") - with self._profiler_context("diffusion_step"): + ctx = self.profiler.annotate_context_manager("diffusion_step") if self.profiler else nullcontext() + with ctx: output = self.model_runner.execute_stepwise(scheduler_output) - self._step_profiler() + if self.profiler: + self.profiler.step() return output def load_weights(self, weights) -> set[str]: From 224e25e039b7f10776193b08686045ced85f5e9b Mon Sep 17 00:00:00 2001 From: Canlin Guo 
<961750412@qq.com> Date: Thu, 9 Apr 2026 10:44:01 +0800 Subject: [PATCH 10/13] lint Signed-off-by: Canlin Guo <961750412@qq.com> --- vllm_omni/diffusion/worker/diffusion_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index 09d46f09206..36588452e79 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -209,6 +209,7 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N if is_start: if isinstance(self.profiler, OmniTorchProfilerWrapper): import time + filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}" self.profiler.set_trace_filename(filename) self.profiler.start() From 0dfdc7735b23e2045e34a27c86827e3a973cb84f Mon Sep 17 00:00:00 2001 From: Canlin Guo <961750412@qq.com> Date: Thu, 9 Apr 2026 10:47:52 +0800 Subject: [PATCH 11/13] fix lint Signed-off-by: Canlin Guo <961750412@qq.com> --- vllm_omni/diffusion/worker/diffusion_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index 36588452e79..0693e6e34ee 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -209,7 +209,7 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N if is_start: if isinstance(self.profiler, OmniTorchProfilerWrapper): import time - + filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}" self.profiler.set_trace_filename(filename) self.profiler.start() From 90466977f2e1c7574580e9145545159454c4afa4 Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: Thu, 9 Apr 2026 11:05:23 +0800 Subject: [PATCH 12/13] chore: trigger pipeline rerun Signed-off-by: Jinheng Li From 3fdadd7d71acd4f312b243b567638d8c4b26bafc Mon Sep 17 00:00:00 2001 From: Jinheng Li Date: 
Thu, 9 Apr 2026 11:27:56 +0800 Subject: [PATCH 13/13] fix: restore defensive diffusion profiler access Signed-off-by: Jinheng Li --- .../diffusion/worker/diffusion_worker.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index 0693e6e34ee..160309e0d8d 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -156,6 +156,9 @@ def _create_profiler(self) -> WorkerProfiler | None: logger.warning("Unknown profiler backend %r on diffusion worker %s", profiler_type, self.rank) return None + def _get_profiler(self) -> WorkerProfiler | None: + return getattr(self, "profiler", None) + def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None: """Load the diffusion model using DiffusionModelRunner.""" with ( @@ -203,18 +206,19 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N is_start: True to start profiling, False to stop. profile_prefix: Optional prefix for trace filename. 
""" - if self.profiler is None: + profiler = self._get_profiler() + if profiler is None: return if is_start: - if isinstance(self.profiler, OmniTorchProfilerWrapper): + if isinstance(profiler, OmniTorchProfilerWrapper): import time filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}" - self.profiler.set_trace_filename(filename) - self.profiler.start() + profiler.set_trace_filename(filename) + profiler.start() else: - self.profiler.stop() + profiler.stop() def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput: """Execute a forward pass by delegating to the model runner.""" @@ -226,11 +230,12 @@ def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfi if req.sampling_params.lora_request is not None: raise logger.warning("LoRA activation skipped: %s", exc) - ctx = self.profiler.annotate_context_manager("diffusion_forward") if self.profiler else nullcontext() + profiler = self._get_profiler() + ctx = profiler.annotate_context_manager("diffusion_forward") if profiler else nullcontext() with ctx: output = self.model_runner.execute_model(req) - if self.profiler: - self.profiler.step() + if profiler: + profiler.step() return output def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput: @@ -243,11 +248,12 @@ def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> Runner if any(new_req.req.sampling_params.lora_request is not None for new_req in scheduler_output.scheduled_new_reqs): raise ValueError("Step mode does not support LoRA yet.") - ctx = self.profiler.annotate_context_manager("diffusion_step") if self.profiler else nullcontext() + profiler = self._get_profiler() + ctx = profiler.annotate_context_manager("diffusion_step") if profiler else nullcontext() with ctx: output = self.model_runner.execute_stepwise(scheduler_output) - if self.profiler: - self.profiler.step() + if profiler: + profiler.step() return output def 
load_weights(self, weights) -> set[str]: