From 8723b37ac956636e2be655a2edb7e386f74af0b6 Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Fri, 30 Jan 2026 15:37:45 +0800
Subject: [PATCH 01/13] [Profiler] Fix 404 by registering profile routes on app
 directly

Move /start_profile and /stop_profile from the module-level router
to direct app registration via _register_profiling_routes(), called
after build_app() returns. This ensures the routes exist on the app
regardless of how vllm's build_app() handles router inclusion.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
---
 vllm_omni/entrypoints/openai/api_server.py | 50 ++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index d15dc90fe5d..df2f6f1a206 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -246,6 +246,52 @@ async def show_available_models(self) -> ModelList:
 # Server entry points
 
 
+def _register_profiling_routes(app) -> None:
+    """Register /start_profile and /stop_profile directly on the app.
+
+    These are registered on the app (not the module-level router) to
+    guarantee availability regardless of how vllm's build_app() handles
+    router inclusion.
+    """
+
+    @app.post("/start_profile")
+    async def start_profile(raw_request: Request) -> JSONResponse:
+        """Start profiling on all stages.
+
+        When the server is running under nsys with
+        ``--capture-range=cudaProfilerApi``, this also opens the CUDA
+        profiler capture region.
+        """
+        engine_client = raw_request.app.state.engine_client
+        try:
+            await engine_client.start_profile()
+        except Exception as e:
+            logger.exception("Failed to start profile: %s", e)
+            raise HTTPException(
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+                detail=str(e),
+            ) from e
+        return JSONResponse(content={"status": "ok"})
+
+    @app.post("/stop_profile")
+    async def stop_profile(raw_request: Request) -> JSONResponse:
+        """Stop profiling on all stages.
+
+        When running under nsys, this closes the CUDA profiler capture
+        region so nsys finalises the current capture.
+        """
+        engine_client = raw_request.app.state.engine_client
+        try:
+            await engine_client.stop_profile()
+        except Exception as e:
+            logger.exception("Failed to stop profile: %s", e)
+            raise HTTPException(
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+                detail=str(e),
+            ) from e
+        return JSONResponse(content={"status": "ok"})
+
+
 async def omni_run_server(args, **uvicorn_kwargs) -> None:
     """Run a single-worker API server.
 
@@ -305,6 +351,10 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None,
         _remove_route_from_app(app, "/v1/models", {"GET"})  # Remove upstream /v1/models to use omni's handler
         app.include_router(router)
 
+        # Register profiling endpoints directly on the app so they are
+        # available regardless of how vllm's build_app handles routers.
+        _register_profiling_routes(app)
+
         await omni_init_app_state(engine_client, app.state, args)
 
         # Conditionally register profiler endpoints based on stage YAML configs

From d90941cb32eda6ce54677a8363c6c8bbddfda3a5 Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Fri, 30 Jan 2026 16:19:59 +0800
Subject: [PATCH 02/13] [Profiler] Fix 404: unconditionally include vllm's
 profile router

vllm's build_app() only registers /start_profile and /stop_profile
when profiler_config is explicitly set via CLI.  For the omni server
we always want these endpoints available so nsys profiling can be
triggered via HTTP.  Replace custom route handlers with a simple
unconditional include of vllm's existing profile router.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
---
 vllm_omni/entrypoints/openai/api_server.py | 47 ++++------------------
 1 file changed, 7 insertions(+), 40 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index df2f6f1a206..972fee3b58d 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -247,49 +247,16 @@ async def show_available_models(self) -> ModelList:
 
 
 def _register_profiling_routes(app) -> None:
-    """Register /start_profile and /stop_profile directly on the app.
+    """Unconditionally register /start_profile and /stop_profile on the app.
 
-    These are registered on the app (not the module-level router) to
-    guarantee availability regardless of how vllm's build_app() handles
-    router inclusion.
+    vllm's build_app() only registers these routes when a profiler_config
+    is explicitly provided (e.g. --profiler-config).  For omni we always
+    want them available so that nsys profiling can be triggered via HTTP
+    without extra CLI flags.
     """
+    from vllm.entrypoints.serve.profile.api_router import router as profile_router
 
-    @app.post("/start_profile")
-    async def start_profile(raw_request: Request) -> JSONResponse:
-        """Start profiling on all stages.
-
-        When the server is running under nsys with
-        ``--capture-range=cudaProfilerApi``, this also opens the CUDA
-        profiler capture region.
-        """
-        engine_client = raw_request.app.state.engine_client
-        try:
-            await engine_client.start_profile()
-        except Exception as e:
-            logger.exception("Failed to start profile: %s", e)
-            raise HTTPException(
-                status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                detail=str(e),
-            ) from e
-        return JSONResponse(content={"status": "ok"})
-
-    @app.post("/stop_profile")
-    async def stop_profile(raw_request: Request) -> JSONResponse:
-        """Stop profiling on all stages.
-
-        When running under nsys, this closes the CUDA profiler capture
-        region so nsys finalises the current capture.
-        """
-        engine_client = raw_request.app.state.engine_client
-        try:
-            await engine_client.stop_profile()
-        except Exception as e:
-            logger.exception("Failed to stop profile: %s", e)
-            raise HTTPException(
-                status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
-                detail=str(e),
-            ) from e
-        return JSONResponse(content={"status": "ok"})
+    app.include_router(profile_router)
 
 
 async def omni_run_server(args, **uvicorn_kwargs) -> None:

From e0100fa0c0d5c941bca19201d7bd82f29e6c1b5c Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Tue, 3 Feb 2026 10:03:52 +0800
Subject: [PATCH 03/13] [Profiler] Address PR review feedback

- Guard torch.cuda.profiler calls with torch.cuda.is_available() so
  non-CUDA platforms (ROCm, NPU, XPU) get no-ops instead of crashes
- Add torch.cuda.profiler.start()/stop() inside
  DiffusionWorker.start_profile/stop_profile so nsys captures GPU
  activity in the actual diffusion worker subprocesses
- Restructure profiling docs: move nsys online serving section to
  the top as the primary workflow, remove duplicate section

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
---
 docs/contributing/profiling.md                | 268 ++++++++----------
 .../diffusion/worker/diffusion_worker.py      |  72 +++--
 2 files changed, 176 insertions(+), 164 deletions(-)

diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index 7a2e64f1312..418fb707ae9 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -1,216 +1,192 @@
 # Profiling vLLM-Omni
 
-> **Warning:** Profiling incurs significant overhead. Use only for development and debugging, never in production.
+> **Warning:** Profiling is for development and debugging only. It adds significant overhead and should not be enabled in production.
 
-vLLM-Omni uses the PyTorch Profiler to analyze performance across both **multi-stage omni-modality models** and **diffusion models**.
+vLLM-Omni supports two profiler backends through `profiler_config`:
 
-### 1. Configure Profiling in the Stage YAML
+- `torch`: detailed CPU/CUDA traces written to `torch_profiler_dir`
+- `cuda`: low-overhead CUDA range control for NVIDIA Nsight Systems (`nsys`)
 
-Enable profiling by adding `profiler_config` under `engine_args` for the stage(s) you want to profile in your stage config YAML:
+## 1. Configure Profiling
+
+Use the same `profiler_config` shape everywhere:
+
+```yaml
+profiler_config:
+  profiler: torch
+  torch_profiler_dir: ./perf
+```
+
+Supported fields:
+
+| Field | Description |
+|---|---|
+| `profiler` | Profiler backend. Supported values: `torch`, `cuda`. |
+| `torch_profiler_dir` | Output directory for torch traces. Required when `profiler: torch`. |
+| `delay_iterations` | Number of worker iterations to skip before profiling starts. |
+| `max_iterations` | Maximum number of worker iterations to capture before auto-stop. |
+| `warmup_iterations` | Torch-profiler warmup iterations. |
+| `active_iterations` | Torch-profiler active iterations. |
+| `wait_iterations` | Torch-profiler wait iterations before warmup. |
+
+For multi-stage omni pipelines, put `profiler_config` under the target stage's `engine_args`.
 
 ```yaml
 stage_args:
   - stage_id: 0
     stage_type: llm
     engine_args:
-      # ... other engine args ...
       profiler_config:
         profiler: torch
         torch_profiler_dir: ./perf
 ```
 
-| Field | Description |
-|---|---|
-| `profiler` | Profiler backend to use. Currently supports `torch`. |
-| `torch_profiler_dir` | Directory where trace files are saved. Created automatically if it doesn't exist. |
-
-> **Tip:** Only enable `profiler_config` on stages you actually need to profile. Stages without it will not start a profiler, keeping overhead minimal.
-
-### 2. Profiling Omni-Modality Models
+For single-stage diffusion usage, pass `profiler_config` directly to `Omni(...)` or `vllm serve`.
 
-**Selective Stage Profiling**
+## 2. Profiling Omni Pipelines
 
-It is highly recommended to profile specific stages to prevent producing overly large trace files:
+It is usually best to profile only the stages you need.
 
 ```python
-# Profile all stages
-omni_llm.start_profile()
+# Profile all stages.
+omni.start_profile()
 
-# Only profile Stage 1
-omni_llm.start_profile(stages=[1])
-
-# Stage 0 (Thinker) and Stage 2 (Audio Decoder) for qwen omni
-omni_llm.start_profile(stages=[0, 2])
+# Profile selected stages only.
+omni.start_profile(stages=[0, 2])
+...
+omni.stop_profile(stages=[0, 2])
 ```
 
-> **Important:** Always pass the same `stages` list to both `start_profile()` and `stop_profile()`. If you omit `stages` from `stop_profile()`, it defaults to stopping all stages — including ones that were never started — which will produce errors.
-
-**Python Usage**: Wrap your generation logic with `start_profile()` and `stop_profile()`.
+Always stop the same stage set that you started. If only some stages have `profiler_config`, pass an explicit `stages=[...]` list instead of relying on the default "all stages" behavior.
 
-```python
-profiler_stages = [0]  # Only profile the stages you need
+Examples:
 
-# 1. Start profiling
-omni.start_profile(stages=profiler_stages)
+1. [Qwen2.5-Omni end2end](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py)
+2. [Qwen3-Omni end2end](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py)
 
-# Initialize generator
-omni_generator = omni.generate(prompts, sampling_params_list, py_generator=args.py_generator)
+## 3. Profiling Single-Stage Diffusion
 
-total_requests = len(prompts)
-processed_count = 0
+Single-stage diffusion models use the same `start_profile()` / `stop_profile()` controls, but you must provide `profiler_config` explicitly.
 
-# Main Processing Loop
-for stage_outputs in omni_generator:
+### PyTorch profiler
 
-    # ... [Output processing logic for text/audio would go here] ...
+```python
+from vllm_omni import Omni
+
+omni = Omni(
+    model="Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+    profiler_config={
+        "profiler": "torch",
+        "torch_profiler_dir": "./perf",
+    },
+)
+
+omni.start_profile()
+...
+omni.stop_profile()
+```
 
-    # Update count to track when to stop profiling
-    processed_count += len(stage_outputs.request_output)
+### Nsight Systems (`nsys`)
 
-    # 2. Check if all requests are done to stop the profiler safely
-    if profiler_enabled and processed_count >= total_requests:
-        print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...")
+For Nsight Systems, use `profiler: cuda` and wrap the process with `nsys profile`.
 
-        # Stop the profiler while workers are still active
-        # Pass the same stages list used in start_profile()
-        omni_llm.stop_profile(stages=profiler_stages)
+```bash
+nsys profile \
+  --trace-fork-before-exec=true \
+  --cuda-graph-trace=node \
+  --capture-range=cudaProfilerApi \
+  --capture-range-end=repeat \
+  -o diffusion_trace \
+  python image_to_video.py ...
+```
 
-        # Wait for traces to flush to disk
-        print("[Info] Waiting 30s for workers to write trace files to disk...")
-        time.sleep(30)
-        print("[Info] Trace export wait time finished.")
+The Python process being profiled must create the diffusion engine with:
 
-omni_llm.close()
+```python
+profiler_config={"profiler": "cuda"}
 ```
 
+Then call `start_profile()` before the requests you want to capture and `stop_profile()` after them. The diffusion worker processes open and close the CUDA capture range themselves, so `nsys` sees the actual GPU work instead of only the parent process.
 
-**CLI Usage** (using `end2end.py`):
-```bash
-# Profile only Stage 0 (Thinker)
-python end2end.py --output-wav output_audio \
-    --query-type text --enable-profiler --profiler-stages 0
+Examples:
 
-# Profile Stage 0 and Stage 2
-python end2end.py --output-wav output_audio \
-    --query-type text --enable-profiler --profiler-stages 0 2
+1. [Image edit example](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py)
+2. [Image to video example](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video)
 
-# Profile all stages (omit --profiler-stages)
-python end2end.py --output-wav output_audio \
-    --query-type text --enable-profiler
-```
+## 4. Profiling Online Serving
 
-**Examples**:
+When any stage has `profiler_config.profiler` set, the server exposes:
 
-1. **Qwen2.5-Omni**:  [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py)
+- `POST /start_profile`
+- `POST /stop_profile`
 
-2. **Qwen3-Omni**:   [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py)
+### Start the server
 
-### 3. Profiling diffusion models
+Multi-stage omni serving:
 
-Diffusion profiling is End-to-End, capturing encoding, denoising loops, and decoding. Standalone diffusion scripts use `--profiler-dir` to enable profiling.
-
-**CLI Usage:**
 ```bash
-python image_to_video.py \
-    --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \
-    --image qwen-bear.png \
-    --prompt "A cat playing with yarn, smooth motion" \
-    --profiler-dir \
-    \
-    # Minimize Spatial Dimensions (Optional but helpful):
-    #    Drastically reduces memory usage so the profiler doesn't
-    #    crash due to overhead, though for accurate performance
-    #    tuning you often want target resolutions.
-    --height 48 \
-    --width 64 \
-    \
-    # Minimize Temporal Dimension (Frames):
-    #    Video models process 3D tensors (Time, Height, Width).
-    #    Reducing frames to the absolute minimum (2) keeps the
-    #    tensor size small, ensuring the trace file doesn't become
-    #    multi-gigabytes in size.
-    --num-frames 2 \
-    \
-    # Minimize Iteration Loop (Steps):
-    #    This is the most critical setting for profiling.
-    #    Diffusion models run the same loop X times.
-    #    Profiling 2 steps gives you the exact same performance
-    #    data as 50 steps, but saves minutes of runtime and
-    #    prevents the trace viewer from freezing.
-    --num-inference-steps 2 \
-    \
-    --guidance-scale 5.0 \
-    --guidance-scale-high 6.0 \
-    --boundary-ratio 0.875 \
-    --flow-shift 12.0 \
-    --fps 16 \
-    --output i2v_output.mp4
+vllm serve Qwen/Qwen2.5-Omni-7B \
+  --omni \
+  --stage-configs-path qwen2_5_omni.yaml \
+  --port 8091
 ```
 
-> **Note:** For diffusion stages within a multi-stage omni pipeline, use `profiler_config` in the stage YAML instead (see Section 1).
-
-**Examples**:
-
-1. **Qwen image edit**:  [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py)
-
-2. **Wan-AI/Wan2.2-I2V-A14B-Diffusers**:   [https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video)
-
-### 4. Profiling Online Serving
-
-When `profiler_config` is set in the stage YAML, the server automatically exposes `/start_profile` and `/stop_profile` HTTP endpoints.
+Single-stage diffusion serving with torch profiler:
 
-**1. Start the server** with a stage YAML that has `profiler_config` enabled:
 ```bash
-vllm serve Qwen/Qwen2.5-Omni-7B \
-    --omni \
-    --stage-configs-path qwen2_5_omni.yaml \
-    --port 8091
+vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \
+  --omni \
+  --port 8091 \
+  --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
 ```
 
-Or for one stage diffusion models:
+Single-stage diffusion serving with Nsight Systems:
 
 ```bash
-vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers --omni --port 8091 --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
+nsys profile \
+  --trace-fork-before-exec=true \
+  --cuda-graph-trace=node \
+  --capture-range=cudaProfilerApi \
+  --capture-range-end=repeat \
+  -o serving_trace \
+  vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \
+    --omni \
+    --port 8091 \
+    --profiler-config '{"profiler": "cuda"}'
 ```
 
-**2. Start profiling** by sending a POST request:
+### Control capture
+
 ```bash
-# Profile all stages that have profiler_config set
+# Start profiling on all profiled stages.
 curl -X POST http://localhost:8091/start_profile
 
-# Profile specific stages only
+# Start profiling on selected stages.
 curl -X POST http://localhost:8091/start_profile \
-    -H "Content-Type: application/json" \
-    -d '{"stages": [0]}'
-```
+  -H "Content-Type: application/json" \
+  -d '{"stages": [0]}'
 
-**3. Send your inference requests** as normal while the profiler is running.
-
-**4. Stop profiling** and collect traces:
-```bash
-# Stop all stages
+# Stop profiling.
 curl -X POST http://localhost:8091/stop_profile
-
-# Stop specific stages (must match the stages you started)
-curl -X POST http://localhost:8091/stop_profile \
-    -H "Content-Type: application/json" \
-    -d '{"stages": [0]}'
 ```
 
-Trace files are written to the `torch_profiler_dir` specified in your stage YAML.
+For mixed-stage pipelines, use explicit `stages` and pass the same stage list to both endpoints.
+
+## 5. Analyze Results
 
-> **Important:** Always stop the same stages you started. Stopping a stage that was never started will produce errors.
+Torch profiler output:
 
-### 5. Analyzing Traces
+- Chrome/Perfetto traces under `torch_profiler_dir`
+- Optional aggregated CUDA-time tables under the same directory
 
-Output files are saved to the `torch_profiler_dir` specified in your stage YAML config.
+CUDA profiler / Nsight Systems output:
 
-**Output**
-**Chrome Trace** (`.json.gz`): Visual timeline of kernels and stages. Open in Perfetto UI.
+- `.nsys-rep` report files written by `nsys -o ...`
 
-**Viewing Tools:**
+Recommended viewers:
 
-- [Perfetto](https://ui.perfetto.dev/) (recommended)
-- `chrome://tracing` (Chrome only)
+- [Perfetto](https://ui.perfetto.dev/) for torch traces
+- `nsys stats <report>.nsys-rep` for CLI summaries
+- Nsight Systems GUI for CUDA kernel timelines
 
-**Note**: vLLM-Omni reuses the PyTorch Profiler infrastructure from vLLM. See the official vLLM profiler documentation:  [vLLM Profiling Guide](https://docs.vllm.ai/en/stable/contributing/profiling/)
+vLLM-Omni reuses the vLLM profiling infrastructure where possible. For the upstream reference, see the [vLLM profiling guide](https://docs.vllm.ai/en/stable/contributing/profiling/).
diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index ea4b9d96f71..260682135e4 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -20,6 +20,7 @@
 from vllm.config import CompilationConfig, DeviceConfig, VllmConfig, set_current_vllm_config
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.logger import init_logger
+from vllm.profiler.wrapper import CudaProfilerWrapper, WorkerProfiler
 from vllm.utils.import_utils import resolve_obj_by_qualname
 from vllm.utils.mem_utils import GiB_bytes
 from vllm.v1.worker.workspace import init_workspace_manager
@@ -83,15 +84,7 @@ def __init__(
             od_config=self.od_config,
             device=self.device,
         )
-        # Initialize profiler if configured
-        self.profiler: OmniTorchProfilerWrapper | None = None
-        profiler_config = self.od_config.profiler_config
-        if profiler_config and profiler_config.profiler == "torch":
-            self.profiler = create_omni_profiler(
-                profiler_config=profiler_config,
-                worker_name=f"diffusion_worker_{self.rank}",
-                local_rank=self.local_rank,
-            )
+        self.profiler: WorkerProfiler | None = self._create_profiler()
         if not skip_load_model:
             self.load_model(load_format=self.od_config.diffusion_load_format)
             self.init_lora_manager()
@@ -122,6 +115,7 @@ def init_device(self) -> None:
         vllm_config.parallel_config.tensor_parallel_size = self.od_config.parallel_config.tensor_parallel_size
         vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size
         vllm_config.parallel_config.enable_expert_parallel = self.od_config.parallel_config.enable_expert_parallel
+        vllm_config.profiler_config = self.od_config.profiler_config
         self.vllm_config = vllm_config
 
         # Initialize distributed environment
@@ -147,6 +141,41 @@ def init_device(self) -> None:
             )
             init_workspace_manager(self.device)
 
+    def _create_profiler(self) -> WorkerProfiler | None:
+        profiler_config = getattr(self.od_config, "profiler_config", None)
+        if self.vllm_config is not None:
+            self.vllm_config.profiler_config = profiler_config
+
+        profiler_type = getattr(profiler_config, "profiler", None)
+        if profiler_type == "torch":
+            return create_omni_profiler(
+                profiler_config=profiler_config,
+                worker_name=f"diffusion-rank-{self.rank}",
+                local_rank=self.local_rank,
+            )
+        if profiler_type == "cuda":
+            try:
+                return CudaProfilerWrapper(profiler_config)
+            except Exception as exc:
+                logger.warning(
+                    "Failed to initialize CUDA profiler on diffusion worker %s: %s",
+                    self.rank,
+                    exc,
+                )
+                return None
+        if profiler_type is not None:
+            logger.warning("Unknown profiler backend %r on diffusion worker %s", profiler_type, self.rank)
+        return None
+
+    def _profiler_context(self, name: str) -> AbstractContextManager:
+        if self.profiler is None:
+            return nullcontext()
+        return self.profiler.annotate_context_manager(name)
+
+    def _step_profiler(self) -> None:
+        if self.profiler is not None:
+            self.profiler.step()
+
     def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None:
         """Load the diffusion model using DiffusionModelRunner."""
         with (
@@ -187,7 +216,7 @@ def generate(self, request: OmniDiffusionRequest) -> DiffusionOutput:
         """Generate output for the given requests."""
         return self.execute_model(request, self.od_config)
 
-    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
+    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> dict[str, Any] | None:
         """Start or stop profiling for this GPU worker.
 
         Args:
@@ -203,16 +232,18 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N
             return
 
         if is_start:
-            from vllm_omni.profiler import OmniTorchProfilerWrapper
-
             if isinstance(self.profiler, OmniTorchProfilerWrapper):
                 import time
 
-                filename = profile_prefix or f"diffusion_{int(time.time())}"
+                filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}"
                 self.profiler.set_trace_filename(filename)
             self.profiler.start()
-        else:
-            self.profiler.stop()
+            return None
+
+        self.profiler.stop()
+        if isinstance(self.profiler, OmniTorchProfilerWrapper):
+            return self.profiler.get_results()
+        return None
 
     def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput:
         """Execute a forward pass by delegating to the model runner."""
@@ -224,7 +255,10 @@ def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfi
                 if req.sampling_params.lora_request is not None:
                     raise
                 logger.warning("LoRA activation skipped: %s", exc)
-        return self.model_runner.execute_model(req)
+        with self._profiler_context("diffusion_forward"):
+            output = self.model_runner.execute_model(req)
+        self._step_profiler()
+        return output
 
     def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput:
         """Execute one diffusion step by delegating to the model runner."""
@@ -236,8 +270,10 @@ def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> Runner
 
         if any(new_req.req.sampling_params.lora_request is not None for new_req in scheduler_output.scheduled_new_reqs):
             raise ValueError("Step mode does not support LoRA yet.")
-
-        return self.model_runner.execute_stepwise(scheduler_output)
+        with self._profiler_context("diffusion_step"):
+            output = self.model_runner.execute_stepwise(scheduler_output)
+        self._step_profiler()
+        return output
 
     def load_weights(self, weights) -> set[str]:
         """Load weights by delegating to the model runner."""

From 26602fe0996118dbd9c44deebc46e385c12b68ed Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Tue, 3 Feb 2026 14:28:59 +0800
Subject: [PATCH 04/13] [Profiler] Remove online profiling endpoints, focus on
 nsys integration

Remove the HTTP /start_profile and /stop_profile endpoint registration
from api_server.py, because online profiling is being handled separately
by another contributor.

This PR now focuses purely on nsys integration for diffusion workers:
- CudaProfiler class with platform guards
- torch.cuda.profiler calls in DiffusionWorker.start_profile/stop_profile
- Updated docs for nsys usage with offline diffusion scripts

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
---
 vllm_omni/entrypoints/openai/api_server.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 972fee3b58d..d15dc90fe5d 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -246,19 +246,6 @@ async def show_available_models(self) -> ModelList:
 # Server entry points
 
 
-def _register_profiling_routes(app) -> None:
-    """Unconditionally register /start_profile and /stop_profile on the app.
-
-    vllm's build_app() only registers these routes when a profiler_config
-    is explicitly provided (e.g. --profiler-config).  For omni we always
-    want them available so that nsys profiling can be triggered via HTTP
-    without extra CLI flags.
-    """
-    from vllm.entrypoints.serve.profile.api_router import router as profile_router
-
-    app.include_router(profile_router)
-
-
 async def omni_run_server(args, **uvicorn_kwargs) -> None:
     """Run a single-worker API server.
 
@@ -318,10 +305,6 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None,
         _remove_route_from_app(app, "/v1/models", {"GET"})  # Remove upstream /v1/models to use omni's handler
         app.include_router(router)
 
-        # Register profiling endpoints directly on the app so they are
-        # available regardless of how vllm's build_app handles routers.
-        _register_profiling_routes(app)
-
         await omni_init_app_state(engine_client, app.state, args)
 
         # Conditionally register profiler endpoints based on stage YAML configs

From 8771777ccaaa85e36c584ef0ffa2183ef39438c7 Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Tue, 3 Feb 2026 16:02:20 +0800
Subject: [PATCH 05/13] Align diffusion profiling with vLLM

Signed-off-by: Jinheng Li <ahengljh@gmail.com>
---
 vllm_omni/diffusion/diffusion_engine.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
index 784da617529..1cfb36fdc7b 100644
--- a/vllm_omni/diffusion/diffusion_engine.py
+++ b/vllm_omni/diffusion/diffusion_engine.py
@@ -332,8 +332,8 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus
                         missing_result_error="Diffusion execution finished without a final output.",
                     )
 
-    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
-        """Start or stop torch profiling on all diffusion workers.
+    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> list[Any] | None:
+        """Start or stop profiling on all diffusion workers.
 
         Args:
             is_start: True to start profiling, False to stop.
@@ -351,12 +351,13 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N
             logger.info("Stopping diffusion profiling...")
 
         try:
-            self.collective_rpc(method="profile", args=(is_start, profile_prefix))
+            return self.collective_rpc(method="profile", args=(is_start, profile_prefix))
         except Exception as e:
             action = "start" if is_start else "stop"
             logger.error(f"Failed to {action} profiling on workers", exc_info=True)
             if is_start:
                 raise RuntimeError(f"Could not {action} profiler: {e}") from e
+            return None
 
     def _dummy_run(self):
         """A dummy run to warm up the model."""

From d54e6e3e422f8a2e624874f4d91033290e6afcce Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Tue, 7 Apr 2026 14:46:59 +0800
Subject: [PATCH 06/13] Add CUDA profiler coverage for diffusion worker

Signed-off-by: Jinheng Li <ahengljh@gmail.com>
---
 .../test_diffusion_worker_cuda_profiler.py    | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 tests/diffusion/test_diffusion_worker_cuda_profiler.py

diff --git a/tests/diffusion/test_diffusion_worker_cuda_profiler.py b/tests/diffusion/test_diffusion_worker_cuda_profiler.py
new file mode 100644
index 00000000000..ddc2aed2fc2
--- /dev/null
+++ b/tests/diffusion/test_diffusion_worker_cuda_profiler.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import MagicMock
+
+import pytest
+from pytest_mock import MockerFixture
+
+from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker
+
+pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+
+
+@pytest.fixture
+def mock_od_config(mocker: MockerFixture):
+    """Create a mock OmniDiffusionConfig with a CUDA profiler backend."""
+    config = mocker.Mock()
+    config.profiler_config = mocker.Mock()  # tests may overwrite this (e.g. with None)
+    config.profiler_config.profiler = "cuda"  # backend name the tests expect to map to CudaProfilerWrapper
+    config.diffusion_load_format = "default"  # NOTE(review): presumably read by the worker constructor; confirm
+    return config
+
+
+@pytest.fixture
+def mock_diffusion_worker_dependencies(mocker: MockerFixture):
+    """Patch heavy worker dependencies for focused profiler tests."""
+    mocker.patch.object(DiffusionWorker, "init_device")  # replaced by a mock; the real method never runs
+    mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.DiffusionModelRunner")  # runner class mocked
+
+
+class TestDiffusionWorkerCudaProfiler:  # DiffusionWorker profiler wiring when the backend is "cuda"
+    def test_creates_cuda_profiler_wrapper(
+        self,
+        mocker: MockerFixture,
+        mock_od_config,
+        mock_diffusion_worker_dependencies,
+    ):
+        fake_profiler = mocker.Mock()  # stands in for the CudaProfilerWrapper instance
+        cuda_profiler = mocker.patch(
+            "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper",
+            return_value=fake_profiler,
+        )
+        create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler")
+
+        worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True)
+
+        cuda_profiler.assert_called_once_with(mock_od_config.profiler_config)  # built from the profiler config
+        create_omni_profiler.assert_not_called()  # the other (patched) profiler factory is not used
+        assert worker.profiler is fake_profiler  # the worker exposes the wrapper it constructed
+
+    def test_profile_start_stop_delegates_to_cuda_profiler(
+        self,
+        mocker: MockerFixture,
+        mock_od_config,
+        mock_diffusion_worker_dependencies,
+    ):
+        fake_profiler = mocker.Mock()
+        fake_profiler.start = MagicMock()  # explicit mocks for the two calls asserted below
+        fake_profiler.stop = MagicMock()
+        mocker.patch(
+            "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper",
+            return_value=fake_profiler,
+        )
+
+        worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True)
+
+        assert worker.profile(is_start=True) is None  # profile() yields no value when starting
+        assert worker.profile(is_start=False) is None  # ...nor when stopping
+
+        fake_profiler.start.assert_called_once_with()  # exactly one call, with no arguments
+        fake_profiler.stop.assert_called_once_with()  # exactly one call, with no arguments
+
+    def test_returns_none_when_profiler_config_is_missing(
+        self,
+        mocker: MockerFixture,
+        mock_od_config,
+        mock_diffusion_worker_dependencies,
+    ):
+        mock_od_config.profiler_config = None  # simulate a config with no profiler section
+        cuda_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper")
+        create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler")
+
+        worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True)
+
+        cuda_profiler.assert_not_called()  # neither backend may be constructed
+        create_omni_profiler.assert_not_called()
+        assert worker.profiler is None  # the worker ends up without a profiler
+
+    def test_cuda_backend_does_not_use_torch_profiler_factory(
+        self,
+        mocker: MockerFixture,
+        mock_od_config,
+        mock_diffusion_worker_dependencies,
+    ):
+        mocker.patch(
+            "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper",
+            return_value=mocker.Mock(),
+        )
+        create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler")
+
+        DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True)  # construct only
+
+        create_omni_profiler.assert_not_called()  # constructing with the "cuda" backend never calls this factory

From 46180c138699b6c591fe75afb4e41a1127f01466 Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Wed, 8 Apr 2026 15:26:52 +0800
Subject: [PATCH 07/13] chore: trigger CI rerun

Signed-off-by: Jinheng Li <ahengljh@gmail.com>

From 9871942ba4f1cd8b87f6d7cc9e0923744c37132b Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Wed, 8 Apr 2026 17:27:06 +0800
Subject: [PATCH 08/13] fix: make diffusion worker profiler helpers defensive

Signed-off-by: Jinheng Li <ahengljh@gmail.com>
---
 vllm_omni/diffusion/worker/diffusion_worker.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index 260682135e4..c0d54589558 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -168,13 +168,15 @@ def _create_profiler(self) -> WorkerProfiler | None:
         return None
 
     def _profiler_context(self, name: str) -> AbstractContextManager:
-        if self.profiler is None:
+        profiler = getattr(self, "profiler", None)
+        if profiler is None:
             return nullcontext()
-        return self.profiler.annotate_context_manager(name)
+        return profiler.annotate_context_manager(name)
 
     def _step_profiler(self) -> None:
-        if self.profiler is not None:
-            self.profiler.step()
+        profiler = getattr(self, "profiler", None)
+        if profiler is not None:
+            profiler.step()
 
     def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None:
         """Load the diffusion model using DiffusionModelRunner."""

From baf2cf7923bbc99b3be20cbb804d7156bd8c2346 Mon Sep 17 00:00:00 2001
From: Canlin Guo <961750412@qq.com>
Date: Thu, 9 Apr 2026 10:00:59 +0800
Subject: [PATCH 09/13] clean code

Signed-off-by: Canlin Guo <961750412@qq.com>
---
 vllm_omni/diffusion/diffusion_engine.py       | 11 +---
 .../diffusion/worker/diffusion_worker.py      | 58 +++++--------------
 2 files changed, 18 insertions(+), 51 deletions(-)

diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
index 1cfb36fdc7b..17391ac191a 100644
--- a/vllm_omni/diffusion/diffusion_engine.py
+++ b/vllm_omni/diffusion/diffusion_engine.py
@@ -332,16 +332,12 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus
                         missing_result_error="Diffusion execution finished without a final output.",
                     )
 
-    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> list[Any] | None:
+    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
         """Start or stop profiling on all diffusion workers.
 
         Args:
             is_start: True to start profiling, False to stop.
-            profile_prefix: Optional prefix for trace filename (vLLM compat).
-
-        Note:
-            Matches vLLM's worker.profile() signature for consistency.
-            Traces are saved automatically via on_trace_ready callback.
+            profile_prefix: Optional prefix for trace filename.
         """
         if is_start:
             if profile_prefix is None:
@@ -351,13 +347,12 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> l
             logger.info("Stopping diffusion profiling...")
 
         try:
-            return self.collective_rpc(method="profile", args=(is_start, profile_prefix))
+            self.collective_rpc(method="profile", args=(is_start, profile_prefix))
         except Exception as e:
             action = "start" if is_start else "stop"
             logger.error(f"Failed to {action} profiling on workers", exc_info=True)
             if is_start:
                 raise RuntimeError(f"Could not {action} profiler: {e}") from e
-            return None
 
     def _dummy_run(self):
         """A dummy run to warm up the model."""
diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index c0d54589558..09d46f09206 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -142,42 +142,20 @@ def init_device(self) -> None:
             init_workspace_manager(self.device)
 
     def _create_profiler(self) -> WorkerProfiler | None:
-        profiler_config = getattr(self.od_config, "profiler_config", None)
-        if self.vllm_config is not None:
-            self.vllm_config.profiler_config = profiler_config
-
+        profiler_config = self.od_config.profiler_config
         profiler_type = getattr(profiler_config, "profiler", None)
         if profiler_type == "torch":
             return create_omni_profiler(
                 profiler_config=profiler_config,
-                worker_name=f"diffusion-rank-{self.rank}",
+                worker_name=f"diffusion_rank{self.rank}",
                 local_rank=self.local_rank,
             )
         if profiler_type == "cuda":
-            try:
-                return CudaProfilerWrapper(profiler_config)
-            except Exception as exc:
-                logger.warning(
-                    "Failed to initialize CUDA profiler on diffusion worker %s: %s",
-                    self.rank,
-                    exc,
-                )
-                return None
+            return CudaProfilerWrapper(profiler_config)
         if profiler_type is not None:
             logger.warning("Unknown profiler backend %r on diffusion worker %s", profiler_type, self.rank)
         return None
 
-    def _profiler_context(self, name: str) -> AbstractContextManager:
-        profiler = getattr(self, "profiler", None)
-        if profiler is None:
-            return nullcontext()
-        return profiler.annotate_context_manager(name)
-
-    def _step_profiler(self) -> None:
-        profiler = getattr(self, "profiler", None)
-        if profiler is not None:
-            profiler.step()
-
     def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None:
         """Load the diffusion model using DiffusionModelRunner."""
         with (
@@ -218,34 +196,24 @@ def generate(self, request: OmniDiffusionRequest) -> DiffusionOutput:
         """Generate output for the given requests."""
         return self.execute_model(request, self.od_config)
 
-    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> dict[str, Any] | None:
+    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
         """Start or stop profiling for this GPU worker.
 
         Args:
             is_start: True to start profiling, False to stop.
-            profile_prefix: Optional prefix for trace filename (vLLM compat).
-
-        Note:
-            Matches vLLM's worker.profile() signature for consistency.
-            Traces are saved automatically via on_trace_ready callback.
+            profile_prefix: Optional prefix for trace filename.
         """
         if self.profiler is None:
-            logger.warning("Profiler not initialized, skipping profile(%s)", is_start)
             return
 
         if is_start:
             if isinstance(self.profiler, OmniTorchProfilerWrapper):
                 import time
-
                 filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}"
                 self.profiler.set_trace_filename(filename)
             self.profiler.start()
-            return None
-
-        self.profiler.stop()
-        if isinstance(self.profiler, OmniTorchProfilerWrapper):
-            return self.profiler.get_results()
-        return None
+        else:
+            self.profiler.stop()
 
     def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput:
         """Execute a forward pass by delegating to the model runner."""
@@ -257,9 +225,11 @@ def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfi
                 if req.sampling_params.lora_request is not None:
                     raise
                 logger.warning("LoRA activation skipped: %s", exc)
-        with self._profiler_context("diffusion_forward"):
+        ctx = self.profiler.annotate_context_manager("diffusion_forward") if self.profiler else nullcontext()
+        with ctx:
             output = self.model_runner.execute_model(req)
-        self._step_profiler()
+        if self.profiler:
+            self.profiler.step()
         return output
 
     def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput:
@@ -272,9 +242,11 @@ def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> Runner
 
         if any(new_req.req.sampling_params.lora_request is not None for new_req in scheduler_output.scheduled_new_reqs):
             raise ValueError("Step mode does not support LoRA yet.")
-        with self._profiler_context("diffusion_step"):
+        ctx = self.profiler.annotate_context_manager("diffusion_step") if self.profiler else nullcontext()
+        with ctx:
             output = self.model_runner.execute_stepwise(scheduler_output)
-        self._step_profiler()
+        if self.profiler:
+            self.profiler.step()
         return output
 
     def load_weights(self, weights) -> set[str]:

From 224e25e039b7f10776193b08686045ced85f5e9b Mon Sep 17 00:00:00 2001
From: Canlin Guo <961750412@qq.com>
Date: Thu, 9 Apr 2026 10:44:01 +0800
Subject: [PATCH 10/13] lint

Signed-off-by: Canlin Guo <961750412@qq.com>
---
 vllm_omni/diffusion/worker/diffusion_worker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index 09d46f09206..36588452e79 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -209,6 +209,7 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N
         if is_start:
             if isinstance(self.profiler, OmniTorchProfilerWrapper):
                 import time
+                
                 filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}"
                 self.profiler.set_trace_filename(filename)
             self.profiler.start()

From 0dfdc7735b23e2045e34a27c86827e3a973cb84f Mon Sep 17 00:00:00 2001
From: Canlin Guo <961750412@qq.com>
Date: Thu, 9 Apr 2026 10:47:52 +0800
Subject: [PATCH 11/13] fix lint

Signed-off-by: Canlin Guo <961750412@qq.com>
---
 vllm_omni/diffusion/worker/diffusion_worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index 36588452e79..0693e6e34ee 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -209,7 +209,7 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N
         if is_start:
             if isinstance(self.profiler, OmniTorchProfilerWrapper):
                 import time
-                
+
                 filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}"
                 self.profiler.set_trace_filename(filename)
             self.profiler.start()

From 90466977f2e1c7574580e9145545159454c4afa4 Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Thu, 9 Apr 2026 11:05:23 +0800
Subject: [PATCH 12/13] chore: trigger pipeline rerun

Signed-off-by: Jinheng Li <ahengljh@gmail.com>

From 3fdadd7d71acd4f312b243b567638d8c4b26bafc Mon Sep 17 00:00:00 2001
From: Jinheng Li <ahengljh@gmail.com>
Date: Thu, 9 Apr 2026 11:27:56 +0800
Subject: [PATCH 13/13] fix: restore defensive diffusion profiler access

Signed-off-by: Jinheng Li <ahengljh@gmail.com>
---
 .../diffusion/worker/diffusion_worker.py      | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index 0693e6e34ee..160309e0d8d 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -156,6 +156,9 @@ def _create_profiler(self) -> WorkerProfiler | None:
             logger.warning("Unknown profiler backend %r on diffusion worker %s", profiler_type, self.rank)
         return None
 
+    def _get_profiler(self) -> WorkerProfiler | None:
+        return getattr(self, "profiler", None)  # yields None when the instance has no `profiler` attribute
+
     def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None:
         """Load the diffusion model using DiffusionModelRunner."""
         with (
@@ -203,18 +206,19 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N
             is_start: True to start profiling, False to stop.
             profile_prefix: Optional prefix for trace filename.
         """
-        if self.profiler is None:
+        profiler = self._get_profiler()
+        if profiler is None:
             return
 
         if is_start:
-            if isinstance(self.profiler, OmniTorchProfilerWrapper):
+            if isinstance(profiler, OmniTorchProfilerWrapper):
                 import time
 
                 filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}"
-                self.profiler.set_trace_filename(filename)
-            self.profiler.start()
+                profiler.set_trace_filename(filename)
+            profiler.start()
         else:
-            self.profiler.stop()
+            profiler.stop()
 
     def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput:
         """Execute a forward pass by delegating to the model runner."""
@@ -226,11 +230,12 @@ def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfi
                 if req.sampling_params.lora_request is not None:
                     raise
                 logger.warning("LoRA activation skipped: %s", exc)
-        ctx = self.profiler.annotate_context_manager("diffusion_forward") if self.profiler else nullcontext()
+        profiler = self._get_profiler()
+        ctx = profiler.annotate_context_manager("diffusion_forward") if profiler else nullcontext()
         with ctx:
             output = self.model_runner.execute_model(req)
-        if self.profiler:
-            self.profiler.step()
+        if profiler:
+            profiler.step()
         return output
 
     def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput:
@@ -243,11 +248,12 @@ def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> Runner
 
         if any(new_req.req.sampling_params.lora_request is not None for new_req in scheduler_output.scheduled_new_reqs):
             raise ValueError("Step mode does not support LoRA yet.")
-        ctx = self.profiler.annotate_context_manager("diffusion_step") if self.profiler else nullcontext()
+        profiler = self._get_profiler()
+        ctx = profiler.annotate_context_manager("diffusion_step") if profiler else nullcontext()
         with ctx:
             output = self.model_runner.execute_stepwise(scheduler_output)
-        if self.profiler:
-            self.profiler.step()
+        if profiler:
+            profiler.step()
         return output
 
     def load_weights(self, weights) -> set[str]: