From f3d2fd12abf12b7a9db620c04f175a9d7759ea7e Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 11 Feb 2026 03:26:29 +0800
Subject: [PATCH 1/6] feat: add unified profiler with online serving and
 stage-aware endpoints

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 docs/contributing/profiling.md                | 296 ++++++++++++------
 examples/offline_inference/bagel/end2end.py   |  31 +-
 .../image_to_image/image_edit.py              |  43 +--
 .../image_to_video/image_to_video.py          |  39 +--
 .../lora_inference/lora_inference.py          |  31 +-
 .../offline_inference/qwen2_5_omni/end2end.py |  34 +-
 .../offline_inference/qwen3_omni/end2end.py   |  38 ++-
 .../offline_inference/qwen3_tts/end2end.py    |  39 ++-
 .../text_to_audio/text_to_audio.py            |  26 +-
 .../text_to_image/text_to_image.py            |  38 +--
 .../text_to_video/text_to_video.py            |  38 +--
 tests/e2e/online_serving/test_async_omni.py   |   2 +-
 .../openai_api/test_image_server.py           |   2 +-
 tests/entrypoints/test_omni_llm.py            |   2 +-
 tests/profiler/__init__.py                    |   2 +
 tests/profiler/test_api_router.py             | 118 +++++++
 tests/profiler/test_config.py                 | 160 ++++++++++
 tests/profiler/test_torch_profiler.py         | 196 ++++++++++++
 vllm_omni/benchmarks/patch/patch.py           |   5 +-
 vllm_omni/config/__init__.py                  |   2 +
 vllm_omni/diffusion/diffusion_engine.py       | 131 +-------
 vllm_omni/diffusion/profiler/base.py          |  58 ----
 .../diffusion/profiler/torch_profiler.py      | 126 --------
 .../diffusion/worker/diffusion_worker.py      |  31 +-
 vllm_omni/entrypoints/async_omni_llm.py       |  24 --
 vllm_omni/entrypoints/omni.py                 | 102 +++---
 vllm_omni/entrypoints/omni_diffusion.py       |  45 +--
 vllm_omni/entrypoints/omni_llm.py             |  42 +++
 vllm_omni/entrypoints/omni_stage.py           | 141 +++++----
 vllm_omni/entrypoints/openai/api_server.py    |  19 +-
 vllm_omni/entrypoints/serve/__init__.py       |   0
 .../entrypoints/serve/profile/__init__.py     |   0
 .../entrypoints/serve/profile/api_router.py   |  70 +++++
 vllm_omni/inputs/data.py                      |   1 -
 .../{diffusion => }/profiler/__init__.py      |   6 +-
 vllm_omni/profiler/config.py                  |  84 +++++
 vllm_omni/profiler/torch_profiler.py          | 117 +++++++
 37 files changed, 1447 insertions(+), 692 deletions(-)
 create mode 100644 tests/profiler/__init__.py
 create mode 100644 tests/profiler/test_api_router.py
 create mode 100644 tests/profiler/test_config.py
 create mode 100644 tests/profiler/test_torch_profiler.py
 delete mode 100644 vllm_omni/diffusion/profiler/base.py
 delete mode 100644 vllm_omni/diffusion/profiler/torch_profiler.py
 create mode 100644 vllm_omni/entrypoints/serve/__init__.py
 create mode 100644 vllm_omni/entrypoints/serve/profile/__init__.py
 create mode 100644 vllm_omni/entrypoints/serve/profile/api_router.py
 rename vllm_omni/{diffusion => }/profiler/__init__.py (52%)
 create mode 100644 vllm_omni/profiler/config.py
 create mode 100644 vllm_omni/profiler/torch_profiler.py

diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index a7df8c32297..cc4d931572d 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -2,148 +2,256 @@
 
 > **Warning:** Profiling incurs significant overhead. Use only for development and debugging, never in production.
 
-vLLM-Omni uses the PyTorch Profiler to analyze performance across both **multi-stage omni-modality models** and **diffusion models**.
+vLLM-Omni provides a profiling module (`vllm_omni/profiler/`) aligned with upstream vLLM 0.16.0 semantics. It captures **performance traces** (TensorBoard/Chrome traces) using `tensorboard_trace_handler` and supports delaying the start of profiling and capping the number of profiled iterations (`delay_iterations` / `max_iterations`).
 
-### 1. Set the Output Directory
-Before running any script, set this environment variable. The system detects this and automatically saves traces here.
+## Quick Start
 
-```bash
-export VLLM_TORCH_PROFILER_DIR=./profiles
+```python
+from vllm_omni import Omni
+from vllm_omni.profiler import ProfilerConfig
+
+# Configure profiler at initialization
+omni = Omni(
+    model="Tongyi-MAI/Z-Image-Turbo",
+    profiler_config=ProfilerConfig(
+        profiler="torch",
+        torch_profiler_dir="./profiles",
+    )
+)
+
+# Profile your workload (`sampling_params` must already be defined by your script)
+omni.start_profile()
+outputs = omni.generate({"prompt": "a cat"}, sampling_params)
+omni.stop_profile()
+
+# Trace files are written to ./profiles/ by each worker
 ```
 
-### 2. Profiling Omni-Modality Models
+## Command Line Usage
 
-It is best to limit profiling to one iteration to keep trace files manageable.
+All offline inference examples support profiling via CLI arguments:
 
 ```bash
-export VLLM_PROFILER_MAX_ITERS=1
+# Enable profiling
+python text_to_image.py --model MODEL --profile-dir ./profiles
 ```
 
-**Selective Stage Profiling**
-The profiler is default to function across all stages. But It is highly recommended to profile specific stages by passing the stages list, preventing from producing too large trace files:
+## ProfilerConfig
+
+```python
+from vllm_omni.profiler import ProfilerConfig
+
+ProfilerConfig(
+    profiler="torch",                          # Required: "torch" or "cuda"
+    torch_profiler_dir="./profiles",           # Required when profiler="torch"
+    torch_profiler_with_stack=True,            # Enable stack tracing
+    torch_profiler_with_flops=False,           # Enable FLOPS counting
+    torch_profiler_use_gzip=True,              # Save traces in gzip format
+    torch_profiler_dump_cuda_time_total=True,  # Dump CUDA time stats on stop
+    torch_profiler_record_shapes=False,        # Record tensor shapes
+    torch_profiler_with_memory=False,          # Enable memory profiling
+    delay_iterations=0,                        # Skip N iterations before starting
+    max_iterations=0,                          # Stop after N iterations (0=unlimited)
+)
+```
+
+### Serialization
+
+`ProfilerConfig` supports `to_dict()` / `from_dict()` for cross-process RPC serialization.
+
+## Output Files
+
+| File | Format | How to View |
+|------|--------|-------------|
+| `*.trace.json.gz` | TensorBoard trace | TensorBoard, chrome://tracing, or ui.perfetto.dev |
+| `profiler_out_*.txt` | CUDA time stats | Any text editor |
+
+---
+
+## Profiling Omni-Modality Models
+
+### Selective Stage Profiling
+
+Profile specific stages to keep trace files manageable:
+
 ```python
 # Profile all stages
-omni_llm.start_profile()
+omni.start_profile()
 
 # Only profile Stage 1
-omni_llm.start_profile(stages=[1])
+omni.start_profile(stages=[1])
+
+# Stage 0 (Thinker) and Stage 2 (Audio Decoder) for Qwen Omni
+omni.start_profile(stages=[0, 2])
 ```
 
-```python
-# Stage 0 (Thinker) and Stage 2 (Audio Decoder) for qwen omni
-omni_llm.start_profile(stages=[0, 2])
+### Examples
+
+- **Qwen2.5-Omni**: [examples/offline_inference/qwen2_5_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py)
+- **Qwen3-Omni**: [examples/offline_inference/qwen3_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py)
+
+---
+
+## Profiling Diffusion Models
+
+Diffusion profiling is end-to-end, capturing encoding, denoising loops, and decoding.
+
+### Minimizing Trace Size
+
+For profiling, minimize dimensions to keep trace files manageable:
+
+```bash
+# Minimize dimensions for profiling: small height/width, 2 frames, 2 steps.
+# (A comment cannot sit inside a backslash-continued command, so it goes here.)
+python image_to_video.py \
+    --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \
+    --image input.png \
+    --prompt "A cat playing with yarn" \
+    --profile-dir ./profiles \
+    --height 48 \
+    --width 64 \
+    --num_frames 2 \
+    --num_inference_steps 2
+```
 
-**Python Usage**: Wrap your generation logic with `start_profile()` and `stop_profile()`.
+### Examples
 
-```python
-from vllm_omni import omni_llm
+- **Image Edit**: [examples/offline_inference/image_to_image/image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py)
+- **Image to Video**: [examples/offline_inference/image_to_video/](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video)
+- **Text to Image**: [examples/offline_inference/text_to_image/text_to_image.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/text_to_image/text_to_image.py)
 
-profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR"))
+---
 
-# 1. Start profiling if enabled
-if profiler_enabled:
-    omni_llm.start_profile(stages=[0])
+## Viewing Traces
 
-# Initialize generator
-omni_generator = omni_llm.generate(prompts, sampling_params_list, py_generator=args.py_generator)
+### Performance Traces (`.trace.json.gz`)
 
-total_requests = len(prompts)
-processed_count = 0
+- [TensorBoard](https://www.tensorflow.org/tensorboard) (recommended)
+- [Perfetto UI](https://ui.perfetto.dev/)
+- `chrome://tracing` (Chrome only)
 
-# Main Processing Loop
-for stage_outputs in omni_generator:
+---
 
-    # ... [Output processing logic for text/audio would go here] ...
+## API Reference
 
-    # Update count to track when to stop profiling
-    processed_count += len(stage_outputs.request_output)
+### ProfilerConfig
 
-    # 2. Check if all requests are done to stop the profiler safely
-    if profiler_enabled and processed_count >= total_requests:
-        print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...")
+```python
+@dataclass
+class ProfilerConfig:
+    profiler: Literal["torch", "cuda"] | None = None
+    torch_profiler_dir: str = ""
+    torch_profiler_with_stack: bool = True
+    torch_profiler_with_flops: bool = False
+    torch_profiler_use_gzip: bool = True
+    torch_profiler_dump_cuda_time_total: bool = True
+    torch_profiler_record_shapes: bool = False
+    torch_profiler_with_memory: bool = False
+    delay_iterations: int = 0
+    max_iterations: int = 0
+```
+
+### TorchProfiler
+
+```python
+class TorchProfiler:
+    def __init__(self, config: ProfilerConfig, worker_name: str = "", local_rank: int = 0): ...
+    def start(self) -> None: ...
+    def stop(self) -> None: ...
+    def step(self) -> None: ...
+    def shutdown(self) -> None: ...
+    @property
+    def is_running(self) -> bool: ...
+```
 
-        # Stop the profiler while workers are still active
-        omni_llm.stop_profile()
+### Omni Methods
 
-        # Wait for traces to flush to disk
-        print("[Info] Waiting 30s for workers to write trace files to disk...")
-        time.sleep(30)
-        print("[Info] Trace export wait time finished.")
+```python
+# Start profiling for specified stages (None = all)
+omni.start_profile(stages: list[int] | None = None) -> None
 
-omni_llm.close()
+# Stop profiling for specified stages (None = all)
+omni.stop_profile(stages: list[int] | None = None) -> None
 ```
 
+---
 
-**Examples**:
+## Best Practices
 
-1. **Qwen2.5-Omni**:  [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py)
+1. **Profile specific stages**: Use `omni.start_profile(stages=[0])` to reduce overhead and file size
 
-2. **Qwen3-Omni**:   [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py)
+2. **Minimize dimensions for diffusion**: Use small height/width/frames/steps when profiling
 
+3. **Compare before/after**: Profile before and after optimizations to measure impact
 
-### 3. Profiling diffusion models
+4. **Use during development only**: Disable profiling in production for performance
 
-Diffusion profiling is End-to-End, capturing encoding, denoising loops, and decoding.
+---
 
-**CLI Usage:**
-```python
+## Troubleshooting
 
-python image_to_video.py \
-    --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \
-    --image qwen-bear.png \
-    --prompt "A cat playing with yarn, smooth motion" \
-    \
-    # Minimize Spatial Dimensions (Optional but helpful):
-    #    Drastically reduces memory usage so the profiler doesn't
-    #    crash due to overhead, though for accurate performance
-    #    tuning you often want target resolutions.
-    --height 48 \
-    --width 64 \
-    \
-    # Minimize Temporal Dimension (Frames):
-    #    Video models process 3D tensors (Time, Height, Width).
-    #    Reducing frames to the absolute minimum (2) keeps the
-    #    tensor size small, ensuring the trace file doesn't become
-    #    multi-gigabytes in size.
-    --num_frames 2 \
-    \
-    # Minimize Iteration Loop (Steps):
-    #    This is the most critical setting for profiling.
-    #    Diffusion models run the same loop X times.
-    #    Profiling 2 steps gives you the exact same performance
-    #    data as 50 steps, but saves minutes of runtime and
-    #    prevents the trace viewer from freezing.
-    --num_inference_steps 2 \
-    \
-    --guidance_scale 5.0 \
-    --guidance_scale_high 6.0 \
-    --boundary_ratio 0.875 \
-    --flow_shift 12.0 \
-    --fps 16 \
-    --output i2v_output.mp4
+| Issue | Cause | Solution |
+|-------|-------|----------|
+| Import error | Missing module | Check `vllm_omni/profiler/__init__.py` |
+| OOM during profiling | Profiler overhead | Reduce model dimensions |
+| Huge trace files | Too many steps/frames | Reduce `num_inference_steps`, `num_frames` |
+
+---
+
+## Online Serving Profiling
 
+When running the vLLM-Omni API server, profiling can be enabled via CLI
+and controlled via HTTP endpoints at runtime.
+
+### Starting the Server with Profiling Enabled
+
+```bash
+python -m vllm_omni.entrypoints.openai.api_server \
+    --model Qwen/Qwen2.5-Omni-7B \
+    --profiler-config profiler=torch,torch_profiler_dir=./profiles
 ```
 
-**Examples**:
+### HTTP Endpoints
 
-1. **Qwen image edit**:  [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py)
+| Method | Endpoint | Body | Description |
+|--------|----------|------|-------------|
+| POST | `/start_profile` | `{"stages": [0, 1, 2]}` (optional) | Start profiling |
+| POST | `/stop_profile` | `{"stages": [0, 1, 2]}` (optional) | Stop profiling |
 
-2. **Wan-AI/Wan2.2-I2V-A14B-Diffusers**:   [https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video)
+If `stages` is omitted or null, the request applies to all stages.
 
-> **Note:**
-As of now, asynchronous (online) profiling is not fully supported in vLLM-Omni. While start_profile() and stop_profile() methods exist, they are only reliable in offline inference scripts (e.g., the provided end2end.py examples). Do not use them in server-mode or streaming scenarios—traces may be incomplete or fail to flush.
+### Stage IDs for Qwen Omni Models
 
-### 4. Analyzing Omni Traces
+| Stage | Qwen2.5-Omni | Qwen3-Omni |
+|-------|-------------|------------|
+| 0 | Thinker (understanding) | Thinker (MoE understanding) |
+| 1 | Talker (text → RVQ codes) | Talker (code predictor) |
+| 2 | Code2Wav (codes → audio) | Code2Wav (codes → audio) |
 
-Output files are saved to your configured ```VLLM_TORCH_PROFILER_DIR```.
+### Examples
 
-**Output**
-**Chrome Trace** (```.json.gz```): Visual timeline of kernels and stages. Open in Perfetto UI.
+```bash
+# Profile all stages (default)
+curl -X POST http://localhost:8000/start_profile
 
-**Viewing Tools:**
+# Profile only the Thinker stage
+curl -X POST http://localhost:8000/start_profile \
+    -H "Content-Type: application/json" \
+    -d '{"stages": [0]}'
+
+# Profile Thinker and Talker stages
+curl -X POST http://localhost:8000/start_profile \
+    -H "Content-Type: application/json" \
+    -d '{"stages": [0, 1]}'
+
+# Stop profiling (traces written to torch_profiler_dir)
+curl -X POST http://localhost:8000/stop_profile
+```
 
-- [Perfetto](https://ui.perfetto.dev/)(recommended)
-- ```chrome://tracing```(Chrome only)
+### Tips
 
-**Note**: vLLM-Omni reuses the PyTorch Profiler infrastructure from vLLM. See the official vLLM profiler documentation:  [vLLM Profiling Guide](https://docs.vllm.ai/en/stable/contributing/profiling/)
+1. **Profile one stage at a time** for smaller, more focused traces
+2. **Profile the Thinker** (stage 0) to analyze LLM bottlenecks
+3. **Profile the Talker** (stage 1) to analyze codec generation
+4. **Profile Code2Wav** (stage 2) to analyze audio synthesis
+5. Trace files are named per-stage (e.g., `stage-0_*.trace.json.gz`)
diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py
index 8fa412065d8..e4c31506fb5 100644
--- a/examples/offline_inference/bagel/end2end.py
+++ b/examples/offline_inference/bagel/end2end.py
@@ -1,8 +1,10 @@
 import argparse
 import os
+import time
 from typing import cast
 
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType
+from vllm_omni.profiler import ProfilerConfig
 
 
 def parse_args():
@@ -46,6 +48,14 @@ def parse_args():
     parser.add_argument("--stage-configs-path", type=str, default=None)
     parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
 
+    # Profiler arguments
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
+
     args = parser.parse_args()
     return args
 
@@ -114,6 +124,12 @@ def main():
     else:
         from vllm_omni.entrypoints.omni import Omni
 
+        # Build profiler config from arguments
+        profiler_config = None
+        if args.profile_dir:
+            profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+            print(f"[Profiler] Output dir: {args.profile_dir}")
+
         omni_kwargs = {}
         if args.stage_configs_path:
             omni_kwargs["stage_configs_path"] = args.stage_configs_path
@@ -130,7 +146,7 @@ def main():
             }
         )
 
-        omni = Omni(model=model_name, **omni_kwargs)
+        omni = Omni(model=model_name, profiler_config=profiler_config, **omni_kwargs)
 
         formatted_prompts = []
         for p in args.prompts:
@@ -160,7 +176,20 @@ def main():
             if len(params_list) > 1:
                 params_list[1].num_inference_steps = args.steps  # type: ignore # The second stage is an OmniDiffusionSamplingParam
 
+        if profiler_config:
+            print("[Profiler] Starting profiling...")
+            omni.start_profile()
+
+        generation_start = time.perf_counter()
         omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list))
+        generation_end = time.perf_counter()
+        generation_time = generation_end - generation_start
+        print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)")
+
+        if profiler_config:
+            print("\n[Profiler] Stopping profiler and collecting results...")
+            omni.stop_profile()
+            print("[Profiler] Profiling stopped.")
 
     for i, req_output in enumerate(omni_outputs):
         images = getattr(req_output, "images", None)
diff --git a/examples/offline_inference/image_to_image/image_edit.py b/examples/offline_inference/image_to_image/image_edit.py
index 8f330e09d20..2562799e23e 100644
--- a/examples/offline_inference/image_to_image/image_edit.py
+++ b/examples/offline_inference/image_to_image/image_edit.py
@@ -82,6 +82,7 @@
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.outputs import OmniRequestOutput
 from vllm_omni.platforms import current_omni_platform
+from vllm_omni.profiler import ProfilerConfig
 
 
 def parse_args() -> argparse.Namespace:
@@ -306,6 +307,14 @@ def parse_args() -> argparse.Namespace:
         default=1,
         help="Number of ready layers (blocks) to keep on GPU during generation.",
     )
+
+    # Profiler arguments
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
     return parser.parse_args()
 
 
@@ -358,9 +367,16 @@ def main():
             # Note: coefficients will use model-specific defaults based on model_type
         }
 
+    # Build profiler config from arguments
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
+
     # Initialize Omni with appropriate pipeline
     omni = Omni(
         model=args.model,
+        profiler_config=profiler_config,
         enable_layerwise_offload=args.enable_layerwise_offload,
         layerwise_num_gpu_layers=args.layerwise_num_gpu_layers,
         vae_use_slicing=args.vae_use_slicing,
@@ -373,9 +389,6 @@ def main():
     )
     print("Pipeline loaded")
 
-    # Check if profiling is requested via environment variable
-    profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR"))
-
     # Time profiling for generation
     print(f"\n{'=' * 60}")
     print("Generation Configuration:")
@@ -393,12 +406,12 @@ def main():
     )
     print(f"{'=' * 60}\n")
 
-    generation_start = time.perf_counter()
-
-    if profiler_enabled:
+    if profiler_config:
         print("[Profiler] Starting profiling...")
         omni.start_profile()
 
+    generation_start = time.perf_counter()
+
     # Generate edited image
     outputs = omni.generate(
         {
@@ -422,22 +435,10 @@ def main():
     # Print profiling results
     print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)")
 
-    if profiler_enabled:
+    if profiler_config:
         print("\n[Profiler] Stopping profiler and collecting results...")
-        profile_results = omni.stop_profile()
-        if profile_results and isinstance(profile_results, dict):
-            traces = profile_results.get("traces", [])
-            print("\n" + "=" * 60)
-            print("PROFILING RESULTS:")
-            for rank, trace in enumerate(traces):
-                print(f"\nRank {rank}:")
-                if trace:
-                    print(f"  • Trace: {trace}")
-            if not traces:
-                print("  No traces collected.")
-            print("=" * 60)
-        else:
-            print("[Profiler] No valid profiling data returned.")
+        omni.stop_profile()
+        print("[Profiler] Profiling stopped.")
 
     if not outputs:
         raise ValueError("No output generated from omni.generate()")
diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py
index 8e8d3991559..e57831a9264 100644
--- a/examples/offline_inference/image_to_video/image_to_video.py
+++ b/examples/offline_inference/image_to_video/image_to_video.py
@@ -19,7 +19,6 @@
 """
 
 import argparse
-import os
 from pathlib import Path
 
 import numpy as np
@@ -31,6 +30,7 @@
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.outputs import OmniRequestOutput
 from vllm_omni.platforms import current_omni_platform
+from vllm_omni.profiler import ProfilerConfig
 
 
 def parse_args() -> argparse.Namespace:
@@ -98,6 +98,14 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Disable torch.compile and force eager execution.",
     )
+
+    # Profiler arguments
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
     return parser.parse_args()
 
 
@@ -131,13 +139,18 @@ def main():
     # Resize image to target dimensions
     image = image.resize((width, height), PIL.Image.Resampling.LANCZOS)
 
-    # Check if profiling is requested via environment variable
-    profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR"))
+    # Build profiler config from arguments
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
+
     parallel_config = DiffusionParallelConfig(
         cfg_parallel_size=args.cfg_parallel_size,
     )
     omni = Omni(
         model=args.model,
+        profiler_config=profiler_config,
         enable_layerwise_offload=args.enable_layerwise_offload,
         layerwise_num_gpu_layers=args.layerwise_num_gpu_layers,
         vae_use_slicing=args.vae_use_slicing,
@@ -149,7 +162,7 @@ def main():
         enforce_eager=args.enforce_eager,
     )
 
-    if profiler_enabled:
+    if profiler_config:
         print("[Profiler] Starting profiling...")
         omni.start_profile()
 
@@ -242,22 +255,10 @@ def main():
     export_to_video(video_array, str(output_path), fps=args.fps)
     print(f"Saved generated video to {output_path}")
 
-    if profiler_enabled:
+    if profiler_config:
         print("\n[Profiler] Stopping profiler and collecting results...")
-        profile_results = omni.stop_profile()
-        if profile_results and isinstance(profile_results, dict):
-            traces = profile_results.get("traces", [])
-            print("\n" + "=" * 60)
-            print("PROFILING RESULTS:")
-            for rank, trace in enumerate(traces):
-                print(f"\nRank {rank}:")
-                if trace:
-                    print(f"  • Trace: {trace}")
-            if not traces:
-                print("  No traces collected.")
-            print("=" * 60)
-        else:
-            print("[Profiler] No valid profiling data returned.")
+        omni.stop_profile()
+        print("[Profiler] Profiling stopped.")
 
 
 if __name__ == "__main__":
diff --git a/examples/offline_inference/lora_inference/lora_inference.py b/examples/offline_inference/lora_inference/lora_inference.py
index 5e4299edb84..79938b150a7 100644
--- a/examples/offline_inference/lora_inference/lora_inference.py
+++ b/examples/offline_inference/lora_inference/lora_inference.py
@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
+import time
 from pathlib import Path
 
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.lora.request import LoRARequest
 from vllm_omni.lora.utils import stable_lora_int_id
+from vllm_omni.profiler import ProfilerConfig
 
 
 def parse_args() -> argparse.Namespace:
@@ -56,6 +58,14 @@ def parse_args() -> argparse.Namespace:
         default=1.0,
         help="Scale factor for LoRA weights (default: 1.0).",
     )
+
+    # Profiler arguments
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
     return parser.parse_args()
 
 
@@ -64,13 +74,19 @@ def main():
 
     model = args.model
 
+    # Build profiler config from arguments
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
+
     omni_kwargs = {}
 
     if args.lora_path:
         omni_kwargs["lora_path"] = args.lora_path
         print(f"Using static LoRA from: {args.lora_path}")
 
-    omni = Omni(model=model, **omni_kwargs)
+    omni = Omni(model=model, profiler_config=profiler_config, **omni_kwargs)
 
     lora_request = None
     if args.lora_request_path:
@@ -106,7 +122,15 @@ def main():
         sampling_params.lora_request = lora_request
         sampling_params.lora_scale = args.lora_scale
 
+    if profiler_config:
+        print("[Profiler] Starting profiling...")
+        omni.start_profile()
+
+    generation_start = time.perf_counter()
     outputs = omni.generate(args.prompt, sampling_params)
+    generation_end = time.perf_counter()
+    generation_time = generation_end - generation_start
+    print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)")
 
     if not outputs or len(outputs) == 0:
         raise ValueError("No output generated from omni.generate()")
@@ -142,6 +166,11 @@ def main():
             img.save(save_path)
             print(f"Saved generated image to {save_path}")
 
+    if profiler_config:
+        print("\n[Profiler] Stopping profiler and collecting results...")
+        omni.stop_profile()
+        print("[Profiler] Profiling stopped.")
+
 
 if __name__ == "__main__":
     main()
diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py
index 7cd8e18737e..0ab7fbb2f7c 100644
--- a/examples/offline_inference/qwen2_5_omni/end2end.py
+++ b/examples/offline_inference/qwen2_5_omni/end2end.py
@@ -6,7 +6,6 @@
 """
 
 import os
-import time
 from typing import NamedTuple
 
 import librosa
@@ -21,6 +20,7 @@
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 from vllm_omni.entrypoints.omni import Omni
+from vllm_omni.profiler import ProfilerConfig
 
 SEED = 42
 
@@ -320,8 +320,15 @@ def main(args):
         query_result = query_func(audio_path=audio_path, sampling_rate=sampling_rate)
     else:
         query_result = query_func()
+    # Build profiler config from arguments
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
+
     omni_llm = Omni(
         model=model_name,
+        profiler_config=profiler_config,
         log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
         batch_timeout=args.batch_timeout,
@@ -377,8 +384,8 @@ def main(args):
         for i, prompt in enumerate(prompts):
             prompt["modalities"] = output_modalities
 
-    profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR"))
-    if profiler_enabled:
+    if profiler_config:
+        print("[Profiler] Starting profiling...")
         omni_llm.start_profile(stages=[0])
     omni_generator = omni_llm.generate(prompts, sampling_params_list, py_generator=args.py_generator)
 
@@ -416,14 +423,11 @@ def main(args):
                 print(f"Request ID: {request_id}, Saved audio to {output_wav}")
 
         processed_count += len(stage_outputs.request_output)
-        if profiler_enabled and processed_count >= total_requests:
+        if profiler_config and processed_count >= total_requests:
             print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...")
             # Stop the profiler while workers are still alive
             omni_llm.stop_profile()
-
-            print("[Info] Waiting 30s for workers to write massive trace files to disk...")
-            time.sleep(30)
-            print("[Info] Trace export wait finished.")
+            print("[Profiler] Profiling stopped.")
 
     omni_llm.close()
 
@@ -539,6 +543,20 @@ def parse_args():
         default=False,
         help="Use py_generator mode. The returned type of Omni.generate() is a Python Generator object.",
     )
+
+    # Profiling and output-location arguments
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,  # None keeps profiling off in this script
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Output directory for generated files (preferred over --output-wav).",
+    )
     return parser.parse_args()
 
 
diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py
index 263377762aa..f00248a696e 100644
--- a/examples/offline_inference/qwen3_omni/end2end.py
+++ b/examples/offline_inference/qwen3_omni/end2end.py
@@ -6,22 +6,22 @@
 """
 
 import os
-import time
 from typing import NamedTuple
 
 import librosa
 import numpy as np
 import soundfile as sf
-import vllm
 from PIL import Image
-from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset, video_to_ndarrays
 from vllm.multimodal.image import convert_image_mode
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+import vllm
+from vllm import SamplingParams
 from vllm_omni.entrypoints.omni import Omni
+from vllm_omni.profiler import ProfilerConfig
 
 SEED = 42
 
@@ -325,8 +325,15 @@ def main(args):
     else:
         query_result = query_func()
 
+    # Profiling is opt-in: a ProfilerConfig is built only when --profile-dir is set.
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
+
     omni_llm = Omni(
         model=model_name,
+        profiler_config=profiler_config,
         stage_configs_path=args.stage_configs_path,
         log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
@@ -383,8 +390,8 @@ def main(args):
         for i, prompt in enumerate(prompts):
             prompt["modalities"] = output_modalities
 
-    profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR"))
-    if profiler_enabled:
+    if profiler_config:
+        print("[Profiler] Starting profiling...")
         omni_llm.start_profile(stages=[0])
     omni_generator = omni_llm.generate(prompts, sampling_params_list, py_generator=args.py_generator)
     # Determine output directory: prefer --output-dir; fallback to --output-wav
@@ -433,14 +440,11 @@ def main(args):
                 print(f"Request ID: {request_id}, Saved audio to {output_wav}")
 
         processed_count += len(stage_outputs.request_output)
-        if profiler_enabled and processed_count >= total_requests:
+        if profiler_config and processed_count >= total_requests:
             print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...")
             # Stop the profiler while workers are still alive
             omni_llm.stop_profile()
-
-            print("[Info] Waiting 30s for workers to write trace files to disk...")
-            time.sleep(30)
-            print("[Info] Trace export wait time finished.")
+            print("[Profiler] Profiling stopped.")
     omni_llm.close()
 
 
@@ -559,6 +563,20 @@ def parse_args():
         help="Use py_generator mode. The returned type of Omni.generate() is a Python Generator object.",
     )
 
+    # Profiling and output-location arguments
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,  # None keeps profiling off in this script
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Output directory for generated files (preferred over --output-wav).",
+    )
+
     return parser.parse_args()
 
 
diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py
index 93aeba3ca5f..94442d1cae9 100644
--- a/examples/offline_inference/qwen3_tts/end2end.py
+++ b/examples/offline_inference/qwen3_tts/end2end.py
@@ -5,16 +5,18 @@
 """
 
 import os
+import time
 from typing import NamedTuple
 
 import soundfile as sf
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-from vllm import SamplingParams
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+from vllm import SamplingParams
 from vllm_omni import Omni
+from vllm_omni.profiler import ProfilerConfig
 
 
 class QueryResult(NamedTuple):
@@ -215,9 +217,16 @@ def main(args):
     else:
         query_result = query_func()
 
+    # Profiling is opt-in: a ProfilerConfig is built only when --profile-dir is set.
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
+
     model_name = query_result.model_name
     omni = Omni(
         model=model_name,
+        profiler_config=profiler_config,
         stage_configs_path=args.stage_configs_path,
         log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
@@ -240,6 +249,11 @@ def main(args):
     output_dir = args.output_dir if getattr(args, "output_dir", None) else args.output_wav
     os.makedirs(output_dir, exist_ok=True)
 
+    if profiler_config:  # non-None only when --profile-dir was given
+        print("[Profiler] Starting profiling...")
+        omni.start_profile()
+
+    generation_start = time.perf_counter()  # elapsed time is printed after the output loop
     omni_generator = omni.generate(query_result.inputs, sampling_params_list)
     for stage_outputs in omni_generator:
         for output in stage_outputs.request_output:
@@ -258,6 +272,15 @@ def main(args):
             sf.write(output_wav, audio_numpy, samplerate=audio_samplerate, format="WAV")
             print(f"Request ID: {request_id}, Saved audio to {output_wav}")
 
+    generation_end = time.perf_counter()  # taken once the generator loop above has finished
+    generation_time = generation_end - generation_start  # also spans the WAV writes in the loop
+    print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)")
+
+    if profiler_config:  # non-None only when --profile-dir was given
+        print("\n[Profiler] Stopping profiler and collecting results...")
+        omni.stop_profile()  # return value is not used here
+        print("[Profiler] Profiling stopped.")
+
 
 def parse_args():
     """Parse CLI arguments for offline TTS inference.
@@ -366,6 +389,20 @@ def parse_args():
         help="Mode tag for Base query x_vector_only_mode (default: icl).",
     )
 
+    # Profiling and output-location arguments
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,  # None keeps profiling off in this script
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Output directory for generated files (preferred over --output-wav).",
+    )
+
     return parser.parse_args()
 
 
diff --git a/examples/offline_inference/text_to_audio/text_to_audio.py b/examples/offline_inference/text_to_audio/text_to_audio.py
index 0a9efcca5ff..c7e17eb7d62 100644
--- a/examples/offline_inference/text_to_audio/text_to_audio.py
+++ b/examples/offline_inference/text_to_audio/text_to_audio.py
@@ -23,6 +23,7 @@
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.platforms import current_omni_platform
+from vllm_omni.profiler import ProfilerConfig
 
 
 def parse_args() -> argparse.Namespace:
@@ -90,6 +91,14 @@ def parse_args() -> argparse.Namespace:
         default=44100,
         help="Sample rate for output audio (Stable Audio uses 44100 Hz).",
     )
+
+    # Profiler arguments (profiling stays off unless --profile-dir is given)
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
     return parser.parse_args()
 
 
@@ -120,6 +129,12 @@ def main():
     args = parse_args()
     generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
 
+    # Profiling is opt-in: a ProfilerConfig is built only when --profile-dir is set.
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
+
     print(f"\n{'=' * 60}")
     print("Stable Audio Open - Text-to-Audio Generation")
     print(f"{'=' * 60}")
@@ -133,11 +148,15 @@ def main():
     print(f"{'=' * 60}\n")
 
     # Initialize Omni with Stable Audio model
-    omni = Omni(model=args.model)
+    omni = Omni(model=args.model, profiler_config=profiler_config)
 
     # Calculate audio end time
     audio_end_in_s = args.audio_start + args.audio_length
 
+    if profiler_config:  # non-None only when --profile-dir was given
+        print("[Profiler] Starting profiling...")
+        omni.start_profile()
+
     # Time profiling for generation
     generation_start = time.perf_counter()
 
@@ -214,6 +233,11 @@ def main():
 
     print(f"\nGenerated {args.audio_length}s of audio at {args.sample_rate} Hz")
 
+    if profiler_config:  # non-None only when --profile-dir was given
+        print("\n[Profiler] Stopping profiler and collecting results...")
+        omni.stop_profile()  # return value is not used here
+        print("[Profiler] Profiling stopped.")
+
 
 if __name__ == "__main__":
     main()
diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py
index 4b2817f8dee..03e8687a078 100644
--- a/examples/offline_inference/text_to_image/text_to_image.py
+++ b/examples/offline_inference/text_to_image/text_to_image.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
-import os
 import time
 from pathlib import Path
 
@@ -13,6 +12,7 @@
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.outputs import OmniRequestOutput
 from vllm_omni.platforms import current_omni_platform
+from vllm_omni.profiler import ProfilerConfig
 
 
 def parse_args() -> argparse.Namespace:
@@ -140,6 +140,14 @@ def parse_args() -> argparse.Namespace:
         default=1,
         help="Number of ranks used for VAE patch/tile parallelism (decode/encode).",
     )
+
+    # Profiler arguments (profiling stays off unless --profile-dir is given)
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
     return parser.parse_args()
 
 
@@ -185,11 +193,15 @@ def main():
         vae_patch_parallel_size=args.vae_patch_parallel_size,
     )
 
-    # Check if profiling is requested via environment variable
-    profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR"))
+    # Profiling is opt-in: a ProfilerConfig is built only when --profile-dir is set.
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
 
     omni = Omni(
         model=args.model,
+        profiler_config=profiler_config,
         enable_layerwise_offload=args.enable_layerwise_offload,
         layerwise_num_gpu_layers=args.layerwise_num_gpu_layers,
         vae_use_slicing=args.vae_use_slicing,
@@ -202,7 +214,7 @@ def main():
         enable_cpu_offload=args.enable_cpu_offload,
     )
 
-    if profiler_enabled:
+    if profiler_config:
         print("[Profiler] Starting profiling...")
         omni.start_profile()
 
@@ -243,22 +255,10 @@ def main():
     # Print profiling results
     print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)")
 
-    if profiler_enabled:
+    if profiler_config:
         print("\n[Profiler] Stopping profiler and collecting results...")
-        profile_results = omni.stop_profile()
-        if profile_results and isinstance(profile_results, dict):
-            traces = profile_results.get("traces", [])
-            print("\n" + "=" * 60)
-            print("PROFILING RESULTS:")
-            for rank, trace in enumerate(traces):
-                print(f"\nRank {rank}:")
-                if trace:
-                    print(f"  • Trace: {trace}")
-            if not traces:
-                print("  No traces collected.")
-            print("=" * 60)
-        else:
-            print("[Profiler] No valid profiling data returned.")
+        omni.stop_profile()
+        print("[Profiler] Profiling stopped.")
 
     # Extract images from OmniRequestOutput
     # omni.generate() returns list[OmniRequestOutput], extract images from the first output
diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py
index 903fc001f33..29090a3f47f 100644
--- a/examples/offline_inference/text_to_video/text_to_video.py
+++ b/examples/offline_inference/text_to_video/text_to_video.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
-import os
 import time
 from pathlib import Path
 
@@ -14,6 +13,7 @@
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.outputs import OmniRequestOutput
 from vllm_omni.platforms import current_omni_platform
+from vllm_omni.profiler import ProfilerConfig
 
 
 def parse_args() -> argparse.Namespace:
@@ -115,6 +115,14 @@ def parse_args() -> argparse.Namespace:
         default=1,
         help="Number of GPUs used for tensor parallelism (TP) inside the DiT.",
     )
+
+    # Profiler arguments (profiling stays off unless --profile-dir is given)
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default=None,
+        help="Directory to save profiling outputs. Enables profiling when set.",
+    )
     return parser.parse_args()
 
 
@@ -149,11 +157,15 @@ def main():
         tensor_parallel_size=args.tensor_parallel_size,
     )
 
-    # Check if profiling is requested via environment variable
-    profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR"))
+    # Profiling is opt-in: a ProfilerConfig is built only when --profile-dir is set.
+    profiler_config = None
+    if args.profile_dir:
+        profiler_config = ProfilerConfig(profiler="torch", torch_profiler_dir=args.profile_dir)
+        print(f"[Profiler] Output dir: {args.profile_dir}")
 
     omni = Omni(
         model=args.model,
+        profiler_config=profiler_config,
         enable_layerwise_offload=args.enable_layerwise_offload,
         layerwise_num_gpu_layers=args.layerwise_num_gpu_layers,
         vae_use_slicing=args.vae_use_slicing,
@@ -168,7 +180,7 @@ def main():
         enforce_eager=args.enforce_eager,
     )
 
-    if profiler_enabled:
+    if profiler_config:
         print("[Profiler] Starting profiling...")
         omni.start_profile()
 
@@ -266,22 +278,10 @@ def main():
     export_to_video(video_array, str(output_path), fps=args.fps)
     print(f"Saved generated video to {output_path}")
 
-    if profiler_enabled:
+    if profiler_config:
         print("\n[Profiler] Stopping profiler and collecting results...")
-        profile_results = omni.stop_profile()
-        if profile_results and isinstance(profile_results, dict):
-            traces = profile_results.get("traces", [])
-            print("\n" + "=" * 60)
-            print("PROFILING RESULTS:")
-            for rank, trace in enumerate(traces):
-                print(f"\nRank {rank}:")
-                if trace:
-                    print(f"  • Trace: {trace}")
-            if not traces:
-                print("  No traces collected.")
-            print("=" * 60)
-        else:
-            print("[Profiler] No valid profiling data returned.")
+        omni.stop_profile()
+        print("[Profiler] Profiling stopped.")
 
 
 if __name__ == "__main__":
diff --git a/tests/e2e/online_serving/test_async_omni.py b/tests/e2e/online_serving/test_async_omni.py
index cab3e6e2286..55ccfc3359c 100644
--- a/tests/e2e/online_serving/test_async_omni.py
+++ b/tests/e2e/online_serving/test_async_omni.py
@@ -5,10 +5,10 @@
 from pathlib import Path
 
 import pytest
-from vllm import SamplingParams
 from vllm.inputs import PromptType
 
 from tests.utils import hardware_test
+from vllm import SamplingParams
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index 0c6479ccea7..49031ee97f9 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -15,8 +15,8 @@
 import pytest
 from fastapi.testclient import TestClient
 from PIL import Image
-from vllm import SamplingParams
 
+from vllm import SamplingParams
 from vllm_omni.entrypoints.openai.image_api_utils import (
     encode_image_base64,
     parse_size,
diff --git a/tests/entrypoints/test_omni_llm.py b/tests/entrypoints/test_omni_llm.py
index 4f05575ca59..33fd002e73a 100644
--- a/tests/entrypoints/test_omni_llm.py
+++ b/tests/entrypoints/test_omni_llm.py
@@ -5,8 +5,8 @@
 from unittest.mock import MagicMock
 
 import pytest
-from vllm import SamplingParams
 
+from vllm import SamplingParams
 from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
diff --git a/tests/profiler/__init__.py b/tests/profiler/__init__.py
new file mode 100644
index 00000000000..208f01a7cb5
--- /dev/null
+++ b/tests/profiler/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/tests/profiler/test_api_router.py b/tests/profiler/test_api_router.py
new file mode 100644
index 00000000000..496acc2ffdc
--- /dev/null
+++ b/tests/profiler/test_api_router.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Unit tests for the stage-aware profiler HTTP endpoints.
+
+These tests use FastAPI TestClient with a mocked engine_client,
+so they run without GPU or model weights.
+"""
+
+from types import SimpleNamespace
+from unittest.mock import AsyncMock
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from vllm_omni.entrypoints.serve.profile.api_router import attach_router, router
+
+
+def _make_app(profiler_config=None) -> FastAPI:
+    """Build a bare FastAPI app whose state carries a mocked engine client."""
+    # start/stop are pinned as AsyncMocks; the endpoint tests assert on their awaits.
+    engine_stub = AsyncMock()
+    engine_stub.start_profile = AsyncMock()
+    engine_stub.stop_profile = AsyncMock()
+
+    test_app = FastAPI()
+    test_app.state.args = SimpleNamespace(profiler_config=profiler_config)
+    test_app.state.engine_client = engine_stub
+    return test_app
+
+
+class TestAttachRouter:
+    """attach_router() only exposes the endpoints when a profiler is configured."""
+
+    def test_attaches_when_profiler_set(self):
+        """A config with profiler="torch" gets both profiling routes."""
+        app = _make_app(profiler_config=SimpleNamespace(profiler="torch", torch_profiler_dir="/tmp"))
+        attach_router(app)
+
+        # Collect paths only from routes that expose a .path attribute.
+        registered = {route.path for route in app.routes if hasattr(route, "path")}
+        assert "/start_profile" in registered
+        assert "/stop_profile" in registered
+
+    def test_not_attached_when_no_config(self):
+        """With profiler_config=None neither profiling route exists."""
+        app = _make_app(profiler_config=None)
+        attach_router(app)
+
+        registered = {route.path for route in app.routes if hasattr(route, "path")}
+        assert "/start_profile" not in registered
+        assert "/stop_profile" not in registered
+
+    def test_not_attached_when_profiler_none(self):
+        """A config object whose .profiler is None gets no profiling routes."""
+        disabled = SimpleNamespace(profiler=None)
+        app = _make_app(profiler_config=disabled)
+        attach_router(app)
+
+        registered = {route.path for route in app.routes if hasattr(route, "path")}
+        assert "/start_profile" not in registered
+        assert "/stop_profile" not in registered
+
+
+class TestStartProfileEndpoint:
+    """POST /start_profile hands the requested stages to the engine client."""
+
+    @pytest.fixture()
+    def client(self):
+        test_app = _make_app()
+        test_app.include_router(router)
+        return TestClient(test_app)
+
+    def test_start_profile_no_body(self, client):
+        """A request without a body is forwarded as stages=None."""
+        engine_client = client.app.state.engine_client
+        response = client.post("/start_profile")
+        assert response.status_code == 200
+        engine_client.start_profile.assert_awaited_once_with(stages=None)
+
+    def test_start_profile_with_stages(self, client):
+        """A single-element stages list is forwarded unchanged."""
+        engine_client = client.app.state.engine_client
+        response = client.post("/start_profile", json={"stages": [0]})
+        assert response.status_code == 200
+        engine_client.start_profile.assert_awaited_once_with(stages=[0])
+
+    def test_start_profile_multiple_stages(self, client):
+        """A multi-element stages list is forwarded unchanged."""
+        engine_client = client.app.state.engine_client
+        response = client.post("/start_profile", json={"stages": [0, 2]})
+        assert response.status_code == 200
+        engine_client.start_profile.assert_awaited_once_with(stages=[0, 2])
+
+
+class TestStopProfileEndpoint:
+    """POST /stop_profile hands the requested stages to the engine client."""
+
+    @pytest.fixture()
+    def client(self):
+        test_app = _make_app()
+        test_app.include_router(router)
+        return TestClient(test_app)
+
+    def test_stop_profile_no_body(self, client):
+        """A request without a body is forwarded as stages=None."""
+        engine_client = client.app.state.engine_client
+        response = client.post("/stop_profile")
+        assert response.status_code == 200
+        engine_client.stop_profile.assert_awaited_once_with(stages=None)
+
+    def test_stop_profile_with_stages(self, client):
+        """A stages list in the JSON body is forwarded unchanged."""
+        engine_client = client.app.state.engine_client
+        response = client.post("/stop_profile", json={"stages": [1]})
+        assert response.status_code == 200
+        engine_client.stop_profile.assert_awaited_once_with(stages=[1])
diff --git a/tests/profiler/test_config.py b/tests/profiler/test_config.py
new file mode 100644
index 00000000000..71c05c826fb
--- /dev/null
+++ b/tests/profiler/test_config.py
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import tempfile
+
+import pytest
+
+from vllm_omni.profiler import ProfilerConfig
+
+
+class TestProfilerConfig:
+    def test_default_config(self):
+        """A bare ProfilerConfig() carries the expected defaults."""
+        cfg = ProfilerConfig()
+        assert cfg.profiler is None
+        assert cfg.torch_profiler_dir == ""
+        assert cfg.torch_profiler_with_stack is True
+        assert cfg.torch_profiler_with_flops is False
+        assert cfg.torch_profiler_use_gzip is True
+        assert cfg.torch_profiler_dump_cuda_time_total is True
+        assert cfg.torch_profiler_record_shapes is False
+        assert cfg.torch_profiler_with_memory is False
+        assert cfg.delay_iterations == 0
+        assert cfg.max_iterations == 0
+
+    def test_torch_profiler_config(self):
+        """profiler="torch" plus a directory is accepted; the dir equals its absolute path."""
+        with tempfile.TemporaryDirectory() as out_dir:
+            cfg = ProfilerConfig(profiler="torch", torch_profiler_dir=out_dir)
+            assert cfg.profiler == "torch"
+            assert cfg.torch_profiler_dir == os.path.abspath(out_dir)
+
+    def test_dir_without_profiler_raises(self):
+        """A trace directory without profiler="torch" is rejected with ValueError."""
+        # The constructor call itself is expected to raise, so nothing is bound.
+        with tempfile.TemporaryDirectory() as out_dir, pytest.raises(ValueError, match="only applicable"):
+            ProfilerConfig(torch_profiler_dir=out_dir)
+
+    def test_torch_without_dir_raises(self):
+        """profiler="torch" with no directory is rejected with ValueError."""
+        with pytest.raises(ValueError, match="must be set"):
+            ProfilerConfig(profiler="torch")
+
+    def test_to_dict(self):
+        """to_dict() yields a plain dict holding the configured values."""
+        with tempfile.TemporaryDirectory() as out_dir:
+            dumped = ProfilerConfig(profiler="torch", torch_profiler_dir=out_dir).to_dict()
+            expected_dir = os.path.abspath(out_dir)
+            assert isinstance(dumped, dict)
+            assert dumped["profiler"] == "torch"
+            assert dumped["torch_profiler_dir"] == expected_dir
+
+    def test_from_dict(self):
+        """from_dict() applies the values given in the mapping."""
+        with tempfile.TemporaryDirectory() as out_dir:
+            payload = {
+                "profiler": "torch",
+                "torch_profiler_dir": out_dir,
+                "torch_profiler_with_stack": False,
+            }
+            cfg = ProfilerConfig.from_dict(payload)
+            assert cfg.profiler == "torch"
+            assert cfg.torch_profiler_with_stack is False
+
+    def test_from_dict_ignores_unknown_fields(self):
+        """An unrecognised key does not end up as an attribute on the config."""
+        with tempfile.TemporaryDirectory() as out_dir:
+            payload = {
+                "profiler": "torch",
+                "torch_profiler_dir": out_dir,
+                "unknown_field": "value",
+            }
+            cfg = ProfilerConfig.from_dict(payload)
+            assert cfg.profiler == "torch"
+            assert not hasattr(cfg, "unknown_field")
+
+    def test_roundtrip(self):
+        """to_dict() followed by from_dict() preserves the checked fields."""
+        with tempfile.TemporaryDirectory() as out_dir:
+            original = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=out_dir,
+                delay_iterations=5,
+                max_iterations=100,
+            )
+            restored = ProfilerConfig.from_dict(original.to_dict())
+            # Only these four fields are compared; other fields are not checked here.
+            checked = ("profiler", "torch_profiler_dir", "delay_iterations", "max_iterations")
+            for field_name in checked:
+                assert getattr(restored, field_name) == getattr(original, field_name)
+
+    def test_dir_expanded(self):
+        """A leading ~ in the directory is expanded to an absolute path."""
+        cfg = ProfilerConfig(profiler="torch", torch_profiler_dir="~/profiles")
+        assert "~" not in cfg.torch_profiler_dir
+        assert os.path.isabs(cfg.torch_profiler_dir)
+
+
+class TestProfilerConfigFromAny:
+    """ProfilerConfig.from_any() accepts None, an instance, a dict, or a look-alike object."""
+
+    def test_from_any_none(self):
+        """None is passed through as None."""
+        assert ProfilerConfig.from_any(None) is None
+
+    def test_from_any_own_instance(self):
+        """An existing ProfilerConfig comes back as the identical object."""
+        with tempfile.TemporaryDirectory() as out_dir:
+            cfg = ProfilerConfig(profiler="torch", torch_profiler_dir=out_dir)
+            converted = ProfilerConfig.from_any(cfg)
+            assert converted is cfg
+
+    def test_from_any_dict(self):
+        """A dict is converted into a ProfilerConfig instance."""
+        with tempfile.TemporaryDirectory() as out_dir:
+            payload = {"profiler": "torch", "torch_profiler_dir": out_dir}
+            converted = ProfilerConfig.from_any(payload)
+            assert isinstance(converted, ProfilerConfig)
+            assert converted.profiler == "torch"
+
+    def test_from_any_upstream_like_object(self):
+        """An object exposing .profiler and .torch_profiler_dir is converted as well."""
+
+        class UpstreamConfig:
+            profiler = "torch"
+            torch_profiler_dir = "/tmp/test_profiles"
+
+        with tempfile.TemporaryDirectory() as out_dir:
+            upstream = UpstreamConfig()
+            upstream.torch_profiler_dir = out_dir
+            converted = ProfilerConfig.from_any(upstream)
+            assert isinstance(converted, ProfilerConfig)
+            assert converted.profiler == "torch"
+            assert converted.torch_profiler_dir == os.path.abspath(out_dir)
+
+    def test_from_any_object_profiler_none(self):
+        """An object whose .profiler is None converts to None."""
+
+        class NullConfig:
+            profiler = None
+
+        assert ProfilerConfig.from_any(NullConfig()) is None
+
+    def test_from_any_object_no_profiler_attr(self):
+        """An object with no .profiler attribute at all converts to None."""
+
+        class RandomObj:
+            foo = "bar"
+
+        assert ProfilerConfig.from_any(RandomObj()) is None
+
+
+class TestProfilerConfigReExport:
+    """vllm_omni.config must expose the very same ProfilerConfig class object."""
+
+    def test_import_from_config(self):
+        from vllm_omni.config import ProfilerConfig as ConfigLevelProfilerConfig
+
+        assert ConfigLevelProfilerConfig is ProfilerConfig
diff --git a/tests/profiler/test_torch_profiler.py b/tests/profiler/test_torch_profiler.py
new file mode 100644
index 00000000000..eb6fd8baff9
--- /dev/null
+++ b/tests/profiler/test_torch_profiler.py
@@ -0,0 +1,196 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import glob
+import os
+import tempfile
+
+import pytest
+import torch
+
+from vllm_omni.profiler import ProfilerConfig, TorchProfiler
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+class TestTorchProfiler:
+    """Tests for TorchProfiler (require CUDA)."""
+
+    def _make_config(self, tmpdir: str) -> ProfilerConfig:
+        return ProfilerConfig(profiler="torch", torch_profiler_dir=tmpdir)
+
+    def test_start_stop_lifecycle(self):
+        """Test basic start/stop lifecycle."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = self._make_config(tmpdir)
+            profiler = TorchProfiler(config, worker_name="test")
+
+            assert not profiler.is_running
+            profiler.start()
+            assert profiler.is_running
+
+            # Do some work
+            x = torch.randn(100, 100, device="cuda")
+            y = x @ x.T
+            del x, y
+
+            profiler.stop()
+            assert not profiler.is_running
+
+    def test_stop_without_start_is_noop(self):
+        """Test stop without start is safe."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = self._make_config(tmpdir)
+            profiler = TorchProfiler(config, worker_name="test")
+            profiler.stop()  # Should not raise
+            assert not profiler.is_running
+
+    def test_double_start_is_noop(self):
+        """Test that calling start twice does not error."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = self._make_config(tmpdir)
+            profiler = TorchProfiler(config, worker_name="test")
+            profiler.start()
+            profiler.start()  # Second call should be no-op
+            assert profiler.is_running
+            profiler.stop()
+            assert not profiler.is_running
+
+    def test_shutdown(self):
+        """Test shutdown stops a running profiler."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = self._make_config(tmpdir)
+            profiler = TorchProfiler(config, worker_name="test")
+            profiler.start()
+            assert profiler.is_running
+            profiler.shutdown()
+            assert not profiler.is_running
+
+    def test_step_basic(self):
+        """Test step method doesn't crash."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = self._make_config(tmpdir)
+            profiler = TorchProfiler(config, worker_name="test")
+            profiler.start()
+            for _ in range(5):
+                profiler.step()
+            profiler.stop()
+
+    def test_delay_iterations(self):
+        """Test that profiler doesn't start until delay iterations pass."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=tmpdir,
+                delay_iterations=3,
+            )
+            profiler = TorchProfiler(config, worker_name="test")
+            profiler.start()
+            # Active but not yet running due to delay
+            assert not profiler.is_running
+            for _ in range(2):
+                profiler.step()
+            assert not profiler.is_running
+            # Third step triggers actual start
+            profiler.step()
+            assert profiler.is_running
+            profiler.stop()
+
+    def test_max_iterations(self):
+        """Test that profiler stops after max iterations."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=tmpdir,
+                max_iterations=2,
+            )
+            profiler = TorchProfiler(config, worker_name="test")
+            profiler.start()
+            assert profiler.is_running
+            profiler.step()  # iter 1
+            assert profiler.is_running
+            profiler.step()  # iter 2
+            assert profiler.is_running
+            profiler.step()  # iter 3 -> exceeds max, stops
+            assert not profiler.is_running
+
+    def test_config_driven_settings(self):
+        """Test that config fields are passed to the profiler."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=tmpdir,
+                torch_profiler_record_shapes=True,
+                torch_profiler_with_memory=True,
+                torch_profiler_with_flops=True,
+            )
+            profiler = TorchProfiler(config, worker_name="test")
+            profiler.start()
+            x = torch.randn(10, 10, device="cuda")
+            _ = x @ x.T
+            profiler.stop()
+
+    def test_trace_files_written(self):
+        """Test that stop() produces trace files in torch_profiler_dir."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = self._make_config(tmpdir)
+            profiler = TorchProfiler(config, worker_name="test-trace")
+            profiler.start()
+            x = torch.randn(64, 64, device="cuda")
+            _ = x @ x.T
+            del x
+            profiler.stop()
+
+            # Verify trace file exists and is non-empty
+            trace_files = glob.glob(os.path.join(tmpdir, "*.trace.json.gz"))
+            assert len(trace_files) >= 1, f"Expected trace files in {tmpdir}, found: {os.listdir(tmpdir)}"
+            for tf in trace_files:
+                assert os.path.getsize(tf) > 0, f"Trace file {tf} is empty"
+
+    def test_cuda_time_stats_written(self):
+        """Test that stop() writes profiler_out_*.txt when torch_profiler_dump_cuda_time_total=True."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=tmpdir,
+                torch_profiler_dump_cuda_time_total=True,
+            )
+            profiler = TorchProfiler(config, worker_name="test-stats", local_rank=0)
+            profiler.start()
+            x = torch.randn(64, 64, device="cuda")
+            _ = x @ x.T
+            del x
+            profiler.stop()
+
+            stats_file = os.path.join(tmpdir, "profiler_out_0.txt")
+            assert os.path.exists(stats_file), f"Expected {stats_file}"
+            assert os.path.getsize(stats_file) > 0
+
+    def test_worker_name_in_trace_filename(self):
+        """Test that worker_name appears in the trace filename."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = self._make_config(tmpdir)
+            profiler = TorchProfiler(config, worker_name="stage-0")
+            profiler.start()
+            x = torch.randn(32, 32, device="cuda")
+            _ = x @ x.T
+            del x
+            profiler.stop()
+
+            trace_files = glob.glob(os.path.join(tmpdir, "*stage-0*"))
+            assert len(trace_files) >= 1, f"Expected trace with 'stage-0' in name, found: {os.listdir(tmpdir)}"
+
+
+class TestTorchProfilerCPU:
+    """Tests that don't require CUDA."""
+
+    def test_import(self):
+        """Test that TorchProfiler class can be imported."""
+        from vllm_omni.profiler import TorchProfiler
+
+        assert TorchProfiler is not None
+
+    def test_profiler_config_import(self):
+        """Test that ProfilerConfig can be imported."""
+        from vllm_omni.profiler import ProfilerConfig
+
+        assert ProfilerConfig is not None
diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py
index 775568a726f..faf923ec198 100644
--- a/vllm_omni/benchmarks/patch/patch.py
+++ b/vllm_omni/benchmarks/patch/patch.py
@@ -17,7 +17,6 @@
 from pydub import AudioSegment
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
-from vllm.benchmarks import datasets
 from vllm.benchmarks.datasets import SampleRequest
 from vllm.benchmarks.lib.endpoint_request_func import (
     ASYNC_REQUEST_FUNCS,
@@ -32,6 +31,8 @@
 )
 from vllm.logger import init_logger
 
+from vllm.benchmarks import datasets
+
 logger = init_logger(__name__)
 from vllm_omni.benchmarks.data_modules.random_multi_modal_dataset import OmniRandomMultiModalDataset
 
@@ -201,9 +202,9 @@ async def async_request_openai_chat_omni_completions(
 
 # ruff: noqa: E402
 # Prevent import order from causing patch failures
-from vllm.benchmarks import serve
 from vllm.benchmarks.serve import TaskType, calculate_metrics_for_embeddings, get_request, wait_for_endpoint
 
+from vllm.benchmarks import serve
 from vllm_omni.benchmarks.metrics.metrics import MultiModalsBenchmarkMetrics, calculate_metrics
 
 # ruff: noqa: E402
diff --git a/vllm_omni/config/__init__.py b/vllm_omni/config/__init__.py
index e2db6f4273c..22d6add7aad 100644
--- a/vllm_omni/config/__init__.py
+++ b/vllm_omni/config/__init__.py
@@ -4,8 +4,10 @@
 
 from vllm_omni.config.lora import LoRAConfig
 from vllm_omni.config.model import OmniModelConfig
+from vllm_omni.profiler import ProfilerConfig
 
 __all__ = [
     "OmniModelConfig",
     "LoRAConfig",
+    "ProfilerConfig",
 ]
diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
index 2ab94d5b775..6bd7cc57f04 100644
--- a/vllm_omni/diffusion/diffusion_engine.py
+++ b/vllm_omni/diffusion/diffusion_engine.py
@@ -4,11 +4,14 @@
 import os
 import time
 from collections.abc import Iterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import PIL.Image
 from vllm.logger import init_logger
 
+if TYPE_CHECKING:
+    from vllm_omni.profiler import ProfilerConfig
+
 from vllm_omni.diffusion.data import OmniDiffusionConfig
 from vllm_omni.diffusion.executor.abstract import DiffusionExecutor
 from vllm_omni.diffusion.registry import (
@@ -194,124 +197,20 @@ def make_engine(config: OmniDiffusionConfig) -> "DiffusionEngine":
     def add_req_and_wait_for_response(self, request: OmniDiffusionRequest):
         return self.executor.add_req(request)
 
-    def start_profile(self, trace_filename: str | None = None) -> None:
-        """
-        Start torch profiling on all diffusion workers.
-
-        Creates a directory (if needed) and sets up a base filename template
-        for per-rank profiler traces (typically saved as <template>_rank<N>.json).
+    def start_profile(self, config: "ProfilerConfig | None" = None) -> None:
+        """Start torch profiling on all diffusion workers.
 
         Args:
-            trace_filename: Optional base filename (without extension or rank suffix).
-                            If None, generates one using current timestamp.
-        """
-        if trace_filename is None:
-            trace_filename = f"stage_0_diffusion_{int(time.time())}_rank"
-
-        trace_dir = os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
-
-        # Expand ~ and ~user, then make absolute (robust against cwd changes)
-        trace_dir = os.path.expanduser(trace_dir)
-        trace_dir = os.path.abspath(trace_dir)
-
-        try:
-            os.makedirs(trace_dir, exist_ok=True)
-        except OSError as exc:
-            logger.error(f"Failed to create profiler directory {trace_dir}: {exc}")
-            raise
-
-        # Build final template path (without rank or extension — torch.profiler appends those)
-        full_template = os.path.join(trace_dir, trace_filename)
-
-        expected_pattern = f"{full_template}*.json"
-        logger.info(f"Starting diffusion profiling → {expected_pattern}")
-
-        # Also log the absolute directory once (useful in multi-node or containers)
-        logger.debug(f"Profiler output directory: {trace_dir}")
-
-        # Propagate to all workers
-        try:
-            self.collective_rpc(method="start_profile", args=(full_template,))
-        except Exception as e:
-            logger.error("Failed to start profiling on workers", exc_info=True)
-            raise RuntimeError(f"Could not start profiler: {e}") from e
-
-    def stop_profile(self) -> dict:
-        """
-        Stop profiling on all workers and collect the final trace/table paths.
-
-        The worker (torch_profiler.py) now handles trace export, compression to .gz,
-        and deletion of the original .json file. This method only collects and
-        reports the paths returned by the workers.
-
-        Returns:
-            dict with keys:
-            - "traces": list of final trace file paths (usually .json.gz)
-            - "tables": list of table strings (one per rank)
+            config: ProfilerConfig with torch profiler settings. Required: None raises ValueError.
         """
-        logger.info("Stopping diffusion profiling and collecting results...")
-
-        try:
-            # Give worker enough time — export + compression + table can be slow
-            results = self.collective_rpc(method="stop_profile", timeout=600)
-        except Exception:
-            logger.error("Failed to stop profiling on workers", exc_info=True)
-            return {"traces": [], "tables": []}
-
-        output_files = {"traces": [], "tables": []}
-        successful_traces = 0
-
-        if not results:
-            logger.warning("No profiling results returned from any rank")
-            return output_files
-
-        for rank, res in enumerate(results):
-            if not isinstance(res, dict):
-                logger.warning(f"Rank {rank}: invalid result format (got {type(res)})")
-                continue
-
-            # 1. Trace file — should be .json.gz if compression succeeded
-            trace_path = res.get("trace")
-            if trace_path:
-                # We trust the worker — it created/compressed the file
-                logger.info(f"[Rank {rank}] Final trace: {trace_path}")
-                output_files["traces"].append(trace_path)
-                successful_traces += 1
-
-                # Optional: warn if path looks suspicious (e.g. still .json)
-                if not trace_path.endswith((".json.gz", ".json")):
-                    logger.warning(f"Rank {rank}: unusual trace path extension: {trace_path}")
-
-            # 2. Table file — plain text
-            table = res.get("table")
-            if table:
-                output_files["tables"].append(table)
-
-        # Final summary logging
-        num_ranks = len(results)
-        if successful_traces > 0:
-            final_paths_str = ", ".join(output_files["traces"][:3])
-            if len(output_files["traces"]) > 3:
-                final_paths_str += f" ... (+{len(output_files['traces']) - 3} more)"
-
-            logger.info(
-                f"Profiling stopped. Collected {successful_traces} trace file(s) "
-                f"from {num_ranks} rank(s). "
-                f"Final trace paths: {final_paths_str}"
-            )
-        elif output_files["traces"]:
-            logger.info(
-                f"Profiling stopped but no traces were successfully collected. "
-                f"Reported paths: {', '.join(output_files['traces'][:3])}"
-                f"{' ...' if len(output_files['traces']) > 3 else ''}"
-            )
-        else:
-            logger.info("Profiling stopped — no trace files were collected from any rank.")
-
-        if output_files["tables"]:
-            logger.debug(f"Collected {len(output_files['tables'])} profiling table(s)")
-
-        return output_files
+        if config is None:
+            raise ValueError("ProfilerConfig required")
+        os.makedirs(config.torch_profiler_dir, exist_ok=True)
+        self.collective_rpc(method="start_profile", args=(config.to_dict(),))
+
+    def stop_profile(self) -> None:
+        """Stop profiling on all diffusion workers."""
+        self.collective_rpc(method="stop_profile", timeout=120000)
 
     def _dummy_run(self):
         """A dummy run to warm up the model."""
diff --git a/vllm_omni/diffusion/profiler/base.py b/vllm_omni/diffusion/profiler/base.py
deleted file mode 100644
index 640e406da95..00000000000
--- a/vllm_omni/diffusion/profiler/base.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from abc import ABC, abstractmethod
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-class ProfilerBase(ABC):
-    """
-    Abstract base class for all diffusion profilers.
-    Defines the common interface used by GPUWorker and DiffusionEngine.
-    """
-
-    @abstractmethod
-    def start(self, trace_path_template: str) -> str:
-        """
-        Start profiling.
-
-        Args:
-            trace_path_template: Base path (without rank or extension).
-                                 e.g. "/tmp/profiles/sdxl_run"
-
-        Returns:
-            Full path of the trace file this rank will write.
-        """
-        pass
-
-    @abstractmethod
-    def stop(self) -> str | None:
-        """
-        Stop profiling and finalize/output the trace.
-
-        Returns:
-            Path to the saved trace file, or None if not active.
-        """
-        pass
-
-    @abstractmethod
-    def get_step_context(self):
-        """
-        Returns a context manager that advances one profiling step.
-        Should be a no-op (nullcontext) when profiler is not active.
-        """
-        pass
-
-    @abstractmethod
-    def is_active(self) -> bool:
-        """Return True if profiling is currently running."""
-        pass
-
-    @classmethod
-    def _get_rank(cls) -> int:
-        import os
-
-        return int(os.getenv("RANK", "0"))
diff --git a/vllm_omni/diffusion/profiler/torch_profiler.py b/vllm_omni/diffusion/profiler/torch_profiler.py
deleted file mode 100644
index 37c45710071..00000000000
--- a/vllm_omni/diffusion/profiler/torch_profiler.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-import subprocess
-from contextlib import nullcontext
-
-import torch
-from torch.profiler import ProfilerActivity, profile
-from vllm.logger import init_logger
-
-from .base import ProfilerBase
-
-logger = init_logger(__name__)
-
-
-class TorchProfiler(ProfilerBase):
-    """
-    Torch-based profiler configured for End-to-End continuous recording.
-    Uses 'on_trace_ready' to handle Trace export.
-    Compression is offloaded to a background subprocess to avoid blocking the worker loop.
-    """
-
-    _profiler: profile | None = None
-    _trace_template: str = ""
-
-    @classmethod
-    def start(cls, trace_path_template: str) -> str:
-        """
-        Start the profiler with the given trace path template.
-        """
-        # 1. Cleanup any existing profiler
-        if cls._profiler is not None:
-            logger.warning("[Rank %s] Stopping existing Torch profiler", cls._get_rank())
-            cls._profiler.stop()
-            cls._profiler = None
-
-        rank = cls._get_rank()
-
-        # 2. Make path absolute
-        trace_path_template = os.path.abspath(trace_path_template)
-        cls._trace_template = trace_path_template
-
-        # Expected paths
-        json_file = f"{trace_path_template}_rank{rank}.json"
-
-        os.makedirs(os.path.dirname(json_file), exist_ok=True)
-
-        logger.info(f"[Rank {rank}] Starting End-to-End Torch profiler")
-
-        # 3. Define the on_trace_ready handler
-        def trace_handler(p):
-            nonlocal json_file
-
-            # A. Export JSON Trace
-            try:
-                p.export_chrome_trace(json_file)
-                logger.info(f"[Rank {rank}] Trace exported to {json_file}")
-
-                try:
-                    subprocess.Popen(["gzip", "-f", json_file])
-                    logger.info(f"[Rank {rank}] Triggered background compression for {json_file}")
-                    # Update variable to point to the eventual file
-                    json_file = f"{json_file}.gz"
-                except Exception as compress_err:
-                    logger.warning(f"[Rank {rank}] Background gzip failed to start: {compress_err}")
-
-            except Exception as e:
-                logger.warning(f"[Rank {rank}] Failed to export trace: {e}")
-
-        # 4. Initialize profiler with long active period
-        cls._profiler = profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-            schedule=torch.profiler.schedule(
-                wait=0,
-                warmup=0,
-                active=100000,  # long capture window
-            ),
-            on_trace_ready=trace_handler,
-            record_shapes=True,
-            profile_memory=True,
-            with_stack=True,
-            with_flops=True,
-        )
-
-        # 5. Start profiling
-        cls._profiler.start()
-
-        # Return the expected final path
-        return f"{trace_path_template}_rank{rank}.json.gz"
-
-    @classmethod
-    def stop(cls) -> dict | None:
-        if cls._profiler is None:
-            return None
-
-        rank = cls._get_rank()
-
-        # Determine expected paths
-        base_path = f"{cls._trace_template}_rank{rank}"
-        gz_path = f"{base_path}.json.gz"
-
-        try:
-            # This triggers trace_handler synchronously
-            # Since we removed table generation and backgrounded compression, this returns fast.
-            cls._profiler.stop()
-        except Exception as e:
-            logger.warning(f"[Rank {rank}] Profiler stop failed: {e}")
-
-        cls._profiler = None
-
-        # We return the .gz path assuming background compression will succeed.
-        return {"trace": gz_path, "table": None}
-
-    @classmethod
-    def step(cls):
-        if cls._profiler is not None:
-            cls._profiler.step()
-
-    @classmethod
-    def is_active(cls) -> bool:
-        return cls._profiler is not None
-
-    @classmethod
-    def get_step_context(cls):
-        return nullcontext()
diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index 6f5bd00e663..3e06ad91878 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -30,11 +30,11 @@
 )
 from vllm_omni.diffusion.forward_context import set_forward_context
 from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager
-from vllm_omni.diffusion.profiler import CurrentProfiler
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner
 from vllm_omni.lora.request import LoRARequest
 from vllm_omni.platforms import current_omni_platform
+from vllm_omni.profiler import ProfilerConfig, TorchProfiler
 
 logger = init_logger(__name__)
 
@@ -134,15 +134,28 @@ def generate(self, request: OmniDiffusionRequest) -> DiffusionOutput:
         """Generate output for the given requests."""
         return self.execute_model(request, self.od_config)
 
-    @classmethod
-    def start_profile(cls, trace_path_template: str) -> str:
-        """Start profiling for this GPU worker."""
-        return CurrentProfiler.start(trace_path_template)
+    def start_profile(self, config: dict | None = None) -> None:
+        """Start profiling for this GPU worker.
 
-    @classmethod
-    def stop_profile(cls) -> dict | None:
-        """Stop profiling and return the result dictionary."""
-        return CurrentProfiler.stop()
+        Args:
+            config: Dict of ProfilerConfig fields for profiler settings. If None, no profiler is started.
+        """
+        if config is None:
+            return
+        profiler_config = ProfilerConfig.from_dict(config)
+        rank = int(os.getenv("RANK", "0"))
+        self.profiler = TorchProfiler(
+            profiler_config,
+            worker_name=f"diffusion-rank-{rank}",
+            local_rank=rank,
+        )
+        self.profiler.start()
+
+    def stop_profile(self) -> None:
+        """Stop profiling."""
+        if hasattr(self, "profiler") and self.profiler is not None:
+            self.profiler.stop()
+            self.profiler = None
 
     def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput:
         """Execute a forward pass by delegating to the model runner."""
diff --git a/vllm_omni/entrypoints/async_omni_llm.py b/vllm_omni/entrypoints/async_omni_llm.py
index b557c07dd57..d55e1de04b2 100644
--- a/vllm_omni/entrypoints/async_omni_llm.py
+++ b/vllm_omni/entrypoints/async_omni_llm.py
@@ -1,12 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-import os
-import socket
 from typing import TYPE_CHECKING
 
-import torch
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
@@ -164,26 +160,6 @@ def __init__(
         except RuntimeError:
             pass
 
-        if envs.VLLM_TORCH_PROFILER_DIR and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM:
-            logger.info(
-                "Torch profiler enabled. AsyncOmniLLM CPU traces will be collected under %s",
-                envs.VLLM_TORCH_PROFILER_DIR,
-            )
-            worker_name = f"{socket.gethostname()}_{os.getpid()}.async_omni_llm"
-            self.profiler = torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                ],
-                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    envs.VLLM_TORCH_PROFILER_DIR,
-                    worker_name=worker_name,
-                    use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
-                ),
-            )
-        else:
-            self.profiler = None
-
     @classmethod
     @deprecate_kwargs(
         "disable_log_requests",
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index 510813c6abd..5b224b4430f 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -13,9 +13,9 @@
 import huggingface_hub
 from omegaconf import OmegaConf
 from tqdm.auto import tqdm
-from vllm import SamplingParams
 from vllm.logger import init_logger
 
+from vllm import SamplingParams
 from vllm_omni.distributed.omni_connectors import (
     get_stage_connector_config,
     initialize_orchestrator_connectors,
@@ -45,6 +45,7 @@
     download_weights_from_hf_specific,
 )
 from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.profiler import ProfilerConfig
 
 logger = init_logger(__name__)
 
@@ -99,6 +100,7 @@ class OmniBase:
 
     Args:
         model: Model name or path to load.
+        profiler_config: Optional profiler configuration for performance/memory profiling.
         **kwargs: Arbitrary keyword arguments.
             - stage_configs_path: Optional path to YAML file containing stage
               configurations. If None, configurations are loaded from the model.
@@ -116,10 +118,19 @@ class OmniBase:
             - Additional keyword arguments passed to stage engines.
     """
 
-    def __init__(self, model: str, **kwargs: Any) -> None:
+    def __init__(
+        self,
+        model: str,
+        *,
+        profiler_config: ProfilerConfig | None = None,
+        **kwargs: Any,
+    ) -> None:
         model = omni_snapshot_download(model)
         kwargs["model"] = model
 
+        # Profiler configuration
+        self._profiler_config = profiler_config
+
         # Stage management attributes
         self.stage_list: list[OmniStage] = []
         self._stage_in_queues: list[mp.Queue] = []
@@ -338,6 +349,7 @@ def _start_stages(self, model: str) -> None:
                 connectors_config=stage_connectors_config,
                 worker_backend=self.worker_backend,
                 ray_placement_group=self._ray_pg,
+                profiler_config=self._profiler_config,
             )
 
             logger.debug(f"[{self._name}] Stage-{stage_id} process started")
@@ -395,21 +407,21 @@ def _wait_for_stages_ready(self, timeout: int = 120) -> None:
     def start_profile(self, stages: list[int] | None = None) -> None:
         """Start profiling for specified stages.
 
-        Sends start_profile command to stage workers. Profiling must be enabled
-        via VLLM_TORCH_PROFILER_DIR environment variable.
+        Uses the ProfilerConfig passed at initialization; if none was given, the task carries no config.
 
         Args:
             stages: List of stage IDs to start profiling. If None, starts
-                profiling for all stages that have profiling enabled.
+                profiling for all stages.
 
         Example:
-            >>> # Profile all stages
+            >>> from vllm_omni.profiler import ProfilerConfig
+            >>> omni = Omni(model="...", profiler_config=ProfilerConfig(
+            ...     profiler="torch",
+            ...     torch_profiler_dir="./profiles",
+            ... ))
             >>> omni.start_profile()
             >>> outputs = omni.generate(prompts, sampling_params)
             >>> omni.stop_profile()
-
-            >>> # Profile only stage 0 and 2
-            >>> omni.start_profile(stages=[0, 2])
         """
         if stages is None:
             stages = list(range(len(self.stage_list)))
@@ -417,8 +429,11 @@ def start_profile(self, stages: list[int] | None = None) -> None:
         for stage_id in stages:
             if stage_id < len(self.stage_list):
                 try:
-                    self.stage_list[stage_id].submit({"type": OmniStageTaskType.PROFILER_START})
-                    logger.info("[%s] Sent start_profile to stage-%s", self._name, stage_id)
+                    task: dict[str, Any] = {"type": OmniStageTaskType.PROFILER_START}
+                    if self._profiler_config:
+                        task["config"] = self._profiler_config.to_dict()
+                    self.stage_list[stage_id].submit(task)
+                    logger.info("[%s] Started profiling for stage-%s", self._name, stage_id)
                 except Exception as e:
                     logger.warning(
                         "[%s] Failed to send start_profile to stage-%s: %s",
@@ -427,71 +442,26 @@ def start_profile(self, stages: list[int] | None = None) -> None:
                         e,
                     )
 
-    def stop_profile(self, stages: list[int] | None = None) -> dict:
-        """
-        Synchronously stop profiling for specified stages and collect
-        the file paths for traces and tables.
+    def stop_profile(self, stages: list[int] | None = None) -> None:
+        """Stop profiling for specified stages.
+
+        Trace files are written to ``torch_profiler_dir`` by each worker.
+
+        Args:
+            stages: List of stage IDs to stop profiling. If None, stops
+                profiling for all stages.
         """
         if stages is None:
             stages = list(range(len(self.stage_list)))
 
-        all_results = {"traces": [], "tables": []}
-
         for stage_id in stages:
             if stage_id < len(self.stage_list):
                 stage = self.stage_list[stage_id]
-
-                # Check if the stage object has our new bridge method
                 if hasattr(stage, "stop_profile"):
-                    logger.info("[%s] Requesting profile data collection from stage-%s", self._name, stage_id)
-
-                    # This is the blocking call that triggers the RPC chain
-                    stage_data = stage.stop_profile()
-
-                    if isinstance(stage_data, dict):
-                        # FIX: Handle both single key and list key formats
-                        traces = stage_data.get("trace") or stage_data.get("traces")
-                        tables = stage_data.get("table") or stage_data.get("tables")
-
-                        # Debug logging
-                        logger.debug(f"[{self._name}] Stage-{stage_id} returned: {stage_data.keys()}")
-                        if traces:
-                            logger.debug(f"[{self._name}] Stage-{stage_id} traces type: {type(traces)}")
-                        if tables:
-                            logger.debug(f"[{self._name}] Stage-{stage_id} tables type: {type(tables)}")
-
-                        # Handle single strings
-                        if traces:
-                            if isinstance(traces, str):
-                                all_results["traces"].append(traces)
-                            elif isinstance(traces, list):
-                                all_results["traces"].extend(traces)
-
-                        # Handle single strings
-                        if tables:
-                            if isinstance(tables, str):
-                                all_results["tables"].append(tables)
-                            elif isinstance(tables, list):
-                                all_results["tables"].extend(tables)
-                        else:
-                            logger.warning(f"[{self._name}] Stage-{stage_id} returned no table data")
-                    else:
-                        logger.warning(f"[{self._name}] Stage-{stage_id} returned non-dict data: {type(stage_data)}")
+                    stage.stop_profile()
                 else:
-                    # Fallback for non-diffusion stages
-                    logger.warning(
-                        "[%s] Stage-%s does not support synchronous stop_profile. Falling back to async.",
-                        self._name,
-                        stage_id,
-                    )
                     stage.submit({"type": OmniStageTaskType.PROFILER_STOP})
-
-        # Final debug output
-        logger.info(
-            f"[{self._name}] Collected {len(all_results['traces'])} trace(s) and {len(all_results['tables'])} table(s)"
-        )
-
-        return all_results
+                logger.info("[%s] Stopped profiling for stage-%s", self._name, stage_id)
 
     def close(self) -> None:
         """Close all stage processes and clean up resources."""
diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py
index 9f6dde15b1c..308a37dd25f 100644
--- a/vllm_omni/entrypoints/omni_diffusion.py
+++ b/vllm_omni/entrypoints/omni_diffusion.py
@@ -13,6 +13,7 @@
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType
 from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.profiler import ProfilerConfig
 
 # TODO configure logging properly
 logging.basicConfig(level=logging.INFO)
@@ -29,9 +30,22 @@ class OmniDiffusion:
     You can pass either an `OmniDiffusionConfig` via `od_config`, or
     pass kwargs such as `model="Qwen/Qwen-Image"`,
     which will be forwarded to `OmniDiffusionConfig.from_kwargs`.
+
+    Args:
+        od_config: Optional OmniDiffusionConfig. If not provided, will be
+            created from kwargs.
+        profiler_config: Optional ProfilerConfig for profiling.
+        **kwargs: Additional arguments passed to OmniDiffusionConfig.from_kwargs.
     """
 
-    def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
+    def __init__(
+        self,
+        od_config: OmniDiffusionConfig | None = None,
+        profiler_config: ProfilerConfig | None = None,
+        **kwargs,
+    ):
+        self._profiler_config = profiler_config
+
         # Capture stage info from kwargs before they might be filtered out
         stage_id = kwargs.get("stage_id")
         engine_input_source = kwargs.get("engine_input_source")
@@ -117,25 +131,18 @@ def __del__(self):  # pragma: no cover - best effort cleanup
         except Exception:
             pass
 
-    def start_profile(self, trace_filename: str | None = None) -> None:
+    def start_profile(self, config: ProfilerConfig | None = None) -> None:
         """Start profiling for the diffusion model.
 
         Args:
-            trace_filename: Optional base filename for trace files.
-                           If None, a timestamp-based name will be generated.
-        """
-        if hasattr(self, "engine") and self.engine:
-            self.engine.start_profile(trace_filename)
-        else:
-            raise RuntimeError("Diffusion engine not initialized")
-
-    def stop_profile(self) -> dict:
-        """Stop profiling and return profiling results.
-
-        Returns:
-            Dictionary containing paths to trace and table files.
+            config: Optional ProfilerConfig. If None, uses the config passed
+                   at initialization; raises ValueError if neither is set.
         """
-        if hasattr(self, "engine") and self.engine:
-            return self.engine.stop_profile()
-        else:
-            raise RuntimeError("Diffusion engine not initialized")
+        effective_config = config or self._profiler_config
+        if effective_config is None:
+            raise ValueError("ProfilerConfig required")
+        self.engine.start_profile(config=effective_config)
+
+    def stop_profile(self) -> None:
+        """Stop profiling."""
+        self.engine.stop_profile()
diff --git a/vllm_omni/entrypoints/omni_llm.py b/vllm_omni/entrypoints/omni_llm.py
index 118a0ba7f47..e2bba42f30f 100644
--- a/vllm_omni/entrypoints/omni_llm.py
+++ b/vllm_omni/entrypoints/omni_llm.py
@@ -26,6 +26,7 @@
     load_stage_configs_from_yaml,
     resolve_model_config_path,
 )
+from vllm_omni.profiler import ProfilerConfig, TorchProfiler
 
 logger = init_logger(__name__)
 
@@ -74,9 +75,13 @@ def __init__(
         shm_threshold_bytes: int = 65536,
         batch_timeout: int = 10,
         init_timeout: int = 300,
+        profiler_config: ProfilerConfig | None = None,
         **kwargs: Any,
     ):
         """LLM constructor with omni-specific configuration loading."""
+        # Store profiler config (overrides vLLM's built-in profiler)
+        self._profiler_config = profiler_config
+
         # Store stage management parameters (used by Omni class)
         self.worker_backend = kwargs.get("worker_backend", "multi_process")
         self.ray_address = kwargs.get("ray_address", None)
@@ -239,3 +244,40 @@ def _run_engine(self, *, use_tqdm: bool | Callable[..., tqdm] = True) -> list[Re
         # This is necessary because some requests may be finished earlier than
         # its previous requests.
         return sorted(outputs, key=lambda x: int(x.request_id.split("-")[0]))
+
+    def start_profile(self) -> None:
+        """Start profiling using our own TorchProfiler, fully decoupled
+        from upstream vLLM's profiler.
+
+        Raises:
+            ValueError: If profiler_config was not set at initialization.
+        """
+        if self._profiler_config is None:
+            raise ValueError(
+                "profiler_config not set at initialization. Pass profiler_config to OmniLLM() constructor."
+            )
+
+        if hasattr(self, "_profiler_instance") and self._profiler_instance is not None:
+            logger.warning("Profiler already active, stopping first")
+            self._profiler_instance.stop()
+
+        import os
+
+        os.makedirs(self._profiler_config.torch_profiler_dir, exist_ok=True)
+        rank = int(os.getenv("RANK", "0"))
+        self._profiler_instance = TorchProfiler(
+            self._profiler_config,
+            worker_name=f"llm-rank-{rank}",
+            local_rank=rank,
+        )
+        self._profiler_instance.start()
+        logger.info("Started profiling for OmniLLM")
+
+    def stop_profile(self) -> None:
+        """Stop profiling."""
+        if not hasattr(self, "_profiler_instance") or self._profiler_instance is None:
+            logger.warning("No active profiler to stop")
+            return
+        self._profiler_instance.stop()
+        self._profiler_instance = None
+        logger.info("Stopped profiling for OmniLLM")
diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py
index ca5b2f43e7c..f3034186804 100644
--- a/vllm_omni/entrypoints/omni_stage.py
+++ b/vllm_omni/entrypoints/omni_stage.py
@@ -18,9 +18,11 @@
 from collections.abc import Sequence
 from contextlib import contextmanager
 from dataclasses import fields
-from typing import Any, Literal, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
+
+if TYPE_CHECKING:
+    from vllm_omni.profiler import ProfilerConfig
 
-from vllm import PromptType, RequestOutput
 from vllm.inputs import TextPrompt
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
@@ -31,6 +33,7 @@
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.llm_engine import LLMEngine
 
+from vllm import PromptType, RequestOutput
 from vllm_omni.diffusion.data import OmniDiffusionConfig
 from vllm_omni.distributed.omni_connectors import build_stage_connectors
 from vllm_omni.distributed.omni_connectors.adapter import try_recv_via_connector
@@ -391,6 +394,7 @@ def init_stage_worker(
         batch_timeout: int = 10,
         connectors_config: dict | None = None,
         worker_backend: str = "multi_process",
+        profiler_config: "ProfilerConfig | None" = None,
         **kwargs: Any,
     ) -> None:
         """Initialize and start the stage worker process.
@@ -406,6 +410,7 @@ def init_stage_worker(
             batch_timeout: Timeout in seconds for batching requests
             connectors_config: Configuration for stage connectors
             worker_backend: Backend type ("multi_process" or "ray")
+            profiler_config: Optional profiler configuration.
             **kwargs: Additional arguments (e.g. ray_placement_group)
 
         Raises:
@@ -424,6 +429,10 @@ def init_stage_worker(
         # Prepare lightweight dict config for worker
         engine_args = _to_dict(self.engine_args)
         runtime_cfg = _to_dict(getattr(self.stage_config, "runtime", {}))
+
+        # Convert profiler_config to dict for serialization
+        profiler_config_dict = profiler_config.to_dict() if profiler_config else None
+
         stage_payload: dict[str, Any] = {
             "stage_id": self.stage_id,
             "engine_args": engine_args,
@@ -432,6 +441,7 @@ def init_stage_worker(
             "connectors_config": connectors_config or {},
             "stage_type": self.stage_type,
             "engine_input_source": self.engine_input_source,
+            "profiler_config": profiler_config_dict,
         }
         try:
             old_env = os.environ.get("VLLM_LOGGING_PREFIX")
@@ -727,43 +737,41 @@ def _stage_worker(
     max_batch_size = int(runtime_cfg.get("max_batch_size", 1) or 1)
     logger.info(f"Max batch size: {max_batch_size}")
 
-    def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
+    profiler_instance = None
+
+    def handle_profiler_task_local(task: dict) -> dict:
         """Handle profiler task locally in the worker process."""
+        nonlocal profiler_instance
+        from vllm_omni.profiler import ProfilerConfig, TorchProfiler
+
+        task_type = task.get("type")
+
         if task_type == OmniStageTaskType.PROFILER_START:
-            if stage_type == "diffusion":
-                try:
-                    profile_dir = _os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
-                    _os.makedirs(profile_dir, exist_ok=True)
-                    trace_filename = f"stage_{stage_id}_diffusion_{int(_time.time())}"
-                    stage_engine.start_profile(trace_filename=trace_filename)
-                    logger.info("[Stage-%s] Diffusion Torch profiler started", stage_id)
-                except Exception as e:
-                    logger.warning("[Stage-%s] Failed to start diffusion profiler: %s", stage_id, e)
-            else:
-                try:
-                    stage_engine.start_profile()
-                    logger.info("[Stage-%s] vLLM profiler started", stage_id)
-                except Exception as e:
-                    logger.warning("[Stage-%s] Failed to start vLLM profiler: %s", stage_id, e)
-            return {}
+            try:
+                config = ProfilerConfig.from_dict(task.get("config", {}))
+                profiler_instance = TorchProfiler(
+                    config,
+                    worker_name=f"stage-{stage_id}",
+                    local_rank=0,
+                )
+                profiler_instance.start()
+                logger.info("[Stage-%s] TorchProfiler started", stage_id)
+                return {"status": "started"}
+            except Exception as e:
+                logger.error("[Stage-%s] Failed to start TorchProfiler: %s", stage_id, e)
+                return {"error": str(e)}
 
         elif task_type == OmniStageTaskType.PROFILER_STOP:
-            if stage_type == "diffusion":
-                try:
-                    # CRITICAL: Capture return value
-                    result_data = stage_engine.stop_profile()
-                    logger.info("[Stage-%s] Diffusion Torch profiler stopped", stage_id)
-                    return result_data
-                except Exception as e:
-                    logger.warning("[Stage-%s] Failed to stop diffusion profiler: %s", stage_id, e)
-                    return {}
-            else:
-                try:
-                    stage_engine.stop_profile()
-                    logger.info("[Stage-%s] vLLM profiler stopped", stage_id)
-                except Exception as e:
-                    logger.warning("[Stage-%s] Failed to stop vLLM profiler: %s", stage_id, e)
-                return {}
+            try:
+                if profiler_instance is not None:
+                    profiler_instance.stop()
+                    profiler_instance = None
+                logger.info("[Stage-%s] TorchProfiler stopped", stage_id)
+                return {"status": "stopped"}
+            except Exception as e:
+                logger.error("[Stage-%s] Failed to stop TorchProfiler: %s", stage_id, e)
+                return {"error": str(e)}
+
         return {}
 
     # Batch processing loop
@@ -778,7 +786,7 @@ def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
 
         # Handle profiler control commands
         if is_profiler_task(task_type):
-            profiler_data = handle_profiler_task_local(task_type)
+            profiler_data = handle_profiler_task_local(task)
             # If it was a STOP command, we must reply to the Orchestrator
             if task_type == OmniStageTaskType.PROFILER_STOP:
                 out_q.put({"type": "profiler_result", "data": profiler_data})
@@ -797,7 +805,7 @@ def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
                     # Handle profiler commands that arrive during batching
                     extra_type = extra.get("type") if isinstance(extra, dict) else None
                     if is_profiler_task(extra_type):
-                        p_data = handle_profiler_task_local(extra_type)
+                        p_data = handle_profiler_task_local(extra)
                         if extra_type == OmniStageTaskType.PROFILER_STOP:
                             out_q.put({"type": "profiler_result", "data": p_data})
                         continue
@@ -1137,41 +1145,36 @@ async def _force_log():
         await stage_engine.reset_mm_cache()
     logger.debug("[Stage-%s] Engine initialized", stage_id)
 
-    async def handle_profiler_task_async(task_type: OmniStageTaskType) -> None:
+    async_profiler_instance = None
+
+    async def handle_profiler_task_async(task: dict) -> None:
         """Handle profiler task asynchronously for both LLM and diffusion stages."""
+        nonlocal async_profiler_instance
+        from vllm_omni.profiler import ProfilerConfig, TorchProfiler
+
+        task_type = task.get("type")
+
         if task_type == OmniStageTaskType.PROFILER_START:
-            if stage_type == "diffusion":
-                try:
-                    # Sync call is safe here — diffusion profiling is lightweight
-                    profile_dir = os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
-                    os.makedirs(profile_dir, exist_ok=True)
-                    trace_filename = f"stage_{stage_id}_diffusion_{int(time.time())}"
-                    stage_engine.start_profile(trace_filename=trace_filename)
-                    logger.info("[Stage-%s] Diffusion Torch profiler started", stage_id)
-                except Exception as e:
-                    logger.warning("[Stage-%s] Failed to start diffusion profiler: %s", stage_id, e)
-            else:
-                try:
-                    await stage_engine.start_profile()
-                    logger.info("[Stage-%s] vLLM profiler started", stage_id)
-                except Exception as e:
-                    logger.warning("[Stage-%s] Failed to start vLLM profiler: %s", stage_id, e)
+            try:
+                config = ProfilerConfig.from_dict(task.get("config", {}))
+                async_profiler_instance = TorchProfiler(
+                    config,
+                    worker_name=f"stage-{stage_id}-async",
+                    local_rank=0,
+                )
+                async_profiler_instance.start()
+                logger.info("[Stage-%s] TorchProfiler started", stage_id)
+            except Exception as e:
+                logger.error("[Stage-%s] Failed to start TorchProfiler: %s", stage_id, e)
 
         elif task_type == OmniStageTaskType.PROFILER_STOP:
-            if stage_type == "diffusion":
-                try:
-                    trace_files = stage_engine.stop_profile()
-                    logger.info("[Stage-%s] Diffusion Torch profiler stopped", stage_id)
-                    if trace_files:
-                        logger.info("Diffusion trace files: %s", trace_files)
-                except Exception as e:
-                    logger.warning("[Stage-%s] Failed to stop diffusion profiler: %s", stage_id, e)
-            else:
-                try:
-                    await stage_engine.stop_profile()
-                    logger.info("[Stage-%s] vLLM profiler stopped", stage_id)
-                except Exception as e:
-                    logger.warning("[Stage-%s] Failed to stop vLLM profiler: %s", stage_id, e)
+            try:
+                if async_profiler_instance is not None:
+                    async_profiler_instance.stop()
+                    async_profiler_instance = None
+                logger.info("[Stage-%s] TorchProfiler stopped", stage_id)
+            except Exception as e:
+                logger.error("[Stage-%s] Failed to stop TorchProfiler: %s", stage_id, e)
 
     # Signal readiness to orchestrator and send vllm_config back to main process
     try:
@@ -1273,7 +1276,7 @@ async def generation_single_request(task: dict[str, Any]):
                 rid = task["request_id"]
                 asyncio.create_task(stage_engine.abort(rid))
             elif is_profiler_task(task_type):
-                await handle_profiler_task_async(task_type)
+                await handle_profiler_task_async(task)
             else:
                 asyncio.create_task(generation_single_request(task))
 
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index c4fc970c450..804df69a0fd 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -24,7 +24,6 @@
 from PIL import Image
 from starlette.datastructures import State
 from starlette.routing import Route
-from vllm import SamplingParams
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
 from vllm.entrypoints.chat_utils import load_chat_template
@@ -74,6 +73,7 @@
 from vllm.tool_parsers import ToolParserManager
 from vllm.utils.system_utils import decorate_logs
 
+from vllm import SamplingParams
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 from vllm_omni.entrypoints.openai.image_api_utils import (
     encode_image_base64,
@@ -212,6 +212,14 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None,
         _remove_route_from_app(app, "/v1/models", {"GET"})  # Remove upstream /v1/models to use omni's handler
         app.include_router(router)
 
+        # Replace default profiler routes with stage-aware versions
+        _remove_route_from_router(app, "/start_profile", {"POST"})
+        _remove_route_from_router(app, "/stop_profile", {"POST"})
+
+        from vllm_omni.entrypoints.serve.profile.api_router import attach_router as attach_profile_router
+
+        attach_profile_router(app)
+
         await omni_init_app_state(engine_client, app.state, args)
 
         vllm_config = await engine_client.get_vllm_config()
@@ -332,6 +340,15 @@ async def build_async_omni_from_stage_config(
         kwargs = vars(args).copy()
         # Remove model as it will be passed separately
         kwargs.pop("model", None)
+
+        # Convert profiler_config to our ProfilerConfig so that
+        # OmniBase can serialize it via to_dict() for each stage worker.
+        from vllm_omni.profiler import ProfilerConfig as OmniProfilerConfig
+
+        profiler_config = OmniProfilerConfig.from_any(kwargs.pop("profiler_config", None))
+        if profiler_config is not None:
+            kwargs["profiler_config"] = profiler_config
+
         async_omni = AsyncOmni(model=args.model, **kwargs)
 
         # # Don't keep the dummy data in memory
diff --git a/vllm_omni/entrypoints/serve/__init__.py b/vllm_omni/entrypoints/serve/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vllm_omni/entrypoints/serve/profile/__init__.py b/vllm_omni/entrypoints/serve/profile/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vllm_omni/entrypoints/serve/profile/api_router.py b/vllm_omni/entrypoints/serve/profile/api_router.py
new file mode 100644
index 00000000000..734c4321395
--- /dev/null
+++ b/vllm_omni/entrypoints/serve/profile/api_router.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Stage-aware profiler HTTP endpoints.
+
+Follows the same API shape as ``vllm.entrypoints.serve.profile.api_router``
+(``/start_profile``, ``/stop_profile``) and extends it with an optional
+``stages`` parameter for multi-stage pipeline profiling.
+"""
+
+from __future__ import annotations
+
+from fastapi import APIRouter, FastAPI, Request
+from fastapi.responses import Response
+from pydantic import BaseModel
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+class ProfileRequest(BaseModel):
+    """Request body for profiler endpoints.
+
+    Attributes:
+        stages: List of stage IDs to profile. If None, profiles all stages.
+            For Qwen2.5-Omni / Qwen3-Omni:
+              - 0 = Thinker (multimodal understanding)
+              - 1 = Talker (text to codec codes)
+              - 2 = Code2Wav (codec to audio)
+    """
+
+    stages: list[int] | None = None
+
+
+@router.post("/start_profile")
+async def start_profile(raw_request: Request, body: ProfileRequest | None = None):
+    stages = body.stages if body else None
+    engine_client = raw_request.app.state.engine_client
+    stage_desc = f"stages={stages}" if stages else "all stages"
+    logger.info("Starting profiler for %s...", stage_desc)
+    await engine_client.start_profile(stages=stages)
+    logger.info("Profiler started for %s.", stage_desc)
+    return Response(status_code=200)
+
+
+@router.post("/stop_profile")
+async def stop_profile(raw_request: Request, body: ProfileRequest | None = None):
+    stages = body.stages if body else None
+    engine_client = raw_request.app.state.engine_client
+    stage_desc = f"stages={stages}" if stages else "all stages"
+    logger.info("Stopping profiler for %s...", stage_desc)
+    await engine_client.stop_profile(stages=stages)
+    logger.info("Profiler stopped for %s.", stage_desc)
+    return Response(status_code=200)
+
+
+def attach_router(app: FastAPI):
+    """Attach profiler routes if profiler is configured.
+
+    Mirrors the check in ``vllm.entrypoints.serve.profile.api_router``
+    but uses the routes defined here (which support ``stages``).
+    """
+    profiler_config = getattr(app.state.args, "profiler_config", None)
+    if profiler_config is not None and getattr(profiler_config, "profiler", None) is not None:
+        logger.warning(
+            "Profiler with mode '%s' is enabled in the API server. This should ONLY be used for local development!",
+            profiler_config.profiler,
+        )
+        app.include_router(router)
diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py
index bc9434817f6..beafffab88d 100644
--- a/vllm_omni/inputs/data.py
+++ b/vllm_omni/inputs/data.py
@@ -4,7 +4,6 @@
 from typing import Any, TypeAlias
 
 from vllm import PromptType, SamplingParams
-
 from vllm_omni.lora.request import LoRARequest
 
 try:
diff --git a/vllm_omni/diffusion/profiler/__init__.py b/vllm_omni/profiler/__init__.py
similarity index 52%
rename from vllm_omni/diffusion/profiler/__init__.py
rename to vllm_omni/profiler/__init__.py
index df505cbaf67..5048c083761 100644
--- a/vllm_omni/diffusion/profiler/__init__.py
+++ b/vllm_omni/profiler/__init__.py
@@ -1,9 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from .config import ProfilerConfig
 from .torch_profiler import TorchProfiler
 
-# Default profiler – can be changed later via config
-CurrentProfiler = TorchProfiler
-
-__all__ = ["CurrentProfiler", "TorchProfiler"]
+__all__ = ["ProfilerConfig", "TorchProfiler"]
diff --git a/vllm_omni/profiler/config.py b/vllm_omni/profiler/config.py
new file mode 100644
index 00000000000..88678787b0e
--- /dev/null
+++ b/vllm_omni/profiler/config.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from dataclasses import asdict, dataclass, fields
+from typing import Any, Literal
+
+ProfilerKind = Literal["torch", "cuda"]
+
+
+@dataclass
+class ProfilerConfig:
+    """Profiler configuration aligned with vLLM 0.16.0 semantics.
+
+    Independent implementation with the same fields as upstream vLLM's
+    ProfilerConfig. This enables familiar configuration while keeping
+    vllm-omni fully decoupled.
+
+    Args:
+        profiler: Which profiler to use. Options: 'torch', 'cuda', or None.
+        torch_profiler_dir: Directory to save torch profiler traces.
+        torch_profiler_with_stack: Enable stack tracing in torch profiler.
+        torch_profiler_with_flops: Enable FLOPS counting in torch profiler.
+        torch_profiler_use_gzip: Save traces in gzip format.
+        torch_profiler_dump_cuda_time_total: Dump total CUDA time stats.
+        torch_profiler_record_shapes: Record tensor shapes.
+        torch_profiler_with_memory: Enable memory profiling.
+        delay_iterations: Engine iterations to skip before starting.
+        max_iterations: Maximum engine iterations to profile (0 = no limit).
+
+    Example:
+        >>> config = ProfilerConfig(
+        ...     profiler="torch",
+        ...     torch_profiler_dir="./profiles",
+        ... )
+    """
+
+    profiler: ProfilerKind | None = None
+    torch_profiler_dir: str = ""
+    torch_profiler_with_stack: bool = True
+    torch_profiler_with_flops: bool = False
+    torch_profiler_use_gzip: bool = True
+    torch_profiler_dump_cuda_time_total: bool = True
+    torch_profiler_record_shapes: bool = False
+    torch_profiler_with_memory: bool = False
+    delay_iterations: int = 0
+    max_iterations: int = 0
+
+    def __post_init__(self):
+        if self.torch_profiler_dir and self.profiler != "torch":
+            raise ValueError("torch_profiler_dir is only applicable when profiler='torch'")
+        if self.profiler == "torch" and not self.torch_profiler_dir:
+            raise ValueError("torch_profiler_dir must be set when profiler='torch'")
+        if self.torch_profiler_dir:
+            self.torch_profiler_dir = os.path.abspath(os.path.expanduser(self.torch_profiler_dir))
+
+    def to_dict(self) -> dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> "ProfilerConfig":
+        valid_fields = {f.name for f in fields(cls)}
+        return cls(**{k: v for k, v in d.items() if k in valid_fields})
+
+    @classmethod
+    def from_any(cls, obj: Any) -> "ProfilerConfig | None":
+        """Convert any profiler-config-like object to our ProfilerConfig.
+
+        Accepts our own ProfilerConfig (returned as-is), a dict (via from_dict),
+        or any object with a non-None ``profiler`` attribute, from which only
+        ``profiler`` and ``torch_profiler_dir`` are copied.  Otherwise None.
+        """
+        if obj is None:
+            return None
+        if isinstance(obj, cls):
+            return obj
+        if isinstance(obj, dict):
+            return cls.from_dict(obj)
+        if hasattr(obj, "profiler") and obj.profiler is not None:
+            return cls(
+                profiler=obj.profiler,
+                torch_profiler_dir=getattr(obj, "torch_profiler_dir", "") or "",
+            )
+        return None
diff --git a/vllm_omni/profiler/torch_profiler.py b/vllm_omni/profiler/torch_profiler.py
new file mode 100644
index 00000000000..2bb60bcb1e8
--- /dev/null
+++ b/vllm_omni/profiler/torch_profiler.py
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+import torch
+from torch.profiler import ProfilerActivity
+from vllm.logger import init_logger
+
+from .config import ProfilerConfig
+
+logger = init_logger(__name__)
+
+
+class TorchProfiler:
+    """Torch profiler wrapper driven by explicit start/stop/step calls.
+
+    Follows upstream vLLM 0.16.0 (WorkerProfiler + TorchProfilerWrapper) with
+    its own implementation: traces are exported by ``tensorboard_trace_handler``
+    and both a start delay and an iteration cap are honored.
+
+    Args:
+        config: ProfilerConfig holding the torch profiler options.
+        worker_name: Label handed to the trace handler for file naming.
+        local_rank: Rank used for the CUDA time stats output file.
+    """
+
+    def __init__(
+        self,
+        config: ProfilerConfig,
+        worker_name: str = "",
+        local_rank: int = 0,
+    ):
+        self._config = config
+        self._local_rank = local_rank
+        self._delay_iters = config.delay_iterations
+        self._max_iters = config.max_iterations
+        self._active = self._running = False
+        self._active_iteration_count = self._profiling_for_iters = 0
+
+        export_trace = torch.profiler.tensorboard_trace_handler(
+            config.torch_profiler_dir,
+            worker_name=worker_name,
+            use_gzip=config.torch_profiler_use_gzip,
+        )
+        self._profiler = torch.profiler.profile(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            record_shapes=config.torch_profiler_record_shapes,
+            profile_memory=config.torch_profiler_with_memory,
+            with_stack=config.torch_profiler_with_stack,
+            with_flops=config.torch_profiler_with_flops,
+            on_trace_ready=export_trace,
+        )
+
+    def start(self):
+        """Arm the profiler; recording begins immediately when the delay is zero."""
+        if not self._active:
+            self._active = True
+            if self._delay_iters == 0:
+                self._call_start()
+
+    def stop(self):
+        """Disarm the profiler, reset the iteration counters and end any recording."""
+        if self._active:
+            self._active = False
+            self._active_iteration_count = self._profiling_for_iters = 0
+            if self._running:
+                self._call_stop()
+
+    def step(self):
+        """Advance one iteration, applying the start delay and the iteration cap."""
+        if not self._active:
+            return
+        self._active_iteration_count += 1
+        delay_elapsed = self._delay_iters > 0 and self._active_iteration_count == self._delay_iters
+        if delay_elapsed and not self._running:
+            self._call_start()
+        if not self._running:
+            return
+        self._profiling_for_iters += 1
+        if 0 < self._max_iters < self._profiling_for_iters:
+            self._call_stop()
+
+    def shutdown(self):
+        """Call stop() when a recording is currently in progress."""
+        if self._running:
+            self.stop()
+
+    @property
+    def is_running(self) -> bool:
+        """Whether the underlying torch profiler is currently recording."""
+        return self._running
+
+    def _call_start(self):
+        """Start the torch profiler; a failure is logged and leaves it not running."""
+        try:
+            self._profiler.start()
+        except Exception as exc:
+            logger.warning("Failed to start profiler: %s", exc)
+        else:
+            self._running = True
+
+    def _call_stop(self):
+        """Stop the torch profiler and, if configured, dump the CUDA time table; errors are logged."""
+        try:
+            self._profiler.stop()
+            if self._config.torch_profiler_dump_cuda_time_total:
+                stats = self._profiler.key_averages().table(sort_by="self_cuda_time_total")
+                stats_name = f"profiler_out_{self._local_rank}.txt"
+                stats_path = os.path.join(self._config.torch_profiler_dir, stats_name)
+                with open(stats_path, "w") as sink:
+                    print(stats, file=sink)
+                if self._local_rank == 0:
+                    print(stats)
+        except Exception as exc:
+            logger.warning("Failed to stop profiler: %s", exc)
+        self._running = False  # cleared even when stopping failed

From 544ee9cc01e8f16f2e1eaa87f16f02687c29f5f7 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 11 Feb 2026 04:20:35 +0800
Subject: [PATCH 2/6] precommit

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 docs/PR_DESCRIPTION.md | 489 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 489 insertions(+)
 create mode 100644 docs/PR_DESCRIPTION.md

diff --git a/docs/PR_DESCRIPTION.md b/docs/PR_DESCRIPTION.md
new file mode 100644
index 00000000000..9ba45b07787
--- /dev/null
+++ b/docs/PR_DESCRIPTION.md
@@ -0,0 +1,489 @@
+# [Feature] Unified Profiler with Online Serving and Stage-Aware Endpoints
+
+## Summary
+
+- Consolidate the scattered diffusion-only profiler (`vllm_omni/diffusion/profiler/`) into a unified `vllm_omni/profiler/` module that works across all stage types (LLM, diffusion, omni-modality)
+- Add stage-aware HTTP profiler endpoints (`/start_profile`, `/stop_profile`) for the online API server, following upstream vLLM's API shape and extending it with an optional `stages` parameter for multi-stage pipeline profiling
+- Wire up `ProfilerConfig` end-to-end: CLI args → `AsyncOmni` → per-stage workers via `to_dict()`/`from_dict()` serialization
+- Add `--profile-dir` CLI argument to all offline inference examples (text-to-image, image-to-video, qwen2.5-omni, qwen3-omni, etc.)
+
+## Changes
+
+### New files
+| File | Description |
+|------|-------------|
+| `vllm_omni/profiler/__init__.py` | Unified profiler package (replaces `vllm_omni/diffusion/profiler/`) |
+| `vllm_omni/profiler/config.py` | `ProfilerConfig` dataclass with `to_dict()`/`from_dict()`/`from_any()` serialization |
+| `vllm_omni/profiler/torch_profiler.py` | `TorchProfiler` class aligned with upstream vLLM 0.16.0 semantics |
+| `vllm_omni/entrypoints/serve/profile/api_router.py` | Stage-aware `/start_profile` and `/stop_profile` HTTP endpoints |
+| `tests/profiler/test_config.py` | Unit tests for `ProfilerConfig` |
+| `tests/profiler/test_torch_profiler.py` | Unit tests for `TorchProfiler` (CUDA + CPU) |
+
+### Deleted files
+| File | Reason |
+|------|--------|
+| `vllm_omni/diffusion/profiler/base.py` | Replaced by `vllm_omni/profiler/torch_profiler.py` |
+| `vllm_omni/diffusion/profiler/torch_profiler.py` | Replaced by `vllm_omni/profiler/torch_profiler.py` |
+
+### Modified files
+| File | Change |
+|------|--------|
+| `vllm_omni/entrypoints/omni.py` | `OmniBase.__init__` accepts `profiler_config`, `start_profile(stages)` / `stop_profile(stages)` methods |
+| `vllm_omni/entrypoints/omni_llm.py` | `OmniLLM` accepts `profiler_config`, single-stage `start_profile()` / `stop_profile()` |
+| `vllm_omni/entrypoints/omni_stage.py` | Stage workers handle `PROFILER_START`/`PROFILER_STOP` tasks via `TorchProfiler` |
+| `vllm_omni/entrypoints/omni_diffusion.py` | Uses unified profiler module |
+| `vllm_omni/entrypoints/openai/api_server.py` | Replaces upstream profiler routes with stage-aware versions; converts `profiler_config` for `AsyncOmni` |
+| `vllm_omni/diffusion/diffusion_engine.py` | Uses unified profiler module for diffusion engine profiling |
+| `vllm_omni/diffusion/worker/diffusion_worker.py` | Uses unified profiler module |
+| `vllm_omni/config/__init__.py` | Re-exports `ProfilerConfig` |
+| `docs/contributing/profiling.md` | Full documentation for offline and online profiling |
+| `examples/offline_inference/*/` | All examples now support `--profile-dir` CLI flag |
+
+## Architecture
+
+### Class Hierarchy
+
+```
+ProfilerConfig (vllm_omni/profiler/config.py)     TorchProfiler (vllm_omni/profiler/torch_profiler.py)
+       │    to_dict() / from_dict() / from_any()          │    start() / stop() / step() / shutdown()
+       │                                                   │
+       │  used by all paths below                          │  instantiated in each worker process
+       ▼                                                   ▼
+  ┌──────────────────────────────────────────────────────────────┐
+  │                     Entry Points                             │
+  ├────────────────────┬────────────────────┬────────────────────┤
+  │  Multi-Stage       │  Single-Stage LLM  │  Single-Stage      │
+  │  (Qwen-Omni)       │  (OmniLLM)         │  Diffusion         │
+  │                    │                    │  (OmniDiffusion)   │
+  │  OmniBase          │  OmniLLM(LLM)     │  OmniDiffusion     │
+  │   ├─ Omni          │                    │   └─ DiffusionEngine│
+  │   └─ AsyncOmni     │                    │       └─ Workers   │
+  └────────────────────┴────────────────────┴────────────────────┘
+```
+
+### Multi-Stage Profiling Flow (Qwen2.5-Omni / Qwen3-Omni)
+
+This is the primary flow for online serving and offline omni-modality inference.
+Example: `stages=[0]` profiles only the Thinker stage.
+
+```
+  Online Serving                                 Offline Inference
+  ──────────────                                 ─────────────────
+
+  curl -X POST /start_profile                    omni = Omni(
+    -d '{"stages": [0]}'                           model="Qwen/Qwen2.5-Omni-7B",
+         │                                         profiler_config=ProfilerConfig(
+         ▼                                           profiler="torch",
+  api_router.py                                      torch_profiler_dir="./profiles"))
+  ProfileRequest{stages:[0]}                         │
+         │                                     omni.start_profile(stages=[0])
+         ▼                                           │
+  AsyncOmni.start_profile(stages=[0])                │
+  (async wrapper → calls super())                    │
+         │                                           │
+         └──────────────┬────────────────────────────┘
+                        │
+                        ▼
+              OmniBase.start_profile(stages=[0])
+              │
+              │  for stage_id in [0]:
+              │    task = {"type": PROFILER_START,
+              │            "config": self._profiler_config.to_dict()}
+              │    self.stage_list[0].submit(task)
+              │
+              ▼  (only stage 0 receives task; stages 1,2 are skipped)
+  ┌─────────────────┐  ┌─────────────────┐  ┌─────────────────┐
+  │    Stage 0       │  │    Stage 1       │  │    Stage 2       │
+  │   (Thinker)      │  │   (Talker)       │  │   (Code2Wav)     │
+  │                  │  │                  │  │                  │
+  │  in_q.get()      │  │  (no task)       │  │  (no task)       │
+  │       │          │  │                  │  │                  │
+  │       ▼          │  │                  │  │                  │
+  │  Worker Process  │  │                  │  │                  │
+  │  (omni_stage.py) │  │                  │  │                  │
+  └───────┬──────────┘  └──────────────────┘  └──────────────────┘
+          │
+          ▼
+  handle_profiler_task_local(task)  ──or──  handle_profiler_task_async(task)
+  (sync worker: diffusion stages)          (async worker: LLM stages)
+          │
+          ├─ config = ProfilerConfig.from_dict(task["config"])
+          ├─ profiler = TorchProfiler(config, worker_name="stage-0")
+          └─ profiler.start()
+          │
+  ════════╪══════════════════════════════════════════════════
+          │  Requests flow through stage 0 with profiling active
+          │  torch.profiler captures CPU/CUDA activity per iteration
+  ════════╪══════════════════════════════════════════════════
+          │
+  ┌───────┴───────── STOP FLOW ─────────────────────────────┐
+  │                                                          │
+  │  curl -X POST /stop_profile     omni.stop_profile()     │
+  │    -d '{"stages": [0]}'              │                   │
+  │         │                            │                   │
+  │         └────────────┬───────────────┘                   │
+  │                      ▼                                   │
+  │  OmniBase.stop_profile(stages=[0])                       │
+  │  │                                                       │
+  │  │  stage.stop_profile()                                 │
+  │  │    └─ submit(PROFILER_STOP)                           │
+  │  │    └─ out_q.get(timeout=600)  # wait for worker reply │
+  │  │                                                       │
+  │  └───────────► Worker Process receives PROFILER_STOP     │
+  │                      │                                   │
+  │                      ▼                                   │
+  │                profiler.stop()                           │
+  │                      │                                   │
+  │                      ├─ Flush trace via tensorboard_trace_handler
+  │                      ├─ Write CUDA time stats table      │
+  │                      └─ out_q.put({"type":"profiler_result"})
+  │                                                          │
+  └──────────────────────────────────────────────────────────┘
+          │
+          ▼
+  Output Files (torch_profiler_dir):
+  ├── stage-0.*.pt.trace.json.gz  # TensorBoard / Perfetto trace
+  └── profiler_out_0.txt          # CUDA time stats (key_averages table)
+```
+
+### Single-Stage LLM Flow (OmniLLM)
+
+For single-stage LLM-only models. TorchProfiler runs in-process (no IPC needed).
+
+```
+  omni_llm = OmniLLM(profiler_config=ProfilerConfig(...))
+       │
+  omni_llm.start_profile()
+       │
+       ├─ TorchProfiler(config, worker_name="llm-rank-0").start()
+       │   (created directly in the same process)
+       │
+       │  ... requests ...
+       │
+  omni_llm.stop_profile()
+       │
+       └─ profiler.stop() → trace files written
+```
+
+### Single-Stage Diffusion Flow (OmniDiffusion / DiffusionEngine)
+
+For standalone diffusion models. Profiler is distributed to GPU workers via `collective_rpc`.
+
+```
+  omni_diff = OmniDiffusion(profiler_config=ProfilerConfig(...))
+       │
+  omni_diff.start_profile()
+       │
+       └─ DiffusionEngine.start_profile(config)
+            │
+            └─ collective_rpc("start_profile", args=(config.to_dict(),))
+                 │
+                 ├─► DiffusionWorker rank 0: TorchProfiler(config).start()
+                 ├─► DiffusionWorker rank 1: TorchProfiler(config).start()
+                 └─► ...
+                 │
+                 │  ... generation ...
+                 │
+  omni_diff.stop_profile()
+       │
+       └─ DiffusionEngine.stop_profile()
+            │
+            └─ collective_rpc("stop_profile")
+                 │
+                 ├─► Worker rank 0: profiler.stop() → trace files
+                 └─► Worker rank 1: profiler.stop() → trace files
+
+  Output Files:
+  ├── diffusion-rank-0.*.pt.trace.json.gz
+  ├── diffusion-rank-1.*.pt.trace.json.gz
+  ├── profiler_out_0.txt
+  └── profiler_out_1.txt
+```
+
+### Online Serving Config Conversion
+
+When the API server starts, upstream's `--profiler-config` CLI arg is converted to our `ProfilerConfig`:
+
+```
+  vllm serve --profiler-config profiler=torch,torch_profiler_dir=./profiles
+       │
+       ▼
+  build_async_omni_from_stage_config(args)
+       │
+       ├─ upstream_config = args.profiler_config   (vllm.config.ProfilerConfig)
+       ├─ our_config = OmniProfilerConfig.from_any(upstream_config)
+       │     └─ converts to vllm_omni.profiler.ProfilerConfig
+       └─ AsyncOmni(model=..., profiler_config=our_config)
+              │
+              └─ OmniBase.__init__ stores self._profiler_config
+                   │
+                   └─ start_profile() serializes via .to_dict() for each stage worker
+```
+
+Upstream's profiler routes are replaced at server startup:
+
+```
+  app = build_openai_app(args)          # upstream registers /start_profile, /stop_profile
+       │
+  _remove_route_from_router(app, "/start_profile")   # remove upstream routes
+  _remove_route_from_router(app, "/stop_profile")
+       │
+  attach_profile_router(app)            # register our stage-aware routes
+       │                                # (checks app.state.args.profiler_config)
+       ▼
+  Our /start_profile accepts: {"stages": [0,1,2]} or empty body (all stages)
+  Our /stop_profile  accepts: {"stages": [0,1,2]} or empty body (all stages)
+```
+
+## Test Plan
+
+### Unit Tests (no GPU needed)
+
+```bash
+# 1. ProfilerConfig: defaults, validation, to_dict, from_dict, roundtrip, dir expansion
+# 2. ProfilerConfig.from_any(): None, own instance, dict, upstream-like object, profiler=None
+# 3. ProfilerConfig re-export from vllm_omni.config
+pytest tests/profiler/test_config.py -v
+
+# 4. API router: attach_router conditional (profiler set / None / profiler=None)
+# 5. POST /start_profile: no body (all stages), stages=[0], stages=[0,2]
+# 6. POST /stop_profile: no body (all stages), stages=[1]
+# 7. Verifies engine_client.start_profile/stop_profile called with correct stages arg
+pytest tests/profiler/test_api_router.py -v
+```
+
+### Unit Tests (CUDA required)
+
+```bash
+# 8.  TorchProfiler start/stop lifecycle
+# 9.  stop without start is no-op
+# 10. double start is no-op
+# 11. shutdown stops running profiler
+# 12. step() iteration counting
+# 13. delay_iterations: profiler starts only after N steps
+# 14. max_iterations: profiler auto-stops after N steps
+# 15. config-driven settings (record_shapes, memory, flops)
+# 16. Trace files (.trace.json.gz) written and non-empty
+# 17. CUDA time stats (profiler_out_0.txt) written and non-empty
+# 18. worker_name appears in trace filename (e.g. "stage-0")
+pytest tests/profiler/test_torch_profiler.py -v
+```
+
+### Integration: Offline Diffusion (Text-to-Image)
+
+```bash
+# 19. Single-stage diffusion profiling end-to-end
+python examples/offline_inference/text_to_image/text_to_image.py \
+    --model Tongyi-MAI/Z-Image-Turbo \
+    --profile-dir ./profiles/t2i
+
+# Verify:
+ls ./profiles/t2i/
+# - *.trace.json.gz exists and size > 0
+# - profiler_out_*.txt exists and size > 0
+python -c "
+import glob, os
+traces = glob.glob('./profiles/t2i/*.trace.json.gz')
+stats = glob.glob('./profiles/t2i/profiler_out_*.txt')
+assert len(traces) >= 1, f'No traces: {os.listdir(\"./profiles/t2i\")}'
+assert len(stats) >= 1, f'No stats: {os.listdir(\"./profiles/t2i\")}'
+assert all(os.path.getsize(f) > 0 for f in traces + stats)
+print(f'OK: {len(traces)} traces, {len(stats)} stats files')
+"
+```
+
+### Integration: Offline Qwen2.5-Omni (3-stage, stage-selective)
+
+```bash
+# 20. Multi-stage profiling — stage 0 (Thinker) only
+python examples/offline_inference/qwen2_5_omni/end2end.py \
+    --model Qwen/Qwen2.5-Omni-7B \
+    --profile-dir ./profiles/qwen25
+
+# Verify:
+ls ./profiles/qwen25/
+# - stage-0.*.pt.trace.json.gz exists (Thinker trace)
+# - No stage-1.* or stage-2.* files (only stage 0 was profiled)
+python -c "
+import glob, os
+s0 = glob.glob('./profiles/qwen25/stage-0*')
+s1 = glob.glob('./profiles/qwen25/stage-1*')
+s2 = glob.glob('./profiles/qwen25/stage-2*')
+assert len(s0) >= 1, f'No stage-0 traces: {os.listdir(\"./profiles/qwen25\")}'
+assert len(s1) == 0, f'Unexpected stage-1 traces: {s1}'
+assert len(s2) == 0, f'Unexpected stage-2 traces: {s2}'
+print(f'OK: {len(s0)} stage-0 traces, no stage-1/2 traces')
+"
+```
+
+### Integration: Offline Qwen3-Omni (3-stage)
+
+```bash
+# 21. Multi-stage profiling — Qwen3-Omni variant
+python examples/offline_inference/qwen3_omni/end2end.py \
+    --model Qwen/Qwen3-Omni-8B \
+    --profile-dir ./profiles/qwen3
+
+ls ./profiles/qwen3/
+# Expected: stage-0.*.pt.trace.json.gz
+```
+
+### Integration: Online Serving Startup
+
+```bash
+# 22. Server starts with profiling enabled
+python -m vllm_omni.entrypoints.openai.api_server \
+    --model Qwen/Qwen2.5-Omni-7B \
+    --profiler-config profiler=torch,torch_profiler_dir=./profiles/online
+
+# Verify server logs contain:
+#   "Profiler with mode 'torch' is enabled in the API server..."
+```
+
+### Integration: Online Profile All Stages
+
+```bash
+# 23. Profile all stages via HTTP (no stages param)
+curl -X POST http://localhost:8000/start_profile
+# Expected: HTTP 200
+
+# Send requests to exercise all stages
+curl -X POST http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-Omni-7B",
+        "messages": [{"role": "user", "content": "Hello, how are you?"}]
+    }'
+
+curl -X POST http://localhost:8000/stop_profile
+# Expected: HTTP 200
+
+# Verify traces for ALL stages:
+ls ./profiles/online/
+# Expected: stage-0.*, stage-1.*, stage-2.* trace files
+python -c "
+import glob
+for i in range(3):
+    files = glob.glob(f'./profiles/online/stage-{i}*')
+    print(f'stage-{i}: {len(files)} files')
+    assert len(files) >= 1, f'Missing stage-{i} traces'
+print('OK: all 3 stages have traces')
+"
+```
+
+### Integration: Online Stage-Selective Profiling
+
+```bash
+# 24. Profile only Stage 0 (Thinker)
+rm -rf ./profiles/online/*  # clean from previous test
+
+curl -X POST http://localhost:8000/start_profile \
+    -H "Content-Type: application/json" \
+    -d '{"stages": [0]}'
+
+curl -X POST http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-Omni-7B",
+        "messages": [{"role": "user", "content": "Tell me a joke"}]
+    }'
+
+curl -X POST http://localhost:8000/stop_profile \
+    -H "Content-Type: application/json" \
+    -d '{"stages": [0]}'
+
+# Verify ONLY stage-0 traces:
+python -c "
+import glob
+s0 = glob.glob('./profiles/online/stage-0*')
+s1 = glob.glob('./profiles/online/stage-1*')
+s2 = glob.glob('./profiles/online/stage-2*')
+assert len(s0) >= 1, f'Missing stage-0 traces'
+assert len(s1) == 0, f'Unexpected stage-1 traces: {s1}'
+assert len(s2) == 0, f'Unexpected stage-2 traces: {s2}'
+print(f'OK: only stage-0 ({len(s0)} files), no stage-1/2')
+"
+```
+
+### Integration: Online Profile Talker + Code2Wav
+
+```bash
+# 25. Profile stages 1 and 2 only
+rm -rf ./profiles/online/*
+
+curl -X POST http://localhost:8000/start_profile \
+    -H "Content-Type: application/json" \
+    -d '{"stages": [1, 2]}'
+
+# Send requests...
+curl -X POST http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-Omni-7B",
+        "messages": [{"role": "user", "content": "Count to five"}]
+    }'
+
+curl -X POST http://localhost:8000/stop_profile \
+    -H "Content-Type: application/json" \
+    -d '{"stages": [1, 2]}'
+
+# Verify:
+python -c "
+import glob
+s0 = glob.glob('./profiles/online/stage-0*')
+s1 = glob.glob('./profiles/online/stage-1*')
+s2 = glob.glob('./profiles/online/stage-2*')
+assert len(s0) == 0, f'Unexpected stage-0 traces: {s0}'
+assert len(s1) >= 1, f'Missing stage-1 traces'
+assert len(s2) >= 1, f'Missing stage-2 traces'
+print(f'OK: no stage-0, stage-1 ({len(s1)} files), stage-2 ({len(s2)} files)')
+"
+```
+
+### Negative Tests
+
+```bash
+# 26. Server WITHOUT --profiler-config: endpoints should not exist
+python -m vllm_omni.entrypoints.openai.api_server \
+    --model Qwen/Qwen2.5-Omni-7B
+# (in another terminal)
+curl -X POST http://localhost:8000/start_profile
+# Expected: 404 Not Found or 405 Method Not Allowed
+
+# 27. Offline: start_profile without profiler_config raises ValueError
+python -c "
+from vllm_omni.entrypoints.omni_llm import OmniLLM
+try:
+    llm = OmniLLM.__new__(OmniLLM)
+    llm._profiler_config = None
+    llm._profiler_instance = None
+    llm.start_profile()
+    assert False, 'Should have raised ValueError'
+except ValueError as e:
+    print(f'OK: {e}')
+"
+```
+
+### Trace Viewing
+
+```bash
+# 28. Verify trace files load in Perfetto
+#     Upload any .trace.json.gz to https://ui.perfetto.dev/
+#     Should render timeline with CPU/CUDA activity
+```
+
+### Checklist
+
+- [ ] `pytest tests/profiler/test_config.py -v` — all 14 tests pass (config, from_any, re-export)
+- [ ] `pytest tests/profiler/test_api_router.py -v` — all 8 tests pass (endpoints, attach_router)
+- [ ] `pytest tests/profiler/test_torch_profiler.py -v` — all 13 tests pass (lifecycle, trace output)
+- [ ] Offline text-to-image: traces written, non-empty
+- [ ] Offline Qwen2.5-Omni: stage-0 traces only (no stage-1/2)
+- [ ] Offline Qwen3-Omni: stage-0 traces
+- [ ] Online server starts with profiler warning
+- [ ] Online `/start_profile` → 200, `/stop_profile` → 200, all stage traces written
+- [ ] Online `{"stages": [0]}` → only stage-0 traces
+- [ ] Online `{"stages": [1,2]}` → only stage-1/2 traces
+- [ ] Server without `--profiler-config` → 404 on `/start_profile`
+- [ ] Offline without `profiler_config` → ValueError
+- [ ] Traces load in Perfetto UI

From 662f9c6da7580aa76827d4a10455805fcde8da55 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 11 Feb 2026 04:30:23 +0800
Subject: [PATCH 3/6] precommit

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../offline_inference/qwen3_omni/end2end.py   |  4 ++--
 .../offline_inference/qwen3_tts/end2end.py    |  2 +-
 pyproject.toml                                |  3 +++
 tests/e2e/online_serving/test_async_omni.py   |  2 +-
 .../openai_api/test_image_server.py           |  2 +-
 tests/entrypoints/test_omni_llm.py            |  2 +-
 vllm_omni/benchmarks/patch/patch.py           | 21 +++++++++++++------
 vllm_omni/entrypoints/omni.py                 |  8 +++++--
 vllm_omni/entrypoints/omni_stage.py           |  9 ++++++--
 vllm_omni/entrypoints/openai/api_server.py    | 12 ++++++++---
 vllm_omni/inputs/data.py                      |  1 +
 11 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py
index f00248a696e..313f9dcd891 100644
--- a/examples/offline_inference/qwen3_omni/end2end.py
+++ b/examples/offline_inference/qwen3_omni/end2end.py
@@ -11,15 +11,15 @@
 import librosa
 import numpy as np
 import soundfile as sf
+import vllm
 from PIL import Image
+from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset, video_to_ndarrays
 from vllm.multimodal.image import convert_image_mode
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
-import vllm
-from vllm import SamplingParams
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.profiler import ProfilerConfig
 
diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py
index 94442d1cae9..4818eb6140a 100644
--- a/examples/offline_inference/qwen3_tts/end2end.py
+++ b/examples/offline_inference/qwen3_tts/end2end.py
@@ -12,9 +12,9 @@
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
+from vllm import SamplingParams
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
-from vllm import SamplingParams
 from vllm_omni import Omni
 from vllm_omni.profiler import ProfilerConfig
 
diff --git a/pyproject.toml b/pyproject.toml
index 6af1939536c..3aa832b70db 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -117,6 +117,9 @@ ignore = [
     "N812",  # lowercase imported as non-lowercase: functional as F
 ]
 
+[tool.ruff.lint.isort]
+known-third-party = ["vllm"]
+
 [tool.ruff.lint.per-file-ignores]
 "examples/**" = ["E501"]  # Allow long lines in examples
 "tests/**" = ["E501"]  # Allow long lines in tests
diff --git a/tests/e2e/online_serving/test_async_omni.py b/tests/e2e/online_serving/test_async_omni.py
index 55ccfc3359c..cab3e6e2286 100644
--- a/tests/e2e/online_serving/test_async_omni.py
+++ b/tests/e2e/online_serving/test_async_omni.py
@@ -5,10 +5,10 @@
 from pathlib import Path
 
 import pytest
+from vllm import SamplingParams
 from vllm.inputs import PromptType
 
 from tests.utils import hardware_test
-from vllm import SamplingParams
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index 49031ee97f9..0c6479ccea7 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -15,8 +15,8 @@
 import pytest
 from fastapi.testclient import TestClient
 from PIL import Image
-
 from vllm import SamplingParams
+
 from vllm_omni.entrypoints.openai.image_api_utils import (
     encode_image_base64,
     parse_size,
diff --git a/tests/entrypoints/test_omni_llm.py b/tests/entrypoints/test_omni_llm.py
index 33fd002e73a..4f05575ca59 100644
--- a/tests/entrypoints/test_omni_llm.py
+++ b/tests/entrypoints/test_omni_llm.py
@@ -5,8 +5,8 @@
 from unittest.mock import MagicMock
 
 import pytest
-
 from vllm import SamplingParams
+
 from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py
index faf923ec198..5c4ce8f6af2 100644
--- a/vllm_omni/benchmarks/patch/patch.py
+++ b/vllm_omni/benchmarks/patch/patch.py
@@ -17,6 +17,7 @@
 from pydub import AudioSegment
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+from vllm.benchmarks import datasets
 from vllm.benchmarks.datasets import SampleRequest
 from vllm.benchmarks.lib.endpoint_request_func import (
     ASYNC_REQUEST_FUNCS,
@@ -31,10 +32,10 @@
 )
 from vllm.logger import init_logger
 
-from vllm.benchmarks import datasets
-
 logger = init_logger(__name__)
-from vllm_omni.benchmarks.data_modules.random_multi_modal_dataset import OmniRandomMultiModalDataset
+from vllm_omni.benchmarks.data_modules.random_multi_modal_dataset import (
+    OmniRandomMultiModalDataset,
+)
 
 get_samples_old = datasets.get_samples
 
@@ -202,10 +203,18 @@ async def async_request_openai_chat_omni_completions(
 
 # ruff: noqa: E402
 # Prevent import order from causing patch failures
-from vllm.benchmarks.serve import TaskType, calculate_metrics_for_embeddings, get_request, wait_for_endpoint
-
 from vllm.benchmarks import serve
-from vllm_omni.benchmarks.metrics.metrics import MultiModalsBenchmarkMetrics, calculate_metrics
+from vllm.benchmarks.serve import (
+    TaskType,
+    calculate_metrics_for_embeddings,
+    get_request,
+    wait_for_endpoint,
+)
+
+from vllm_omni.benchmarks.metrics.metrics import (
+    MultiModalsBenchmarkMetrics,
+    calculate_metrics,
+)
 
 # ruff: noqa: E402
 
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index 5b224b4430f..23678502896 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -13,9 +13,9 @@
 import huggingface_hub
 from omegaconf import OmegaConf
 from tqdm.auto import tqdm
+from vllm import SamplingParams
 from vllm.logger import init_logger
 
-from vllm import SamplingParams
 from vllm_omni.distributed.omni_connectors import (
     get_stage_connector_config,
     initialize_orchestrator_connectors,
@@ -39,7 +39,11 @@
     load_stage_configs_from_yaml,
     resolve_model_config_path,
 )
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType, OmniSamplingParams
+from vllm_omni.inputs.data import (
+    OmniDiffusionSamplingParams,
+    OmniPromptType,
+    OmniSamplingParams,
+)
 from vllm_omni.metrics import OrchestratorAggregator, StageRequestStats
 from vllm_omni.model_executor.model_loader.weight_utils import (
     download_weights_from_hf_specific,
diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py
index f3034186804..c6448c62d8f 100644
--- a/vllm_omni/entrypoints/omni_stage.py
+++ b/vllm_omni/entrypoints/omni_stage.py
@@ -23,6 +23,7 @@
 if TYPE_CHECKING:
     from vllm_omni.profiler import ProfilerConfig
 
+from vllm import PromptType, RequestOutput
 from vllm.inputs import TextPrompt
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
@@ -33,7 +34,6 @@
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.llm_engine import LLMEngine
 
-from vllm import PromptType, RequestOutput
 from vllm_omni.diffusion.data import OmniDiffusionConfig
 from vllm_omni.distributed.omni_connectors import build_stage_connectors
 from vllm_omni.distributed.omni_connectors.adapter import try_recv_via_connector
@@ -52,7 +52,12 @@
     maybe_dump_to_shm,
     set_stage_devices,
 )
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType, OmniSamplingParams, OmniTokensPrompt
+from vllm_omni.inputs.data import (
+    OmniDiffusionSamplingParams,
+    OmniPromptType,
+    OmniSamplingParams,
+    OmniTokensPrompt,
+)
 from vllm_omni.metrics import count_tokens_from_outputs
 from vllm_omni.outputs import OmniRequestOutput
 
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 804df69a0fd..a09347d6387 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -24,6 +24,7 @@
 from PIL import Image
 from starlette.datastructures import State
 from starlette.routing import Route
+from vllm import SamplingParams
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
 from vllm.entrypoints.chat_utils import load_chat_template
@@ -73,7 +74,6 @@
 from vllm.tool_parsers import ToolParserManager
 from vllm.utils.system_utils import decorate_logs
 
-from vllm import SamplingParams
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 from vllm_omni.entrypoints.openai.image_api_utils import (
     encode_image_base64,
@@ -87,7 +87,11 @@
 )
 from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
 from vllm_omni.entrypoints.openai.serving_speech import OmniOpenAIServingSpeech
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniSamplingParams, OmniTextPrompt
+from vllm_omni.inputs.data import (
+    OmniDiffusionSamplingParams,
+    OmniSamplingParams,
+    OmniTextPrompt,
+)
 from vllm_omni.lora.request import LoRARequest
 from vllm_omni.lora.utils import stable_lora_int_id
 
@@ -216,7 +220,9 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None,
         _remove_route_from_router(app, "/start_profile", {"POST"})
         _remove_route_from_router(app, "/stop_profile", {"POST"})
 
-        from vllm_omni.entrypoints.serve.profile.api_router import attach_router as attach_profile_router
+        from vllm_omni.entrypoints.serve.profile.api_router import (
+            attach_router as attach_profile_router,
+        )
 
         attach_profile_router(app)
 
diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py
index beafffab88d..bc9434817f6 100644
--- a/vllm_omni/inputs/data.py
+++ b/vllm_omni/inputs/data.py
@@ -4,6 +4,7 @@
 from typing import Any, TypeAlias
 
 from vllm import PromptType, SamplingParams
+
 from vllm_omni.lora.request import LoRARequest
 
 try:

From 985e4d3248a4107110bb1bf1f6fd7664ecc4cc98 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 11 Feb 2026 04:32:49 +0800
Subject: [PATCH 4/6] precommit

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 pyproject.toml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3aa832b70db..6af1939536c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -117,9 +117,6 @@ ignore = [
     "N812",  # lowercase imported as non-lowercase: functional as F
 ]
 
-[tool.ruff.lint.isort]
-known-third-party = ["vllm"]
-
 [tool.ruff.lint.per-file-ignores]
 "examples/**" = ["E501"]  # Allow long lines in examples
 "tests/**" = ["E501"]  # Allow long lines in tests

From 1c1e22add91a2b6eea3f7931b4081944a4de66a4 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 11 Feb 2026 04:33:42 +0800
Subject: [PATCH 5/6] precommit

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 docs/PR_DESCRIPTION.md | 489 -----------------------------------------
 1 file changed, 489 deletions(-)
 delete mode 100644 docs/PR_DESCRIPTION.md

diff --git a/docs/PR_DESCRIPTION.md b/docs/PR_DESCRIPTION.md
deleted file mode 100644
index 9ba45b07787..00000000000
--- a/docs/PR_DESCRIPTION.md
+++ /dev/null
@@ -1,489 +0,0 @@
-# [Feature] Unified Profiler with Online Serving and Stage-Aware Endpoints
-
-## Summary
-
-- Consolidate the scattered diffusion-only profiler (`vllm_omni/diffusion/profiler/`) into a unified `vllm_omni/profiler/` module that works across all stage types (LLM, diffusion, omni-modality)
-- Add stage-aware HTTP profiler endpoints (`/start_profile`, `/stop_profile`) for the online API server, following upstream vLLM's API shape and extending it with an optional `stages` parameter for multi-stage pipeline profiling
-- Wire up `ProfilerConfig` end-to-end: CLI args → `AsyncOmni` → per-stage workers via `to_dict()`/`from_dict()` serialization
-- Add `--profile-dir` CLI argument to all offline inference examples (text-to-image, image-to-video, qwen2.5-omni, qwen3-omni, etc.)
-
-## Changes
-
-### New files
-| File | Description |
-|------|-------------|
-| `vllm_omni/profiler/__init__.py` | Unified profiler package (replaces `vllm_omni/diffusion/profiler/`) |
-| `vllm_omni/profiler/config.py` | `ProfilerConfig` dataclass with `to_dict()`/`from_dict()`/`from_any()` serialization |
-| `vllm_omni/profiler/torch_profiler.py` | `TorchProfiler` class aligned with upstream vLLM 0.16.0 semantics |
-| `vllm_omni/entrypoints/serve/profile/api_router.py` | Stage-aware `/start_profile` and `/stop_profile` HTTP endpoints |
-| `tests/profiler/test_config.py` | Unit tests for `ProfilerConfig` |
-| `tests/profiler/test_torch_profiler.py` | Unit tests for `TorchProfiler` (CUDA + CPU) |
-
-### Deleted files
-| File | Reason |
-|------|--------|
-| `vllm_omni/diffusion/profiler/base.py` | Replaced by `vllm_omni/profiler/torch_profiler.py` |
-| `vllm_omni/diffusion/profiler/torch_profiler.py` | Replaced by `vllm_omni/profiler/torch_profiler.py` |
-
-### Modified files
-| File | Change |
-|------|--------|
-| `vllm_omni/entrypoints/omni.py` | `OmniBase.__init__` accepts `profiler_config`, `start_profile(stages)` / `stop_profile(stages)` methods |
-| `vllm_omni/entrypoints/omni_llm.py` | `OmniLLM` accepts `profiler_config`, single-stage `start_profile()` / `stop_profile()` |
-| `vllm_omni/entrypoints/omni_stage.py` | Stage workers handle `PROFILER_START`/`PROFILER_STOP` tasks via `TorchProfiler` |
-| `vllm_omni/entrypoints/omni_diffusion.py` | Uses unified profiler module |
-| `vllm_omni/entrypoints/openai/api_server.py` | Replaces upstream profiler routes with stage-aware versions; converts `profiler_config` for `AsyncOmni` |
-| `vllm_omni/diffusion/diffusion_engine.py` | Uses unified profiler module for diffusion engine profiling |
-| `vllm_omni/diffusion/worker/diffusion_worker.py` | Uses unified profiler module |
-| `vllm_omni/config/__init__.py` | Re-exports `ProfilerConfig` |
-| `docs/contributing/profiling.md` | Full documentation for offline and online profiling |
-| `examples/offline_inference/*/` | All examples now support `--profile-dir` CLI flag |
-
-## Architecture
-
-### Class Hierarchy
-
-```
-ProfilerConfig (vllm_omni/profiler/config.py)     TorchProfiler (vllm_omni/profiler/torch_profiler.py)
-       │    to_dict() / from_dict() / from_any()          │    start() / stop() / step() / shutdown()
-       │                                                   │
-       │  used by all paths below                          │  instantiated in each worker process
-       ▼                                                   ▼
-  ┌──────────────────────────────────────────────────────────────┐
-  │                     Entry Points                             │
-  ├────────────────────┬────────────────────┬────────────────────┤
-  │  Multi-Stage       │  Single-Stage LLM  │  Single-Stage      │
-  │  (Qwen-Omni)       │  (OmniLLM)         │  Diffusion         │
-  │                    │                    │  (OmniDiffusion)   │
-  │  OmniBase          │  OmniLLM(LLM)     │  OmniDiffusion     │
-  │   ├─ Omni          │                    │   └─ DiffusionEngine│
-  │   └─ AsyncOmni     │                    │       └─ Workers   │
-  └────────────────────┴────────────────────┴────────────────────┘
-```
-
-### Multi-Stage Profiling Flow (Qwen2.5-Omni / Qwen3-Omni)
-
-This is the primary flow for online serving and offline omni-modality inference.
-Example: `stages=[0]` profiles only the Thinker stage.
-
-```
-  Online Serving                                 Offline Inference
-  ──────────────                                 ─────────────────
-
-  curl -X POST /start_profile                    omni = Omni(
-    -d '{"stages": [0]}'                           model="Qwen/Qwen2.5-Omni-7B",
-         │                                         profiler_config=ProfilerConfig(
-         ▼                                           profiler="torch",
-  api_router.py                                      torch_profiler_dir="./profiles"))
-  ProfileRequest{stages:[0]}                         │
-         │                                     omni.start_profile(stages=[0])
-         ▼                                           │
-  AsyncOmni.start_profile(stages=[0])                │
-  (async wrapper → calls super())                    │
-         │                                           │
-         └──────────────┬────────────────────────────┘
-                        │
-                        ▼
-              OmniBase.start_profile(stages=[0])
-              │
-              │  for stage_id in [0]:
-              │    task = {"type": PROFILER_START,
-              │            "config": self._profiler_config.to_dict()}
-              │    self.stage_list[0].submit(task)
-              │
-              ▼  (only stage 0 receives task; stages 1,2 are skipped)
-  ┌─────────────────┐  ┌─────────────────┐  ┌─────────────────┐
-  │    Stage 0       │  │    Stage 1       │  │    Stage 2       │
-  │   (Thinker)      │  │   (Talker)       │  │   (Code2Wav)     │
-  │                  │  │                  │  │                  │
-  │  in_q.get()      │  │  (no task)       │  │  (no task)       │
-  │       │          │  │                  │  │                  │
-  │       ▼          │  │                  │  │                  │
-  │  Worker Process  │  │                  │  │                  │
-  │  (omni_stage.py) │  │                  │  │                  │
-  └───────┬──────────┘  └──────────────────┘  └──────────────────┘
-          │
-          ▼
-  handle_profiler_task_local(task)  ──or──  handle_profiler_task_async(task)
-  (sync worker: diffusion stages)          (async worker: LLM stages)
-          │
-          ├─ config = ProfilerConfig.from_dict(task["config"])
-          ├─ profiler = TorchProfiler(config, worker_name="stage-0")
-          └─ profiler.start()
-          │
-  ════════╪══════════════════════════════════════════════════
-          │  Requests flow through stage 0 with profiling active
-          │  torch.profiler captures CPU/CUDA activity per iteration
-  ════════╪══════════════════════════════════════════════════
-          │
-  ┌───────┴───────── STOP FLOW ─────────────────────────────┐
-  │                                                          │
-  │  curl -X POST /stop_profile     omni.stop_profile()     │
-  │    -d '{"stages": [0]}'              │                   │
-  │         │                            │                   │
-  │         └────────────┬───────────────┘                   │
-  │                      ▼                                   │
-  │  OmniBase.stop_profile(stages=[0])                       │
-  │  │                                                       │
-  │  │  stage.stop_profile()                                 │
-  │  │    └─ submit(PROFILER_STOP)                           │
-  │  │    └─ out_q.get(timeout=600)  # wait for worker reply │
-  │  │                                                       │
-  │  └───────────► Worker Process receives PROFILER_STOP     │
-  │                      │                                   │
-  │                      ▼                                   │
-  │                profiler.stop()                           │
-  │                      │                                   │
-  │                      ├─ Flush trace via tensorboard_trace_handler
-  │                      ├─ Write CUDA time stats table      │
-  │                      └─ out_q.put({"type":"profiler_result"})
-  │                                                          │
-  └──────────────────────────────────────────────────────────┘
-          │
-          ▼
-  Output Files (torch_profiler_dir):
-  ├── stage-0_*.trace.json.gz     # TensorBoard / Perfetto trace
-  └── profiler_out_0.txt          # CUDA time stats (key_averages table)
-```
-
-### Single-Stage LLM Flow (OmniLLM)
-
-For single-stage LLM-only models. TorchProfiler runs in-process (no IPC needed).
-
-```
-  omni_llm = OmniLLM(profiler_config=ProfilerConfig(...))
-       │
-  omni_llm.start_profile()
-       │
-       ├─ TorchProfiler(config, worker_name="llm-rank-0").start()
-       │   (created directly in the same process)
-       │
-       │  ... requests ...
-       │
-  omni_llm.stop_profile()
-       │
-       └─ profiler.stop() → trace files written
-```
-
-### Single-Stage Diffusion Flow (OmniDiffusion / DiffusionEngine)
-
-For standalone diffusion models. Profiler is distributed to GPU workers via `collective_rpc`.
-
-```
-  omni_diff = OmniDiffusion(profiler_config=ProfilerConfig(...))
-       │
-  omni_diff.start_profile()
-       │
-       └─ DiffusionEngine.start_profile(config)
-            │
-            └─ collective_rpc("start_profile", args=(config.to_dict(),))
-                 │
-                 ├─► DiffusionWorker rank 0: TorchProfiler(config).start()
-                 ├─► DiffusionWorker rank 1: TorchProfiler(config).start()
-                 └─► ...
-                 │
-                 │  ... generation ...
-                 │
-  omni_diff.stop_profile()
-       │
-       └─ DiffusionEngine.stop_profile()
-            │
-            └─ collective_rpc("stop_profile")
-                 │
-                 ├─► Worker rank 0: profiler.stop() → trace files
-                 └─► Worker rank 1: profiler.stop() → trace files
-
-  Output Files:
-  ├── diffusion-rank-0_*.trace.json.gz
-  ├── diffusion-rank-1_*.trace.json.gz
-  ├── profiler_out_0.txt
-  └── profiler_out_1.txt
-```
-
-### Online Serving Config Conversion
-
-When the API server starts, upstream's `--profiler-config` CLI arg is converted to our `ProfilerConfig`:
-
-```
-  vllm serve --profiler-config profiler=torch,torch_profiler_dir=./profiles
-       │
-       ▼
-  build_async_omni_from_stage_config(args)
-       │
-       ├─ upstream_config = args.profiler_config   (vllm.config.ProfilerConfig)
-       ├─ our_config = OmniProfilerConfig.from_any(upstream_config)
-       │     └─ converts to vllm_omni.profiler.ProfilerConfig
-       └─ AsyncOmni(model=..., profiler_config=our_config)
-              │
-              └─ OmniBase.__init__ stores self._profiler_config
-                   │
-                   └─ start_profile() serializes via .to_dict() for each stage worker
-```
-
-Upstream's profiler routes are replaced at server startup:
-
-```
-  app = build_openai_app(args)          # upstream registers /start_profile, /stop_profile
-       │
-  _remove_route_from_router(app, "/start_profile")   # remove upstream routes
-  _remove_route_from_router(app, "/stop_profile")
-       │
-  attach_profile_router(app)            # register our stage-aware routes
-       │                                # (checks app.state.args.profiler_config)
-       ▼
-  Our /start_profile accepts: {"stages": [0,1,2]} or empty body (all stages)
-  Our /stop_profile  accepts: {"stages": [0,1,2]} or empty body (all stages)
-```
-
-## Test Plan
-
-### Unit Tests (no GPU needed)
-
-```bash
-# 1. ProfilerConfig: defaults, validation, to_dict, from_dict, roundtrip, dir expansion
-# 2. ProfilerConfig.from_any(): None, own instance, dict, upstream-like object, profiler=None
-# 3. ProfilerConfig re-export from vllm_omni.config
-pytest tests/profiler/test_config.py -v
-
-# 4. API router: attach_router conditional (profiler set / None / profiler=None)
-# 5. POST /start_profile: no body (all stages), stages=[0], stages=[0,2]
-# 6. POST /stop_profile: no body (all stages), stages=[1]
-# 7. Verifies engine_client.start_profile/stop_profile called with correct stages arg
-pytest tests/profiler/test_api_router.py -v
-```
-
-### Unit Tests (CUDA required)
-
-```bash
-# 8.  TorchProfiler start/stop lifecycle
-# 9.  stop without start is no-op
-# 10. double start is no-op
-# 11. shutdown stops running profiler
-# 12. step() iteration counting
-# 13. delay_iterations: profiler starts only after N steps
-# 14. max_iterations: profiler auto-stops after N steps
-# 15. config-driven settings (record_shapes, memory, flops)
-# 16. Trace files (.trace.json.gz) written and non-empty
-# 17. CUDA time stats (profiler_out_0.txt) written and non-empty
-# 18. worker_name appears in trace filename (e.g. "stage-0")
-pytest tests/profiler/test_torch_profiler.py -v
-```
-
-### Integration: Offline Diffusion (Text-to-Image)
-
-```bash
-# 19. Single-stage diffusion profiling end-to-end
-python examples/offline_inference/text_to_image/text_to_image.py \
-    --model Tongyi-MAI/Z-Image-Turbo \
-    --profile-dir ./profiles/t2i
-
-# Verify:
-ls ./profiles/t2i/
-# - *.trace.json.gz exists and size > 0
-# - profiler_out_*.txt exists and size > 0
-python -c "
-import glob, os
-traces = glob.glob('./profiles/t2i/*.trace.json.gz')
-stats = glob.glob('./profiles/t2i/profiler_out_*.txt')
-assert len(traces) >= 1, f'No traces: {os.listdir(\"./profiles/t2i\")}'
-assert len(stats) >= 1, f'No stats: {os.listdir(\"./profiles/t2i\")}'
-assert all(os.path.getsize(f) > 0 for f in traces + stats)
-print(f'OK: {len(traces)} traces, {len(stats)} stats files')
-"
-```
-
-### Integration: Offline Qwen2.5-Omni (3-stage, stage-selective)
-
-```bash
-# 20. Multi-stage profiling — stage 0 (Thinker) only
-python examples/offline_inference/qwen2_5_omni/end2end.py \
-    --model Qwen/Qwen2.5-Omni-7B \
-    --profile-dir ./profiles/qwen25
-
-# Verify:
-ls ./profiles/qwen25/
-# - stage-0_*.trace.json.gz exists (Thinker trace)
-# - No stage-1_* or stage-2_* files (only stage 0 was profiled)
-python -c "
-import glob, os
-s0 = glob.glob('./profiles/qwen25/stage-0*')
-s1 = glob.glob('./profiles/qwen25/stage-1*')
-s2 = glob.glob('./profiles/qwen25/stage-2*')
-assert len(s0) >= 1, f'No stage-0 traces: {os.listdir(\"./profiles/qwen25\")}'
-assert len(s1) == 0, f'Unexpected stage-1 traces: {s1}'
-assert len(s2) == 0, f'Unexpected stage-2 traces: {s2}'
-print(f'OK: {len(s0)} stage-0 traces, no stage-1/2 traces')
-"
-```
-
-### Integration: Offline Qwen3-Omni (3-stage)
-
-```bash
-# 21. Multi-stage profiling — Qwen3-Omni variant
-python examples/offline_inference/qwen3_omni/end2end.py \
-    --model Qwen/Qwen3-Omni-8B \
-    --profile-dir ./profiles/qwen3
-
-ls ./profiles/qwen3/
-# Expected: stage-0_*.trace.json.gz
-```
-
-### Integration: Online Serving Startup
-
-```bash
-# 22. Server starts with profiling enabled
-python -m vllm_omni.entrypoints.openai.api_server \
-    --model Qwen/Qwen2.5-Omni-7B \
-    --profiler-config profiler=torch,torch_profiler_dir=./profiles/online
-
-# Verify server logs contain:
-#   "Profiler with mode 'torch' is enabled in the API server..."
-```
-
-### Integration: Online Profile All Stages
-
-```bash
-# 23. Profile all stages via HTTP (no stages param)
-curl -X POST http://localhost:8000/start_profile
-# Expected: HTTP 200
-
-# Send requests to exercise all stages
-curl -X POST http://localhost:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": "Qwen/Qwen2.5-Omni-7B",
-        "messages": [{"role": "user", "content": "Hello, how are you?"}]
-    }'
-
-curl -X POST http://localhost:8000/stop_profile
-# Expected: HTTP 200
-
-# Verify traces for ALL stages:
-ls ./profiles/online/
-# Expected: stage-0_*, stage-1_*, stage-2_* trace files
-python -c "
-import glob
-for i in range(3):
-    files = glob.glob(f'./profiles/online/stage-{i}*')
-    print(f'stage-{i}: {len(files)} files')
-    assert len(files) >= 1, f'Missing stage-{i} traces'
-print('OK: all 3 stages have traces')
-"
-```
-
-### Integration: Online Stage-Selective Profiling
-
-```bash
-# 24. Profile only Stage 0 (Thinker)
-rm -rf ./profiles/online/*  # clean from previous test
-
-curl -X POST http://localhost:8000/start_profile \
-    -H "Content-Type: application/json" \
-    -d '{"stages": [0]}'
-
-curl -X POST http://localhost:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": "Qwen/Qwen2.5-Omni-7B",
-        "messages": [{"role": "user", "content": "Tell me a joke"}]
-    }'
-
-curl -X POST http://localhost:8000/stop_profile \
-    -H "Content-Type: application/json" \
-    -d '{"stages": [0]}'
-
-# Verify ONLY stage-0 traces:
-python -c "
-import glob
-s0 = glob.glob('./profiles/online/stage-0*')
-s1 = glob.glob('./profiles/online/stage-1*')
-s2 = glob.glob('./profiles/online/stage-2*')
-assert len(s0) >= 1, f'Missing stage-0 traces'
-assert len(s1) == 0, f'Unexpected stage-1 traces: {s1}'
-assert len(s2) == 0, f'Unexpected stage-2 traces: {s2}'
-print(f'OK: only stage-0 ({len(s0)} files), no stage-1/2')
-"
-```
-
-### Integration: Online Profile Talker + Code2Wav
-
-```bash
-# 25. Profile stages 1 and 2 only
-rm -rf ./profiles/online/*
-
-curl -X POST http://localhost:8000/start_profile \
-    -H "Content-Type: application/json" \
-    -d '{"stages": [1, 2]}'
-
-# Send requests...
-curl -X POST http://localhost:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": "Qwen/Qwen2.5-Omni-7B",
-        "messages": [{"role": "user", "content": "Count to five"}]
-    }'
-
-curl -X POST http://localhost:8000/stop_profile \
-    -H "Content-Type: application/json" \
-    -d '{"stages": [1, 2]}'
-
-# Verify:
-python -c "
-import glob
-s0 = glob.glob('./profiles/online/stage-0*')
-s1 = glob.glob('./profiles/online/stage-1*')
-s2 = glob.glob('./profiles/online/stage-2*')
-assert len(s0) == 0, f'Unexpected stage-0 traces: {s0}'
-assert len(s1) >= 1, f'Missing stage-1 traces'
-assert len(s2) >= 1, f'Missing stage-2 traces'
-print(f'OK: no stage-0, stage-1 ({len(s1)} files), stage-2 ({len(s2)} files)')
-"
-```
-
-### Negative Tests
-
-```bash
-# 26. Server WITHOUT --profiler-config: endpoints should not exist
-python -m vllm_omni.entrypoints.openai.api_server \
-    --model Qwen/Qwen2.5-Omni-7B
-# (in another terminal)
-curl -X POST http://localhost:8000/start_profile
-# Expected: 404 Not Found or 405 Method Not Allowed
-
-# 27. Offline: start_profile without profiler_config raises ValueError
-python -c "
-from vllm_omni.entrypoints.omni_llm import OmniLLM
-try:
-    llm = OmniLLM.__new__(OmniLLM)
-    llm._profiler_config = None
-    llm._profiler_instance = None
-    llm.start_profile()
-    assert False, 'Should have raised ValueError'
-except ValueError as e:
-    print(f'OK: {e}')
-"
-```
-
-### Trace Viewing
-
-```bash
-# 28. Verify trace files load in Perfetto
-#     Upload any .trace.json.gz to https://ui.perfetto.dev/
-#     Should render timeline with CPU/CUDA activity
-```
-
-### Checklist
-
-- [ ] `pytest tests/profiler/test_config.py -v` — all 14 tests pass (config, from_any, re-export)
-- [ ] `pytest tests/profiler/test_api_router.py -v` — all 8 tests pass (endpoints, attach_router)
-- [ ] `pytest tests/profiler/test_torch_profiler.py -v` — all 13 tests pass (lifecycle, trace output)
-- [ ] Offline text-to-image: traces written, non-empty
-- [ ] Offline Qwen2.5-Omni: stage-0 traces only (no stage-1/2)
-- [ ] Offline Qwen3-Omni: stage-0 traces
-- [ ] Online server starts with profiler warning
-- [ ] Online `/start_profile` → 200, `/stop_profile` → 200, all stage traces written
-- [ ] Online `{"stages": [0]}` → only stage-0 traces
-- [ ] Online `{"stages": [1,2]}` → only stage-1/2 traces
-- [ ] Server without `--profiler-config` → 404 on `/start_profile`
-- [ ] Offline without `profiler_config` → ValueError
-- [ ] Traces load in Perfetto UI

From 9d2699e3be9909ff956ae4ae0ed05534411d0ff1 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 11 Feb 2026 05:12:26 +0800
Subject: [PATCH 6/6] fix: add type annotation to **kwargs to fix RTD strict
 build

griffe emits a warning for unannotated **kwargs. mkdocs treats
warnings as errors in strict mode, so the warning fails the docs build.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/entrypoints/omni_diffusion.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py
index 308a37dd25f..0b7f4cac33e 100644
--- a/vllm_omni/entrypoints/omni_diffusion.py
+++ b/vllm_omni/entrypoints/omni_diffusion.py
@@ -4,6 +4,7 @@
 import logging
 import uuid
 from collections.abc import Sequence
+from typing import Any
 
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import get_hf_file_to_dict
@@ -42,7 +43,7 @@ def __init__(
         self,
         od_config: OmniDiffusionConfig | None = None,
         profiler_config: ProfilerConfig | None = None,
-        **kwargs,
+        **kwargs: Any,
     ):
         self._profiler_config = profiler_config