From a740028334c042063bb11961c9689fd6158fafab Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Thu, 23 Apr 2026 11:01:44 +0800
Subject: [PATCH 01/11] modify-log-stats

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 docs/contributing/metrics.md                | 22 ++++++++++++-------
 tests/entrypoints/test_async_omni_abort.py  | 11 ++++++++--
 tests/entrypoints/test_pd_disaggregation.py |  6 ++++--
 tests/metrics/test_stats.py                 |  7 +++++-
 vllm_omni/engine/async_omni_engine.py       |  7 ++++++
 vllm_omni/entrypoints/async_omni.py         |  6 ++++++
 vllm_omni/entrypoints/omni.py               |  4 ++++
 vllm_omni/entrypoints/omni_base.py          |  2 ++
 vllm_omni/metrics/stats.py                  | 24 +++++++++++++++------
 9 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md
index 92dcd92ccac..b021d05b445 100644
--- a/docs/contributing/metrics.md
+++ b/docs/contributing/metrics.md
@@ -3,7 +3,7 @@
 
 You can use these metrics in production to monitor the health and performance of the vLLM-omni system. Typical scenarios include:
 
-- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `e2e_total_ms`), and resource utilization to verify that the system meets expected standards.
+- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `request_latency_ms`), and resource utilization to verify that the system meets expected standards.
 
 - **Debugging and Troubleshooting**: Use detailed per-request metrics to diagnose issues, such as high transfer times or unexpected token counts.
 
@@ -31,9 +31,10 @@ With `--log-stats` enabled, the server will output detailed metrics logs after e
 | Field                       | Value        |
 |-----------------------------|--------------|
 | e2e_requests                | 1            |
-| e2e_wall_time_ms            | 41,299.190   |
+| run_wall_time_ms            | 41,299.190   |
+| request_latency_total_ms    | 41,299.133   |
 | e2e_total_tokens            | 5,202        |
-| e2e_avg_time_per_request_ms | 41,299.190   |
+| avg_run_wall_time_per_request_ms | 41,299.190   |
 | e2e_avg_tokens_per_s        | 125.959      |
 | e2e_stage_0_wall_time_ms    | 10,192.289   |
 | e2e_stage_1_wall_time_ms    | 30,541.409   |
@@ -43,7 +44,8 @@ With `--log-stats` enabled, the server will output detailed metrics logs after e
 
 | Field                   | Value      |
 |-------------------------|------------|
-| e2e_total_ms            | 41,299.133 |
+| request_latency_ms      | 41,299.133 |
+| request_submit_prep_ms  | 0.000      |
 | e2e_total_tokens        | 5,202      |
 | transfers_total_time_ms | 245.895    |
 | transfers_total_kbytes  | 138,089.939|
@@ -98,10 +100,13 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 | Field                     | Meaning                                                                                       |
 |---------------------------|----------------------------------------------------------------------------------------------|
 | `e2e_requests`            | Number of completed requests.                                                                |
-| `e2e_wall_time_ms`        | Wall-clock time span from run start to last completion, in ms.                               |
+| `run_wall_time_ms`        | Wall-clock time span from run start to last completion, in ms.                               |
+| `request_latency_total_ms` | Sum of per-request `request_latency_ms` across completed requests.                           |
+| `request_submit_prep_total_ms` | Sum of pre-submit request preparation time across completed requests.                    |
 | `e2e_total_tokens`        | Total tokens counted across all completed requests (stage0 input + all stage outputs).       |
-| `e2e_avg_time_per_request_ms` | Average wall time per request: `e2e_wall_time_ms / e2e_requests`.                        |
-| `e2e_avg_tokens_per_s`    | Average token throughput over wall time: `e2e_total_tokens * 1000 / e2e_wall_time_ms`.      |
+| `avg_run_wall_time_per_request_ms` | Average wall time per request: `run_wall_time_ms / e2e_requests`.                 |
+| `avg_request_submit_prep_ms` | Average pre-submit request preparation time per completed request.                        |
+| `e2e_avg_tokens_per_s`    | Average token throughput over wall time: `e2e_total_tokens * 1000 / run_wall_time_ms`.      |
 | `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. |
 
 ---
@@ -110,7 +115,8 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 
 | Field                     | Meaning                                                               |
 |---------------------------|-----------------------------------------------------------------------|
-| `e2e_total_ms`            | End-to-end latency in ms.                                             |
+| `request_latency_ms`      | End-to-end latency in ms, starting after request preparation and ending at final completion. |
+| `request_submit_prep_ms`  | Time spent preparing and submitting the request before `request_latency_ms` starts.          |
 | `e2e_total_tokens`        | Total tokens for the request (stage0 input + all stage outputs).      |
 | `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request.                |
 | `transfers_total_kbytes`  | Sum of transfer kbytes for this request.                              |
diff --git a/tests/entrypoints/test_async_omni_abort.py b/tests/entrypoints/test_async_omni_abort.py
index b34652162d0..6ee8a6ef112 100644
--- a/tests/entrypoints/test_async_omni_abort.py
+++ b/tests/entrypoints/test_async_omni_abort.py
@@ -20,8 +20,15 @@ async def fake_add_request_async(*, request_id, prompt, sampling_params_list, fi
         async def fake_abort_async(request_ids):
             aborted_request_batches.append(list(request_ids))
 
-        async def fake_process_results(request_id, metrics, final_stage_id_for_e2e, req_start_ts, wall_start_ts):
-            del metrics, final_stage_id_for_e2e, req_start_ts, wall_start_ts
+        async def fake_process_results(
+            request_id,
+            metrics,
+            final_stage_id_for_e2e,
+            req_start_ts,
+            req_submit_prep_ms,
+            wall_start_ts,
+        ):
+            del metrics, final_stage_id_for_e2e, req_start_ts, req_submit_prep_ms, wall_start_ts
             if request_id.startswith("cancel-"):
                 await asyncio.Future()
                 return
diff --git a/tests/entrypoints/test_pd_disaggregation.py b/tests/entrypoints/test_pd_disaggregation.py
index 5ffabfbf2af..6deb12b0c4a 100644
--- a/tests/entrypoints/test_pd_disaggregation.py
+++ b/tests/entrypoints/test_pd_disaggregation.py
@@ -255,12 +255,14 @@ def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e
             self.accumulated_gen_time_ms = {}
             self.e2e_done = set()
             self.e2e_count = 0
-            self.e2e_total_ms = 0.0
+            self.request_latency_total_ms = 0.0
+            self.request_submit_prep_total_ms = 0.0
 
         def on_stage_metrics(self, stage_id, req_id, metrics, final_output_type=None):
             pass
 
-        def on_finalize_request(self, stage_id, req_id, start_ts):
+        def on_finalize_request(self, stage_id, req_id, start_ts, request_submit_prep_ms=0.0):
+            del start_ts, request_submit_prep_ms
             self.e2e_done.add(req_id)
 
         def on_forward(self, from_stage, to_stage, req_id, size_bytes, tx_ms, use_shm):
diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py
index 8b18c5dffa1..6c38bffd0e2 100644
--- a/tests/metrics/test_stats.py
+++ b/tests/metrics/test_stats.py
@@ -59,6 +59,8 @@ def test_orchestrator_aggregator_builds_summary() -> None:
     summary = agg.build_and_log_summary()
     overall = summary["overall_summary"]
     assert overall["e2e_requests"] == 1
+    assert overall["run_wall_time_ms"] == 70.0
+    assert overall["request_submit_prep_total_ms"] == 0.0
 
     stage_entry = _get_request_entry(summary["stage_table"], "r1")
     stage_ids = [row["stage_id"] for row in stage_entry["stages"]]
@@ -77,7 +79,8 @@ def test_build_and_log_summary_e2e_only() -> None:
     agg.e2e_events.append(
         RequestE2EStats(
             request_id="r",
-            e2e_total_ms=10.0,
+            request_latency_ms=10.0,
+            request_submit_prep_ms=0.0,
             e2e_total_tokens=5,
             transfers_total_time_ms=0.0,
             transfers_total_bytes=0,
@@ -86,6 +89,7 @@ def test_build_and_log_summary_e2e_only() -> None:
 
     summary = agg.build_and_log_summary()
     e2e_entry = _get_request_entry(summary["e2e_table"], "r")
+    assert e2e_entry["request_latency_ms"] == 10.0
     assert e2e_entry["e2e_total_tokens"] == 5
     stage_entry = _get_request_entry(summary["stage_table"], "r")
     assert stage_entry["stages"] == []
@@ -150,6 +154,7 @@ def test_build_and_log_summary_multiple_requests() -> None:
     summary = agg.build_and_log_summary()
     assert len(summary["stage_table"]) == 2
     assert {entry["request_id"] for entry in summary["e2e_table"]} == {"r1", "r2"}
+    assert summary["overall_summary"]["request_latency_total_ms"] > 0.0
     # Check that r1 has two stages and r2 has one
     r1_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r1")
     r2_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r2")
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 8299a577a41..cea726054c9 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -1493,6 +1493,7 @@ def add_request(
         a queue + coroutine-switch round-trip.  The Orchestrator receives a
         ready-to-submit OmniEngineCoreRequest.
         """
+        build_msg_t0 = time.perf_counter()
         msg = self._build_add_request_message(
             request_id=request_id,
             prompt=prompt,
@@ -1508,6 +1509,12 @@ def add_request(
             reasoning_ended=reasoning_ended,
             resumable=resumable,
         )
+        build_msg_ms = (time.perf_counter() - build_msg_t0) * 1000.0
+        logger.info(
+            "[AsyncOmniEngine] req=%s _build_add_request_message took %.3f ms",
+            request_id,
+            build_msg_ms,
+        )
         if self.request_queue is None:
             raise RuntimeError("request_queue is not initialized")
         self.request_queue.sync_q.put_nowait(msg)
diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py
index 119232ce767..ea0e56b6594 100644
--- a/vllm_omni/entrypoints/async_omni.py
+++ b/vllm_omni/entrypoints/async_omni.py
@@ -282,7 +282,9 @@ async def generate(
 
             # Track per-request metrics
             wall_start_ts = time.time()
+            request_prep_start_ts = wall_start_ts
             req_start_ts: dict[str, float] = {}
+            req_submit_prep_ms: dict[str, float] = {}
 
             # Determine the final stage for E2E stats
             final_stage_id_for_e2e = self._compute_final_stage_id(output_modalities)
@@ -321,6 +323,7 @@ async def generate(
                     final_stage_id=final_stage_id_for_e2e,
                 )
             submit_ts = time.time()
+            req_submit_prep_ms[request_id] = (submit_ts - request_prep_start_ts) * 1000.0
             req_state.metrics.stage_first_ts[0] = submit_ts
             req_start_ts[request_id] = submit_ts
 
@@ -333,6 +336,7 @@ async def generate(
                 metrics,
                 final_stage_id_for_e2e,
                 req_start_ts,
+                req_submit_prep_ms,
                 wall_start_ts,
             ):
                 yield output
@@ -479,6 +483,7 @@ async def _process_orchestrator_results(
         metrics: OrchestratorMetrics,
         final_stage_id_for_e2e: int,
         req_start_ts: dict[str, float],
+        req_submit_prep_ms: dict[str, float],
         wall_start_ts: float,
     ) -> AsyncGenerator[OmniRequestOutput, None]:
         """Read results from the Orchestrator (via the request's asyncio.Queue)
@@ -520,6 +525,7 @@ async def _process_orchestrator_results(
                 stage_id,
                 metrics,
                 req_start_ts,
+                req_submit_prep_ms,
                 wall_start_ts,
                 final_stage_id_for_e2e,
             )
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index 223c208af98..e31a9da6726 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -114,10 +114,12 @@ def _run_generation(
 
             request_ids = [f"{i}_{uuid.uuid4()}" for i in range(len(request_prompts))]
             req_start_ts: dict[str, float] = {}
+            req_submit_prep_ms: dict[str, float] = {}
             wall_start_ts = time.time()
             req_final_stage_ids: dict[str, int] = {}
 
             for req_id, prompt in zip(request_ids, request_prompts):
+                request_prep_start_ts = time.time()
                 prompt_modalities = prompt.get("modalities", None) if isinstance(prompt, dict) else None
                 final_stage_id = self._compute_final_stage_id(prompt_modalities)
                 req_final_stage_ids[req_id] = final_stage_id
@@ -146,6 +148,7 @@ def _run_generation(
                     final_stage_id=final_stage_id,
                 )
                 submit_ts = time.time()
+                req_submit_prep_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0
                 req_state.metrics.stage_first_ts[0] = submit_ts
                 req_start_ts[req_id] = submit_ts
 
@@ -175,6 +178,7 @@ def _run_generation(
                     stage_id=stage_id,
                     metrics=req_state.metrics,
                     req_start_ts=req_start_ts,
+                    req_submit_prep_ms=req_submit_prep_ms,
                     wall_start_ts=wall_start_ts,
                     final_stage_id_for_e2e=req_final_stage_ids[req_id],
                 )
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 51b816fb796..b27dc3ece9e 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -368,6 +368,7 @@ def _process_single_result(
         stage_id: int,
         metrics: OrchestratorMetrics,
         req_start_ts: dict[str, float],
+        req_submit_prep_ms: dict[str, float],
         wall_start_ts: float,
         final_stage_id_for_e2e: int,
     ) -> OmniRequestOutput | None:
@@ -398,6 +399,7 @@ def _process_single_result(
                     stage_id,
                     req_id,
                     req_start_ts.get(req_id, wall_start_ts),
+                    req_submit_prep_ms=req_submit_prep_ms.get(req_id, 0.0),
                 )
         except Exception:
             logger.exception("[%s] Finalize request handling error", self.__class__.__name__)
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index 46fec6fcc1f..7be2bbf21c4 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -74,14 +74,15 @@ def total_time_ms(self) -> float:
 @dataclass
 class RequestE2EStats:
     request_id: str
-    e2e_total_ms: float
+    request_latency_ms: float
+    request_submit_prep_ms: float
     e2e_total_tokens: int
     transfers_total_time_ms: float
     transfers_total_bytes: int
 
     @property
     def e2e_tpt(self) -> float:
-        return (self.e2e_total_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0
+        return (self.request_latency_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0
 
 
 # === Field Configuration ===
@@ -133,7 +134,8 @@ def __init__(
     def init_run_state(self, wall_start_ts: float) -> None:
         # Per-run aggregates and timing state
         self.stage_total_tokens = [0 for _ in range(self.num_stages)]
-        self.e2e_total_ms = 0.0
+        self.request_latency_total_ms = 0.0
+        self.request_submit_prep_total_ms = 0.0
         self.e2e_total_tokens = 0
         self.e2e_count = 0
         self.e2e_done = set()
@@ -423,6 +425,7 @@ def on_finalize_request(
         stage_id: int,
         req_id: Any,
         req_start_ts: float,
+        request_submit_prep_ms: float = 0.0,
     ) -> None:
         rid_key = str(req_id)
         if rid_key in self.e2e_done:
@@ -444,13 +447,15 @@ def on_finalize_request(
                     total_tokens += int(evt.num_tokens_in)
                 total_tokens += int(evt.num_tokens_out)
 
-        self.e2e_total_ms += e2e_ms
+        self.request_latency_total_ms += e2e_ms
+        self.request_submit_prep_total_ms += float(request_submit_prep_ms)
         self.e2e_total_tokens += total_tokens
         self.e2e_count += 1
         self.e2e_done.add(rid_key)
         per_req_record = RequestE2EStats(
             request_id=rid_key,
-            e2e_total_ms=e2e_ms,
+            request_latency_ms=e2e_ms,
+            request_submit_prep_ms=float(request_submit_prep_ms),
             e2e_total_tokens=total_tokens,
             transfers_total_time_ms=float(
                 sum(evt.total_time_ms for evt in self.transfer_events.values() if evt.request_id == rid_key)
@@ -482,9 +487,14 @@ def build_and_log_summary(self) -> dict[str, Any]:
 
         overall_summary = {
             "e2e_requests": int(self.e2e_count),
-            "e2e_wall_time_ms": float(wall_time_ms),
+            "run_wall_time_ms": float(wall_time_ms),
+            "request_latency_total_ms": float(self.request_latency_total_ms),
+            "request_submit_prep_total_ms": float(self.request_submit_prep_total_ms),
             "e2e_total_tokens": int(self.e2e_total_tokens),
-            "e2e_avg_time_per_request_ms": float(e2e_avg_req),
+            "avg_run_wall_time_per_request_ms": float(e2e_avg_req),
+            "avg_request_submit_prep_ms": float(
+                self.request_submit_prep_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
+            ),
             "e2e_avg_tokens_per_s": float(e2e_avg_tok),
         }
         # Add stage_wall_time_ms as separate fields for each stage

From 31f9e4c2c826545c41cc231c1be20743d76e8b0e Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Thu, 23 Apr 2026 19:05:55 +0800
Subject: [PATCH 02/11] Improve Omni metrics timing visibility

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 docs/contributing/metrics.md                |  40 ++++---
 tests/entrypoints/test_async_omni_abort.py  |  12 +-
 tests/entrypoints/test_pd_disaggregation.py |  19 ++-
 tests/metrics/test_stats.py                 |  15 ++-
 vllm_omni/engine/async_omni_engine.py       |  15 ++-
 vllm_omni/engine/stage_init_utils.py        |   1 +
 vllm_omni/entrypoints/async_omni.py         |  26 ++--
 vllm_omni/entrypoints/omni.py               |  16 ++-
 vllm_omni/entrypoints/omni_base.py          |  10 +-
 vllm_omni/metrics/stats.py                  | 124 ++++++++++++++++----
 10 files changed, 209 insertions(+), 69 deletions(-)

diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md
index b021d05b445..fa148366e66 100644
--- a/docs/contributing/metrics.md
+++ b/docs/contributing/metrics.md
@@ -3,7 +3,7 @@
 
 You can use these metrics in production to monitor the health and performance of the vLLM-omni system. Typical scenarios include:
 
-- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `request_latency_ms`), and resource utilization to verify that the system meets expected standards.
+- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `request_wall_time_ms` / `engine_pipeline_time_ms`), and resource utilization to verify that the system meets expected standards.
 
 - **Debugging and Troubleshooting**: Use detailed per-request metrics to diagnose issues, such as high transfer times or unexpected token counts.
 
@@ -25,16 +25,23 @@ python openai_chat_completion_client_for_multimodal_generation.py --query-type u
 
 With `--log-stats` enabled, the server will output detailed metrics logs after each request. Example output:
 
+For multi-stage pipelines, vLLM-Omni also emits a concise per-request timing line:
+
+```text
+[OmniTiming] req=chatcmpl-a0edd05 total=32.99s preprocess=2.80s engine=30.19s stages=[0:ar=16.23s,1:diffusion=13.96s] transfers=[0->1=0.780ms]
+```
+
 
 #### Overall Summary
 
 | Field                       | Value        |
 |-----------------------------|--------------|
 | e2e_requests                | 1            |
-| run_wall_time_ms            | 41,299.190   |
-| request_latency_total_ms    | 41,299.133   |
+| request_wall_time_ms        | 41,299.190   |
+| input_preprocess_time_ms    |     57.000   |
+| engine_pipeline_time_ms     | 41,299.133   |
 | e2e_total_tokens            | 5,202        |
-| avg_run_wall_time_per_request_ms | 41,299.190   |
+| avg_request_wall_time_ms    | 41,299.190   |
 | e2e_avg_tokens_per_s        | 125.959      |
 | e2e_stage_0_wall_time_ms    | 10,192.289   |
 | e2e_stage_1_wall_time_ms    | 30,541.409   |
@@ -44,8 +51,9 @@ With `--log-stats` enabled, the server will output detailed metrics logs after e
 
 | Field                   | Value      |
 |-------------------------|------------|
-| request_latency_ms      | 41,299.133 |
-| request_submit_prep_ms  | 0.000      |
+| request_wall_time_ms    | 41,356.133 |
+| input_preprocess_time_ms |    57.000 |
+| engine_pipeline_time_ms | 41,299.133 |
 | e2e_total_tokens        | 5,202      |
 | transfers_total_time_ms | 245.895    |
 | transfers_total_kbytes  | 138,089.939|
@@ -100,13 +108,15 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 | Field                     | Meaning                                                                                       |
 |---------------------------|----------------------------------------------------------------------------------------------|
 | `e2e_requests`            | Number of completed requests.                                                                |
-| `run_wall_time_ms`        | Wall-clock time span from run start to last completion, in ms.                               |
-| `request_latency_total_ms` | Sum of per-request `request_latency_ms` across completed requests.                           |
-| `request_submit_prep_total_ms` | Sum of pre-submit request preparation time across completed requests.                    |
+| `request_wall_time_ms`        | Wall-clock time span from request preparation start to final completion, in ms.          |
+| `input_preprocess_time_ms` | Time spent preparing and submitting requests before the engine pipeline starts.             |
+| `build_add_request_message_time_ms` | Time spent inside `AsyncOmniEngine._build_add_request_message()`.                  |
+| `engine_pipeline_time_ms` | Time from engine request submission to final completion.                                     |
 | `e2e_total_tokens`        | Total tokens counted across all completed requests (stage0 input + all stage outputs).       |
-| `avg_run_wall_time_per_request_ms` | Average wall time per request: `run_wall_time_ms / e2e_requests`.                 |
-| `avg_request_submit_prep_ms` | Average pre-submit request preparation time per completed request.                        |
-| `e2e_avg_tokens_per_s`    | Average token throughput over wall time: `e2e_total_tokens * 1000 / run_wall_time_ms`.      |
+| `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / e2e_requests`.                     |
+| `avg_input_preprocess_time_ms` | Average pre-submit request preparation time per completed request.                    |
+| `avg_engine_pipeline_time_ms` | Average engine pipeline time per completed request.                                      |
+| `e2e_avg_tokens_per_s`    | Average token throughput over wall time: `e2e_total_tokens * 1000 / request_wall_time_ms`.  |
 | `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. |
 
 ---
@@ -115,8 +125,10 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 
 | Field                     | Meaning                                                               |
 |---------------------------|-----------------------------------------------------------------------|
-| `request_latency_ms`      | End-to-end latency in ms, starting after request preparation and ending at final completion. |
-| `request_submit_prep_ms`  | Time spent preparing and submitting the request before `request_latency_ms` starts.          |
+| `request_wall_time_ms`    | End-to-end latency in ms, including input preprocessing and engine pipeline time.            |
+| `input_preprocess_time_ms` | Time spent preparing and submitting the request before `engine_pipeline_time_ms` starts.    |
+| `build_add_request_message_time_ms` | Time spent inside `_build_add_request_message()` for this request.                 |
+| `engine_pipeline_time_ms` | Time from engine request submission to final completion.                                     |
 | `e2e_total_tokens`        | Total tokens for the request (stage0 input + all stage outputs).      |
 | `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request.                |
 | `transfers_total_kbytes`  | Sum of transfer kbytes for this request.                              |
diff --git a/tests/entrypoints/test_async_omni_abort.py b/tests/entrypoints/test_async_omni_abort.py
index 6ee8a6ef112..f7144086689 100644
--- a/tests/entrypoints/test_async_omni_abort.py
+++ b/tests/entrypoints/test_async_omni_abort.py
@@ -25,10 +25,18 @@ async def fake_process_results(
             metrics,
             final_stage_id_for_e2e,
             req_start_ts,
-            req_submit_prep_ms,
+            input_preprocess_time_ms,
+            build_add_request_message_time_ms,
             wall_start_ts,
         ):
-            del metrics, final_stage_id_for_e2e, req_start_ts, req_submit_prep_ms, wall_start_ts
+            del (
+                metrics,
+                final_stage_id_for_e2e,
+                req_start_ts,
+                input_preprocess_time_ms,
+                build_add_request_message_time_ms,
+                wall_start_ts,
+            )
             if request_id.startswith("cancel-"):
                 await asyncio.Future()
                 return
diff --git a/tests/entrypoints/test_pd_disaggregation.py b/tests/entrypoints/test_pd_disaggregation.py
index 6deb12b0c4a..4b220fbcb8c 100644
--- a/tests/entrypoints/test_pd_disaggregation.py
+++ b/tests/entrypoints/test_pd_disaggregation.py
@@ -246,7 +246,8 @@ def _setup_ipc_mocks(monkeypatch):
 
 def _setup_log_mocks(monkeypatch):
     class _FakeOrchestratorAggregator:
-        def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e2e=None):
+        def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e2e=None, **kwargs):
+            del kwargs
             self.num_stages = num_stages
             self.enable_stats = enable_stats
             self.stage_first_ts = [None] * num_stages
@@ -255,14 +256,22 @@ def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e
             self.accumulated_gen_time_ms = {}
             self.e2e_done = set()
             self.e2e_count = 0
-            self.request_latency_total_ms = 0.0
-            self.request_submit_prep_total_ms = 0.0
+            self.engine_pipeline_total_ms = 0.0
+            self.input_preprocess_total_ms = 0.0
+            self.build_add_request_message_total_ms = 0.0
 
         def on_stage_metrics(self, stage_id, req_id, metrics, final_output_type=None):
             pass
 
-        def on_finalize_request(self, stage_id, req_id, start_ts, request_submit_prep_ms=0.0):
-            del start_ts, request_submit_prep_ms
+        def on_finalize_request(
+            self,
+            stage_id,
+            req_id,
+            start_ts,
+            input_preprocess_time_ms=0.0,
+            build_add_request_message_time_ms=0.0,
+        ):
+            del start_ts, input_preprocess_time_ms, build_add_request_message_time_ms
             self.e2e_done.add(req_id)
 
         def on_forward(self, from_stage, to_stage, req_id, size_bytes, tx_ms, use_shm):
diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py
index 6c38bffd0e2..aa81b09421f 100644
--- a/tests/metrics/test_stats.py
+++ b/tests/metrics/test_stats.py
@@ -59,8 +59,8 @@ def test_orchestrator_aggregator_builds_summary() -> None:
     summary = agg.build_and_log_summary()
     overall = summary["overall_summary"]
     assert overall["e2e_requests"] == 1
-    assert overall["run_wall_time_ms"] == 70.0
-    assert overall["request_submit_prep_total_ms"] == 0.0
+    assert overall["request_wall_time_ms"] == 70.0
+    assert overall["input_preprocess_time_ms"] == 0.0
 
     stage_entry = _get_request_entry(summary["stage_table"], "r1")
     stage_ids = [row["stage_id"] for row in stage_entry["stages"]]
@@ -79,8 +79,10 @@ def test_build_and_log_summary_e2e_only() -> None:
     agg.e2e_events.append(
         RequestE2EStats(
             request_id="r",
-            request_latency_ms=10.0,
-            request_submit_prep_ms=0.0,
+            request_wall_time_ms=10.0,
+            input_preprocess_time_ms=0.0,
+            build_add_request_message_time_ms=0.0,
+            engine_pipeline_time_ms=10.0,
             e2e_total_tokens=5,
             transfers_total_time_ms=0.0,
             transfers_total_bytes=0,
@@ -89,7 +91,8 @@ def test_build_and_log_summary_e2e_only() -> None:
 
     summary = agg.build_and_log_summary()
     e2e_entry = _get_request_entry(summary["e2e_table"], "r")
-    assert e2e_entry["request_latency_ms"] == 10.0
+    assert e2e_entry["request_wall_time_ms"] == 10.0
+    assert e2e_entry["engine_pipeline_time_ms"] == 10.0
     assert e2e_entry["e2e_total_tokens"] == 5
     stage_entry = _get_request_entry(summary["stage_table"], "r")
     assert stage_entry["stages"] == []
@@ -154,7 +157,7 @@ def test_build_and_log_summary_multiple_requests() -> None:
     summary = agg.build_and_log_summary()
     assert len(summary["stage_table"]) == 2
     assert {entry["request_id"] for entry in summary["e2e_table"]} == {"r1", "r2"}
-    assert summary["overall_summary"]["request_latency_total_ms"] > 0.0
+    assert summary["overall_summary"]["engine_pipeline_time_ms"] > 0.0
     # Check that r1 has two stages and r2 has one
     r1_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r1")
     r2_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r2")
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index cea726054c9..bd289ca6b1b 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -1485,7 +1485,7 @@ def add_request(
         reasoning_ended: bool | None = None,
         *,
         resumable: bool = False,
-    ) -> None:
+    ) -> dict[str, float]:
         """Process stage-0 input locally, then send to the Orchestrator.
 
         Input processing and output
@@ -1510,11 +1510,6 @@ def add_request(
             resumable=resumable,
         )
         build_msg_ms = (time.perf_counter() - build_msg_t0) * 1000.0
-        logger.info(
-            "[AsyncOmniEngine] req=%s _build_add_request_message took %.3f ms",
-            request_id,
-            build_msg_ms,
-        )
         if self.request_queue is None:
             raise RuntimeError("request_queue is not initialized")
         self.request_queue.sync_q.put_nowait(msg)
@@ -1527,6 +1522,10 @@ def add_request(
             stage0_params = effective_spl[0] if effective_spl else None
             if stage0_params is not None:
                 self._enqueue_cfg_companions(request_id, original_prompt, stage0_params, effective_spl)
+        return {
+            "input_preprocess_time_ms": build_msg_ms,
+            "build_add_request_message_time_ms": build_msg_ms,
+        }
 
     async def add_request_async(
         self,
@@ -1544,9 +1543,9 @@ async def add_request_async(
         reasoning_ended: bool | None = None,
         *,
         resumable: bool = False,
-    ) -> None:
+    ) -> dict[str, float]:
         """Async add_request API."""
-        self.add_request(
+        return self.add_request(
             request_id=request_id,
             prompt=prompt,
             prompt_text=prompt_text,
diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py
index 9fdcd6216ee..2249efc57f4 100644
--- a/vllm_omni/engine/stage_init_utils.py
+++ b/vllm_omni/engine/stage_init_utils.py
@@ -765,6 +765,7 @@ def finalize_initialized_stages(
             "final_output": stage_client.final_output,
             "final_output_type": stage_client.final_output_type,
             "stage_type": stage_client.stage_type,
+            "model_stage": getattr(stage_client, "model_stage", None),
         }
         for stage_client in initialized_stage_clients
     ]
diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py
index ea0e56b6594..2641e8437b3 100644
--- a/vllm_omni/entrypoints/async_omni.py
+++ b/vllm_omni/entrypoints/async_omni.py
@@ -284,7 +284,8 @@ async def generate(
             wall_start_ts = time.time()
             request_prep_start_ts = wall_start_ts
             req_start_ts: dict[str, float] = {}
-            req_submit_prep_ms: dict[str, float] = {}
+            input_preprocess_time_ms: dict[str, float] = {}
+            build_add_request_message_time_ms: dict[str, float] = {}
 
             # Determine the final stage for E2E stats
             final_stage_id_for_e2e = self._compute_final_stage_id(output_modalities)
@@ -294,6 +295,7 @@ async def generate(
                 self.log_stats,
                 wall_start_ts,
                 final_stage_id_for_e2e,
+                stage_metadata=getattr(self.engine, "stage_metadata", None),
             )
             req_state = ClientRequestState(request_id)
             req_state.metrics = metrics
@@ -315,15 +317,22 @@ async def generate(
                     sampling_params_list=req_sp_list,
                     final_stage_id=final_stage_id_for_e2e,
                 )
+                add_request_timings = {}
             else:
-                await self.engine.add_request_async(
+                add_request_timings = await self.engine.add_request_async(
                     request_id=request_id,
                     prompt=prompt,
                     sampling_params_list=req_sp_list,
                     final_stage_id=final_stage_id_for_e2e,
-                )
+                ) or {}
             submit_ts = time.time()
-            req_submit_prep_ms[request_id] = (submit_ts - request_prep_start_ts) * 1000.0
+            input_preprocess_time_ms[request_id] = (submit_ts - request_prep_start_ts) * 1000.0
+            build_add_request_message_time_ms[request_id] = float(
+                add_request_timings.get(
+                    "build_add_request_message_time_ms",
+                    input_preprocess_time_ms[request_id],
+                )
+            )
             req_state.metrics.stage_first_ts[0] = submit_ts
             req_start_ts[request_id] = submit_ts
 
@@ -336,7 +345,8 @@ async def generate(
                 metrics,
                 final_stage_id_for_e2e,
                 req_start_ts,
-                req_submit_prep_ms,
+                input_preprocess_time_ms,
+                build_add_request_message_time_ms,
                 wall_start_ts,
             ):
                 yield output
@@ -483,7 +493,8 @@ async def _process_orchestrator_results(
         metrics: OrchestratorMetrics,
         final_stage_id_for_e2e: int,
         req_start_ts: dict[str, float],
-        req_submit_prep_ms: dict[str, float],
+        input_preprocess_time_ms: dict[str, float],
+        build_add_request_message_time_ms: dict[str, float],
         wall_start_ts: float,
     ) -> AsyncGenerator[OmniRequestOutput, None]:
         """Read results from the Orchestrator (via the request's asyncio.Queue)
@@ -525,7 +536,8 @@ async def _process_orchestrator_results(
                 stage_id,
                 metrics,
                 req_start_ts,
-                req_submit_prep_ms,
+                input_preprocess_time_ms,
+                build_add_request_message_time_ms,
                 wall_start_ts,
                 final_stage_id_for_e2e,
             )
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index e31a9da6726..7432b9cabae 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -114,7 +114,8 @@ def _run_generation(
 
             request_ids = [f"{i}_{uuid.uuid4()}" for i in range(len(request_prompts))]
             req_start_ts: dict[str, float] = {}
-            req_submit_prep_ms: dict[str, float] = {}
+            input_preprocess_time_ms: dict[str, float] = {}
+            build_add_request_message_time_ms: dict[str, float] = {}
             wall_start_ts = time.time()
             req_final_stage_ids: dict[str, int] = {}
 
@@ -129,6 +130,7 @@ def _run_generation(
                     self.log_stats,
                     wall_start_ts,
                     final_stage_id,
+                    stage_metadata=getattr(self.engine, "stage_metadata", None),
                 )
                 req_state = ClientRequestState(req_id)
                 req_state.metrics = metrics
@@ -141,14 +143,17 @@ def _run_generation(
                     p_id = pd_pair[0]
                     req_sp_list[p_id] = self._prepare_prefill_sampling_params(req_id, req_sp_list[p_id])
 
-                self.engine.add_request(
+                add_request_timings = self.engine.add_request(
                     request_id=req_id,
                     prompt=prompt,
                     sampling_params_list=req_sp_list,
                     final_stage_id=final_stage_id,
-                )
+                ) or {}
                 submit_ts = time.time()
-                req_submit_prep_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0
+                input_preprocess_time_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0
+                build_add_request_message_time_ms[req_id] = float(
+                    add_request_timings.get("build_add_request_message_time_ms", input_preprocess_time_ms[req_id])
+                )
                 req_state.metrics.stage_first_ts[0] = submit_ts
                 req_start_ts[req_id] = submit_ts
 
@@ -178,7 +183,8 @@ def _run_generation(
                     stage_id=stage_id,
                     metrics=req_state.metrics,
                     req_start_ts=req_start_ts,
-                    req_submit_prep_ms=req_submit_prep_ms,
+                    input_preprocess_time_ms=input_preprocess_time_ms,
+                    build_add_request_message_time_ms=build_add_request_message_time_ms,
                     wall_start_ts=wall_start_ts,
                     final_stage_id_for_e2e=req_final_stage_ids[req_id],
                 )
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index b27dc3ece9e..2d25860ef1e 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -7,6 +7,7 @@
 import types
 import weakref
 from collections.abc import Sequence
+from pprint import pformat
 from typing import TYPE_CHECKING, Any, Literal
 
 import huggingface_hub
@@ -249,6 +250,9 @@ def _log_summary_and_cleanup(self, request_id: str) -> None:
         try:
             if req_state is None or req_state.metrics is None:
                 return
+            summary = req_state.metrics.build_and_log_summary()
+            if summary:
+                logger.debug("[Summary] %s", pformat(summary, sort_dicts=False))
         except Exception:
             logger.exception(
                 "[%s] Failed to build/log summary for req=%s",
@@ -368,7 +372,8 @@ def _process_single_result(
         stage_id: int,
         metrics: OrchestratorMetrics,
         req_start_ts: dict[str, float],
-        req_submit_prep_ms: dict[str, float],
+        input_preprocess_time_ms: dict[str, float],
+        build_add_request_message_time_ms: dict[str, float],
         wall_start_ts: float,
         final_stage_id_for_e2e: int,
     ) -> OmniRequestOutput | None:
@@ -399,7 +404,8 @@ def _process_single_result(
                     stage_id,
                     req_id,
                     req_start_ts.get(req_id, wall_start_ts),
-                    req_submit_prep_ms=req_submit_prep_ms.get(req_id, 0.0),
+                    input_preprocess_time_ms=input_preprocess_time_ms.get(req_id, 0.0),
+                    build_add_request_message_time_ms=build_add_request_message_time_ms.get(req_id, 0.0),
                 )
         except Exception:
             logger.exception("[%s] Finalize request handling error", self.__class__.__name__)
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index 7be2bbf21c4..4b22670ad07 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -36,6 +36,8 @@ class StageRequestStats:
     rx_in_flight_time_ms: float
     stage_stats: StageStats
     stage_id: int | None = None
+    stage_name: str | None = None
+    stage_type: str | None = None
     final_output_type: str | None = None
     request_id: str | None = None
     postprocess_time_ms: float = 0.0
@@ -74,15 +76,17 @@ def total_time_ms(self) -> float:
 @dataclass
 class RequestE2EStats:
     request_id: str
-    request_latency_ms: float
-    request_submit_prep_ms: float
+    request_wall_time_ms: float
+    input_preprocess_time_ms: float
+    build_add_request_message_time_ms: float
+    engine_pipeline_time_ms: float
     e2e_total_tokens: int
     transfers_total_time_ms: float
     transfers_total_bytes: int
 
     @property
     def e2e_tpt(self) -> float:
-        return (self.request_latency_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0
+        return (self.engine_pipeline_time_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0
 
 
 # === Field Configuration ===
@@ -120,10 +124,12 @@ def __init__(
         log_stats: bool,
         wall_start_ts: float,
         final_stage_id_for_e2e: dict[str, int] | int,
+        stage_metadata: list[dict[str, Any]] | None = None,
     ) -> None:
         self.num_stages = int(num_stages)
         self.log_stats = bool(log_stats)
         self.final_stage_id_for_e2e = final_stage_id_for_e2e
+        self.stage_metadata = list(stage_metadata or [])
         self.init_run_state(wall_start_ts)
         self.stage_events: dict[str, list[StageRequestStats]] = {}
         self.transfer_events: dict[
@@ -134,8 +140,9 @@ def __init__(
     def init_run_state(self, wall_start_ts: float) -> None:
         # Per-run aggregates and timing state
         self.stage_total_tokens = [0 for _ in range(self.num_stages)]
-        self.request_latency_total_ms = 0.0
-        self.request_submit_prep_total_ms = 0.0
+        self.engine_pipeline_total_ms = 0.0
+        self.input_preprocess_total_ms = 0.0
+        self.build_add_request_message_total_ms = 0.0
         self.e2e_total_tokens = 0
         self.e2e_count = 0
         self.e2e_done = set()
@@ -150,6 +157,62 @@ def init_run_state(self, wall_start_ts: float) -> None:
             lambda: defaultdict(float)
         )  # {request_id: {diffusion_metrics_key: accumulated_metrics_data}}
 
+    def _get_stage_metadata(self, stage_id: int | None) -> dict[str, Any]:
+        if stage_id is None or stage_id < 0 or stage_id >= len(self.stage_metadata):
+            return {}
+        meta = self.stage_metadata[stage_id]
+        return dict(meta) if isinstance(meta, dict) else {}
+
+    @staticmethod
+    def _get_stage_type(stage_meta: dict[str, Any]) -> str:
+        return str(stage_meta.get("stage_type") or "unknown")
+
+    @classmethod
+    def _get_stage_name(cls, stage_id: int, stage_meta: dict[str, Any]) -> str:
+        stage_type = cls._get_stage_type(stage_meta)
+        model_stage = stage_meta.get("model_stage")
+        if model_stage:
+            return str(model_stage)
+        if stage_type == "diffusion":
+            return "diffusion"
+        return f"stage_{stage_id}"
+
+    def _stage_label(self, stage_id: int | None) -> str:
+        if stage_id is None:
+            return "unknown"
+        stage_meta = self._get_stage_metadata(stage_id)
+        return f"{stage_id}:{self._get_stage_name(stage_id, stage_meta)}"
+
+    def _format_seconds(self, ms: float) -> str:
+        return f"{ms / 1000.0:.2f}s"
+
+    def _log_omni_timing(self, evt: RequestE2EStats) -> None:
+        rid = evt.request_id
+        stages = []
+        for stage_evt in sorted(
+            self.stage_events.get(rid, []),
+            key=lambda e: e.stage_id if e.stage_id is not None else -1,
+        ):
+            stages.append(
+                f"{self._stage_label(stage_evt.stage_id)}={self._format_seconds(stage_evt.stage_gen_time_ms)}"
+            )
+        transfers = []
+        for transfer_evt in sorted(
+            [e for e in self.transfer_events.values() if e.request_id == rid],
+            key=lambda e: (e.from_stage, e.to_stage),
+        ):
+            transfers.append(f"{transfer_evt.from_stage}->{transfer_evt.to_stage}={transfer_evt.total_time_ms:.3f}ms")
+
+        logger.info(
+            "[OmniTiming] req=%s total=%s preprocess=%s engine=%s stages=[%s] transfers=[%s]",
+            rid,
+            self._format_seconds(evt.request_wall_time_ms),
+            self._format_seconds(evt.input_preprocess_time_ms),
+            self._format_seconds(evt.engine_pipeline_time_ms),
+            ",".join(stages),
+            ",".join(transfers),
+        )
+
     def _get_or_create_transfer_event(
         self,
         from_stage: int,
@@ -328,6 +391,9 @@ def _as_stage_request_stats(
         "Convert dict to StageRequestStats if needed."
         stats = metrics
         stats.stage_id = stage_id
+        stage_meta = self._get_stage_metadata(stage_id)
+        stats.stage_type = self._get_stage_type(stage_meta)
+        stats.stage_name = self._get_stage_name(stage_id, stage_meta)
         stats.request_id = req_id
         stats.final_output_type = final_output_type
         stats.diffusion_metrics = (
@@ -425,7 +491,8 @@ def on_finalize_request(
         stage_id: int,
         req_id: Any,
         req_start_ts: float,
-        request_submit_prep_ms: float = 0.0,
+        input_preprocess_time_ms: float = 0.0,
+        build_add_request_message_time_ms: float = 0.0,
     ) -> None:
         rid_key = str(req_id)
         if rid_key in self.e2e_done:
@@ -436,7 +503,10 @@ def on_finalize_request(
         prev_last = self.stage_last_ts[stage_id]
         self.stage_last_ts[stage_id] = _t1 if prev_last is None else max(prev_last, _t1)
         self.last_finish_ts = max(self.last_finish_ts, _t1)
-        e2e_ms = (_t1 - _t0) * 1000.0
+        engine_pipeline_ms = (_t1 - _t0) * 1000.0
+        input_preprocess_ms = float(input_preprocess_time_ms)
+        build_msg_ms = float(build_add_request_message_time_ms)
+        request_wall_ms = engine_pipeline_ms + input_preprocess_ms
 
         # Sum tokens from all stages for this request
         # Include input tokens from stage 0 + output tokens from all stages
@@ -447,15 +517,18 @@ def on_finalize_request(
                     total_tokens += int(evt.num_tokens_in)
                 total_tokens += int(evt.num_tokens_out)
 
-        self.request_latency_total_ms += e2e_ms
-        self.request_submit_prep_total_ms += float(request_submit_prep_ms)
+        self.engine_pipeline_total_ms += engine_pipeline_ms
+        self.input_preprocess_total_ms += input_preprocess_ms
+        self.build_add_request_message_total_ms += build_msg_ms
         self.e2e_total_tokens += total_tokens
         self.e2e_count += 1
         self.e2e_done.add(rid_key)
         per_req_record = RequestE2EStats(
             request_id=rid_key,
-            request_latency_ms=e2e_ms,
-            request_submit_prep_ms=float(request_submit_prep_ms),
+            request_wall_time_ms=request_wall_ms,
+            input_preprocess_time_ms=input_preprocess_ms,
+            build_add_request_message_time_ms=build_msg_ms,
+            engine_pipeline_time_ms=engine_pipeline_ms,
             e2e_total_tokens=total_tokens,
             transfers_total_time_ms=float(
                 sum(evt.total_time_ms for evt in self.transfer_events.values() if evt.request_id == rid_key)
@@ -466,6 +539,9 @@ def on_finalize_request(
         )
         self.e2e_events.append(per_req_record)
 
+        if self.num_stages > 1:
+            self._log_omni_timing(per_req_record)
+
     def build_and_log_summary(self) -> dict[str, Any]:
         if not self.log_stats:
             return {}
@@ -487,13 +563,17 @@ def build_and_log_summary(self) -> dict[str, Any]:
 
         overall_summary = {
             "e2e_requests": int(self.e2e_count),
-            "run_wall_time_ms": float(wall_time_ms),
-            "request_latency_total_ms": float(self.request_latency_total_ms),
-            "request_submit_prep_total_ms": float(self.request_submit_prep_total_ms),
+            "request_wall_time_ms": float(wall_time_ms),
+            "input_preprocess_time_ms": float(self.input_preprocess_total_ms),
+            "build_add_request_message_time_ms": float(self.build_add_request_message_total_ms),
+            "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms),
             "e2e_total_tokens": int(self.e2e_total_tokens),
-            "avg_run_wall_time_per_request_ms": float(e2e_avg_req),
-            "avg_request_submit_prep_ms": float(
-                self.request_submit_prep_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
+            "avg_request_wall_time_ms": float(e2e_avg_req),
+            "avg_input_preprocess_time_ms": float(
+                self.input_preprocess_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
+            ),
+            "avg_engine_pipeline_time_ms": float(
+                self.engine_pipeline_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
             ),
             "e2e_avg_tokens_per_s": float(e2e_avg_tok),
         }
@@ -561,7 +641,11 @@ def build_and_log_summary(self) -> dict[str, Any]:
             # then remove diffusion_metrics from the table
             stage_rows = []
             for evt in stage_evts:
-                row = {"stage_id": evt.stage_id, **_build_row(evt, local_stage_fields)}
+                row = {
+                    "stage": self._stage_label(evt.stage_id),
+                    "stage_id": evt.stage_id,
+                    **_build_row(evt, local_stage_fields),
+                }
                 if evt.diffusion_metrics:
                     row.update(evt.diffusion_metrics)
                 row.pop("diffusion_metrics", None)  # Remove the dict itself
@@ -574,7 +658,7 @@ def build_and_log_summary(self) -> dict[str, Any]:
                 all_value_fields = set()
                 for row in stage_rows:
                     for k in row.keys():
-                        if k != "stage_id":
+                        if k not in ("stage", "stage_id"):
                             all_value_fields.add(k)
                 value_fields_list = []
                 for field in sorted(all_value_fields):
@@ -593,7 +677,7 @@ def build_and_log_summary(self) -> dict[str, Any]:
                         _format_table(
                             f"StageRequestStats [request_id={rid}]",
                             stage_rows,
-                            column_key="stage_id",
+                            column_key="stage",
                             value_fields=value_fields_list,
                         ),
                     )

From f366015ccb43852f1f08d59bf6dd7710e9c49c25 Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Thu, 23 Apr 2026 20:13:38 +0800
Subject: [PATCH 03/11] Refine Omni timing metrics

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 docs/contributing/metrics.md                |  3 +-
 tests/entrypoints/test_async_omni_abort.py  |  2 -
 tests/entrypoints/test_pd_disaggregation.py | 12 +----
 tests/metrics/test_stats.py                 |  3 +-
 vllm_omni/engine/async_omni_engine.py       | 12 ++---
 vllm_omni/engine/cfg_companion_tracker.py   |  3 +-
 vllm_omni/engine/orchestrator.py            | 37 ++++++++--------
 vllm_omni/entrypoints/async_omni.py         | 15 +------
 vllm_omni/entrypoints/omni.py               |  9 +---
 vllm_omni/entrypoints/omni_base.py          |  2 -
 vllm_omni/metrics/stats.py                  | 49 ++++++++-------------
 11 files changed, 51 insertions(+), 96 deletions(-)

diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md
index fa148366e66..f1fc37cbddf 100644
--- a/docs/contributing/metrics.md
+++ b/docs/contributing/metrics.md
@@ -110,7 +110,6 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 | `e2e_requests`            | Number of completed requests.                                                                |
 | `request_wall_time_ms`        | Wall-clock time span from request preparation start to final completion, in ms.          |
 | `input_preprocess_time_ms` | Time spent preparing and submitting requests before the engine pipeline starts.             |
-| `build_add_request_message_time_ms` | Time spent inside `AsyncOmniEngine._build_add_request_message()`.                  |
 | `engine_pipeline_time_ms` | Time from engine request submission to final completion.                                     |
 | `e2e_total_tokens`        | Total tokens counted across all completed requests (stage0 input + all stage outputs).       |
 | `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / e2e_requests`.                     |
@@ -127,7 +126,6 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 |---------------------------|-----------------------------------------------------------------------|
 | `request_wall_time_ms`    | End-to-end latency in ms, including input preprocessing and engine pipeline time.            |
 | `input_preprocess_time_ms` | Time spent preparing and submitting the request before `engine_pipeline_time_ms` starts.    |
-| `build_add_request_message_time_ms` | Time spent inside `_build_add_request_message()` for this request.                 |
 | `engine_pipeline_time_ms` | Time from engine request submission to final completion.                                     |
 | `e2e_total_tokens`        | Total tokens for the request (stage0 input + all stage outputs).      |
 | `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request.                |
@@ -148,6 +146,7 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 | `image_num`               | Number of images generated (for diffusion/image stages).                                        |
 | `resolution`              | Image resolution (for diffusion/image stages).                                                                  |
 | `postprocess_time_ms` | Diffusion/image: post-processing time in ms.                                                    |
+| `ar2diffusion_time_ms` | Time spent converting AR-stage output into the diffusion-stage prompt before diffusion request submission. |
 
 ---
 
diff --git a/tests/entrypoints/test_async_omni_abort.py b/tests/entrypoints/test_async_omni_abort.py
index f7144086689..baf488cada8 100644
--- a/tests/entrypoints/test_async_omni_abort.py
+++ b/tests/entrypoints/test_async_omni_abort.py
@@ -26,7 +26,6 @@ async def fake_process_results(
             final_stage_id_for_e2e,
             req_start_ts,
             input_preprocess_time_ms,
-            build_add_request_message_time_ms,
             wall_start_ts,
         ):
             del (
@@ -34,7 +33,6 @@ async def fake_process_results(
                 final_stage_id_for_e2e,
                 req_start_ts,
                 input_preprocess_time_ms,
-                build_add_request_message_time_ms,
                 wall_start_ts,
             )
             if request_id.startswith("cancel-"):
diff --git a/tests/entrypoints/test_pd_disaggregation.py b/tests/entrypoints/test_pd_disaggregation.py
index 4b220fbcb8c..b03f425084e 100644
--- a/tests/entrypoints/test_pd_disaggregation.py
+++ b/tests/entrypoints/test_pd_disaggregation.py
@@ -258,20 +258,12 @@ def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e
             self.e2e_count = 0
             self.engine_pipeline_total_ms = 0.0
             self.input_preprocess_total_ms = 0.0
-            self.build_add_request_message_total_ms = 0.0
 
         def on_stage_metrics(self, stage_id, req_id, metrics, final_output_type=None):
             pass
 
-        def on_finalize_request(
-            self,
-            stage_id,
-            req_id,
-            start_ts,
-            input_preprocess_time_ms=0.0,
-            build_add_request_message_time_ms=0.0,
-        ):
-            del start_ts, input_preprocess_time_ms, build_add_request_message_time_ms
+        def on_finalize_request(self, stage_id, req_id, start_ts, input_preprocess_time_ms=0.0):
+            del start_ts, input_preprocess_time_ms
             self.e2e_done.add(req_id)
 
         def on_forward(self, from_stage, to_stage, req_id, size_bytes, tx_ms, use_shm):
diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py
index aa81b09421f..5c60f02ca18 100644
--- a/tests/metrics/test_stats.py
+++ b/tests/metrics/test_stats.py
@@ -39,6 +39,7 @@ def test_orchestrator_aggregator_builds_summary() -> None:
             stage_stats=StageStats(),
         ),
     )
+    agg.record_ar2diffusion_time(0, "r1", 12.5)
     agg.on_stage_metrics(
         1,
         "r1",
@@ -65,6 +66,7 @@ def test_orchestrator_aggregator_builds_summary() -> None:
     stage_entry = _get_request_entry(summary["stage_table"], "r1")
     stage_ids = [row["stage_id"] for row in stage_entry["stages"]]
     assert stage_ids == [0, 1]
+    assert stage_entry["stages"][0]["ar2diffusion_time_ms"] == 12.5
 
     transfer_entry = _get_request_entry(summary["trans_table"], "r1")
     assert transfer_entry["transfers"][0]["edge"] == "0->1"
@@ -81,7 +83,6 @@ def test_build_and_log_summary_e2e_only() -> None:
             request_id="r",
             request_wall_time_ms=10.0,
             input_preprocess_time_ms=0.0,
-            build_add_request_message_time_ms=0.0,
             engine_pipeline_time_ms=10.0,
             e2e_total_tokens=5,
             transfers_total_time_ms=0.0,
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index bd289ca6b1b..8299a577a41 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -1485,7 +1485,7 @@ def add_request(
         reasoning_ended: bool | None = None,
         *,
         resumable: bool = False,
-    ) -> dict[str, float]:
+    ) -> None:
         """Process stage-0 input locally, then send to the Orchestrator.
 
         Input processing and output
@@ -1493,7 +1493,6 @@ def add_request(
         a queue + coroutine-switch round-trip.  The Orchestrator receives a
         ready-to-submit OmniEngineCoreRequest.
         """
-        build_msg_t0 = time.perf_counter()
         msg = self._build_add_request_message(
             request_id=request_id,
             prompt=prompt,
@@ -1509,7 +1508,6 @@ def add_request(
             reasoning_ended=reasoning_ended,
             resumable=resumable,
         )
-        build_msg_ms = (time.perf_counter() - build_msg_t0) * 1000.0
         if self.request_queue is None:
             raise RuntimeError("request_queue is not initialized")
         self.request_queue.sync_q.put_nowait(msg)
@@ -1522,10 +1520,6 @@ def add_request(
             stage0_params = effective_spl[0] if effective_spl else None
             if stage0_params is not None:
                 self._enqueue_cfg_companions(request_id, original_prompt, stage0_params, effective_spl)
-        return {
-            "input_preprocess_time_ms": build_msg_ms,
-            "build_add_request_message_time_ms": build_msg_ms,
-        }
 
     async def add_request_async(
         self,
@@ -1543,9 +1537,9 @@ async def add_request_async(
         reasoning_ended: bool | None = None,
         *,
         resumable: bool = False,
-    ) -> dict[str, float]:
+    ) -> None:
         """Async add_request API."""
-        return self.add_request(
+        self.add_request(
             request_id=request_id,
             prompt=prompt,
             prompt_text=prompt_text,
diff --git a/vllm_omni/engine/cfg_companion_tracker.py b/vllm_omni/engine/cfg_companion_tracker.py
index b9dfae833e2..74cdf818775 100644
--- a/vllm_omni/engine/cfg_companion_tracker.py
+++ b/vllm_omni/engine/cfg_companion_tracker.py
@@ -80,7 +80,7 @@ def on_companion_completed(self, companion_id: str) -> str | None:
             return parent_id
         return None
 
-    def defer_parent(self, parent_id: str, engine_outputs: Any, stage_id: int) -> None:
+    def defer_parent(self, parent_id: str, engine_outputs: Any, stage_id: int, stage_metrics: Any | None = None) -> None:
         """Hold parent result while waiting for companions to finish."""
         # TODO: Add timeout/error recovery when the orchestrator grows a
         # companion-failure path. Today deferred parents are released only when
@@ -88,6 +88,7 @@ def defer_parent(self, parent_id: str, engine_outputs: Any, stage_id: int) -> No
         self._pending_parents[parent_id] = {
             "engine_outputs": engine_outputs,
             "stage_id": stage_id,
+            "stage_metrics": stage_metrics,
         }
         logger.debug("Parent %s deferred, waiting for CFG companions", parent_id)
 
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index a7edf43394a..24830b9b840 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -425,17 +425,6 @@ async def _route_output(
                     "stage_submit_ts": submit_ts,
                 }
             )
-        elif stage_metrics is not None:
-            await self.output_async_queue.put(
-                {
-                    "type": "stage_metrics",
-                    "request_id": req_id,
-                    "stage_id": stage_id,
-                    "metrics": stage_metrics,
-                    "stage_submit_ts": submit_ts,
-                }
-            )
-
         # PD disaggregation: extract KV transfer params from prefill stage output
         if self._pd_pair is not None and finished and stage_id == self._pd_pair[0]:
             kv_params = getattr(output, "kv_transfer_params", None)
@@ -464,13 +453,14 @@ async def _route_output(
                 and self._cfg_tracker.has_companions(req_id)
                 and not self._cfg_tracker.all_companions_done(req_id)
             ):
-                self._cfg_tracker.defer_parent(req_id, output, stage_id)
+                self._cfg_tracker.defer_parent(req_id, output, stage_id, stage_metrics)
             else:
                 await self._forward_to_next_stage(
                     req_id,
                     stage_id,
                     output,
                     req_state,
+                    stage_metrics=stage_metrics,
                     is_streaming_session=req_state.streaming.enabled,
                     is_final_update=False,
                 )
@@ -481,10 +471,22 @@ async def _route_output(
                         stage_id,
                         output,
                         req_state,
+                        stage_metrics=stage_metrics,
                         is_streaming_session=True,
                         is_final_update=True,
                     )
 
+        if not stage_client.final_output and stage_metrics is not None:
+            await self.output_async_queue.put(
+                {
+                    "type": "stage_metrics",
+                    "request_id": req_id,
+                    "stage_id": stage_id,
+                    "metrics": stage_metrics,
+                    "stage_submit_ts": submit_ts,
+                }
+            )
+
         if finished and stage_id == req_state.final_stage_id:
             # PD: clean up any lingering KV params for this request
             self._pd_kv_params.pop(req_id, None)
@@ -509,6 +511,7 @@ async def _handle_cfg_companion_ready(self, req_id: str) -> None:
                 deferred["stage_id"],
                 deferred["engine_outputs"],
                 parent_state,
+                stage_metrics=deferred.get("stage_metrics"),
             )
 
     async def _handle_kv_ready_raw_outputs(self, stage_id: int, raw_outputs: EngineCoreOutputs) -> None:
@@ -657,6 +660,7 @@ async def _forward_to_next_stage(
         stage_id: int,
         output: Any,
         req_state: OrchestratorRequestState,
+        stage_metrics: Any | None = None,
         *,
         is_streaming_session: bool = False,
         is_final_update: bool = False,
@@ -682,13 +686,8 @@ async def _forward_to_next_stage(
                     False,
                 )
                 _dt_ar2d = (_time.perf_counter() - _t_ar2d) * 1000
-                logger.info(
-                    "[Orchestrator] ar2diffusion req=%s wall_time=%.3fms stage=%d->%d",
-                    req_id,
-                    _dt_ar2d,
-                    stage_id,
-                    next_stage_id,
-                )
+                if stage_metrics is not None:
+                    stage_metrics.ar2diffusion_time_ms = _dt_ar2d
                 if isinstance(diffusion_prompt, list):
                     if not diffusion_prompt:
                         error_output = OmniRequestOutput.from_error(
diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py
index 2641e8437b3..bac15dbb835 100644
--- a/vllm_omni/entrypoints/async_omni.py
+++ b/vllm_omni/entrypoints/async_omni.py
@@ -285,7 +285,6 @@ async def generate(
             request_prep_start_ts = wall_start_ts
             req_start_ts: dict[str, float] = {}
             input_preprocess_time_ms: dict[str, float] = {}
-            build_add_request_message_time_ms: dict[str, float] = {}
 
             # Determine the final stage for E2E stats
             final_stage_id_for_e2e = self._compute_final_stage_id(output_modalities)
@@ -317,22 +316,15 @@ async def generate(
                     sampling_params_list=req_sp_list,
                     final_stage_id=final_stage_id_for_e2e,
                 )
-                add_request_timings = {}
             else:
-                add_request_timings = await self.engine.add_request_async(
+                await self.engine.add_request_async(
                     request_id=request_id,
                     prompt=prompt,
                     sampling_params_list=req_sp_list,
                     final_stage_id=final_stage_id_for_e2e,
-                ) or {}
+                )
             submit_ts = time.time()
             input_preprocess_time_ms[request_id] = (submit_ts - request_prep_start_ts) * 1000.0
-            build_add_request_message_time_ms[request_id] = float(
-                add_request_timings.get(
-                    "build_add_request_message_time_ms",
-                    input_preprocess_time_ms[request_id],
-                )
-            )
             req_state.metrics.stage_first_ts[0] = submit_ts
             req_start_ts[request_id] = submit_ts
 
@@ -346,7 +338,6 @@ async def generate(
                 final_stage_id_for_e2e,
                 req_start_ts,
                 input_preprocess_time_ms,
-                build_add_request_message_time_ms,
                 wall_start_ts,
             ):
                 yield output
@@ -494,7 +485,6 @@ async def _process_orchestrator_results(
         final_stage_id_for_e2e: int,
         req_start_ts: dict[str, float],
         input_preprocess_time_ms: dict[str, float],
-        build_add_request_message_time_ms: dict[str, float],
         wall_start_ts: float,
     ) -> AsyncGenerator[OmniRequestOutput, None]:
         """Read results from the Orchestrator (via the request's asyncio.Queue)
@@ -537,7 +527,6 @@ async def _process_orchestrator_results(
                 metrics,
                 req_start_ts,
                 input_preprocess_time_ms,
-                build_add_request_message_time_ms,
                 wall_start_ts,
                 final_stage_id_for_e2e,
             )
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index 7432b9cabae..fcff91df56d 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -115,7 +115,6 @@ def _run_generation(
             request_ids = [f"{i}_{uuid.uuid4()}" for i in range(len(request_prompts))]
             req_start_ts: dict[str, float] = {}
             input_preprocess_time_ms: dict[str, float] = {}
-            build_add_request_message_time_ms: dict[str, float] = {}
             wall_start_ts = time.time()
             req_final_stage_ids: dict[str, int] = {}
 
@@ -143,17 +142,14 @@ def _run_generation(
                     p_id = pd_pair[0]
                     req_sp_list[p_id] = self._prepare_prefill_sampling_params(req_id, req_sp_list[p_id])
 
-                add_request_timings = self.engine.add_request(
+                self.engine.add_request(
                     request_id=req_id,
                     prompt=prompt,
                     sampling_params_list=req_sp_list,
                     final_stage_id=final_stage_id,
-                ) or {}
+                )
                 submit_ts = time.time()
                 input_preprocess_time_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0
-                build_add_request_message_time_ms[req_id] = float(
-                    add_request_timings.get("build_add_request_message_time_ms", input_preprocess_time_ms[req_id])
-                )
                 req_state.metrics.stage_first_ts[0] = submit_ts
                 req_start_ts[req_id] = submit_ts
 
@@ -184,7 +180,6 @@ def _run_generation(
                     metrics=req_state.metrics,
                     req_start_ts=req_start_ts,
                     input_preprocess_time_ms=input_preprocess_time_ms,
-                    build_add_request_message_time_ms=build_add_request_message_time_ms,
                     wall_start_ts=wall_start_ts,
                     final_stage_id_for_e2e=req_final_stage_ids[req_id],
                 )
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 2d25860ef1e..1cf359f7c08 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -373,7 +373,6 @@ def _process_single_result(
         metrics: OrchestratorMetrics,
         req_start_ts: dict[str, float],
         input_preprocess_time_ms: dict[str, float],
-        build_add_request_message_time_ms: dict[str, float],
         wall_start_ts: float,
         final_stage_id_for_e2e: int,
     ) -> OmniRequestOutput | None:
@@ -405,7 +404,6 @@ def _process_single_result(
                     req_id,
                     req_start_ts.get(req_id, wall_start_ts),
                     input_preprocess_time_ms=input_preprocess_time_ms.get(req_id, 0.0),
-                    build_add_request_message_time_ms=build_add_request_message_time_ms.get(req_id, 0.0),
                 )
         except Exception:
             logger.exception("[%s] Finalize request handling error", self.__class__.__name__)
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index 4b22670ad07..618c292c7e7 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -43,6 +43,7 @@ class StageRequestStats:
     postprocess_time_ms: float = 0.0
     diffusion_metrics: dict[str, int] = None
     audio_generated_frames: int = 0
+    ar2diffusion_time_ms: float = 0.0
 
     @property
     def rx_mbps(self) -> float:
@@ -78,7 +79,6 @@ class RequestE2EStats:
     request_id: str
     request_wall_time_ms: float
     input_preprocess_time_ms: float
-    build_add_request_message_time_ms: float
     engine_pipeline_time_ms: float
     e2e_total_tokens: int
     transfers_total_time_ms: float
@@ -142,7 +142,6 @@ def init_run_state(self, wall_start_ts: float) -> None:
         self.stage_total_tokens = [0 for _ in range(self.num_stages)]
         self.engine_pipeline_total_ms = 0.0
         self.input_preprocess_total_ms = 0.0
-        self.build_add_request_message_total_ms = 0.0
         self.e2e_total_tokens = 0
         self.e2e_count = 0
         self.e2e_done = set()
@@ -189,6 +188,7 @@ def _format_seconds(self, ms: float) -> str:
     def _log_omni_timing(self, evt: RequestE2EStats) -> None:
         rid = evt.request_id
         stages = []
+        ar2diffusion_ms = 0.0
         for stage_evt in sorted(
             self.stage_events.get(rid, []),
             key=lambda e: e.stage_id if e.stage_id is not None else -1,
@@ -196,6 +196,7 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None:
             stages.append(
                 f"{self._stage_label(stage_evt.stage_id)}={self._format_seconds(stage_evt.stage_gen_time_ms)}"
             )
+            ar2diffusion_ms += float(stage_evt.ar2diffusion_time_ms or 0.0)
         transfers = []
         for transfer_evt in sorted(
             [e for e in self.transfer_events.values() if e.request_id == rid],
@@ -203,12 +204,14 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None:
         ):
             transfers.append(f"{transfer_evt.from_stage}->{transfer_evt.to_stage}={transfer_evt.total_time_ms:.3f}ms")
 
+        ar2diffusion = f" ar2diffusion={self._format_seconds(ar2diffusion_ms)}" if ar2diffusion_ms > 0.0 else ""
         logger.info(
-            "[OmniTiming] req=%s total=%s preprocess=%s engine=%s stages=[%s] transfers=[%s]",
+            "[OmniTiming] req=%s total=%s preprocess=%s engine=%s%s stages=[%s] transfers=[%s]",
             rid,
             self._format_seconds(evt.request_wall_time_ms),
             self._format_seconds(evt.input_preprocess_time_ms),
             self._format_seconds(evt.engine_pipeline_time_ms),
+            ar2diffusion,
             ",".join(stages),
             ",".join(transfers),
         )
@@ -418,19 +421,27 @@ def on_stage_metrics(
 
         self.record_transfer_rx(stats)
 
-    def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_time_ms: float) -> None:
-        if req_id in self.stage_events:
-            for stats in self.stage_events[req_id]:
+    def _update_stage_event_field(self, stage_id: int, req_id: Any, field_name: str, value: float) -> None:
+        rid_key = str(req_id)
+        if rid_key in self.stage_events:
+            for stats in self.stage_events[rid_key]:
                 if stats.stage_id == stage_id:
-                    stats.postprocess_time_ms = float(postproc_time_ms)
+                    setattr(stats, field_name, float(value))
                     break
         else:
             logger.warning(
-                "Failed to record postprocess time for request %s at stage %s: no stage event found",
+                "Failed to record %s for request %s at stage %s: no stage event found",
+                field_name,
                 req_id,
                 stage_id,
             )
 
+    def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_time_ms: float) -> None:
+        self._update_stage_event_field(stage_id, req_id, "postprocess_time_ms", postproc_time_ms)
+
+    def record_ar2diffusion_time(self, stage_id: int, req_id: Any, ar2diffusion_time_ms: float) -> None:
+        self._update_stage_event_field(stage_id, req_id, "ar2diffusion_time_ms", ar2diffusion_time_ms)
+
     @contextmanager
     def stage_postprocess_timer(self, stage_id: int, req_id: Any):
         """Context manager for measuring and recording stage postprocessing time.
@@ -492,7 +503,6 @@ def on_finalize_request(
         req_id: Any,
         req_start_ts: float,
         input_preprocess_time_ms: float = 0.0,
-        build_add_request_message_time_ms: float = 0.0,
     ) -> None:
         rid_key = str(req_id)
         if rid_key in self.e2e_done:
@@ -505,7 +515,6 @@ def on_finalize_request(
         self.last_finish_ts = max(self.last_finish_ts, _t1)
         engine_pipeline_ms = (_t1 - _t0) * 1000.0
         input_preprocess_ms = float(input_preprocess_time_ms)
-        build_msg_ms = float(build_add_request_message_time_ms)
         request_wall_ms = engine_pipeline_ms + input_preprocess_ms
 
         # Sum tokens from all stages for this request
@@ -519,7 +528,6 @@ def on_finalize_request(
 
         self.engine_pipeline_total_ms += engine_pipeline_ms
         self.input_preprocess_total_ms += input_preprocess_ms
-        self.build_add_request_message_total_ms += build_msg_ms
         self.e2e_total_tokens += total_tokens
         self.e2e_count += 1
         self.e2e_done.add(rid_key)
@@ -527,7 +535,6 @@ def on_finalize_request(
             request_id=rid_key,
             request_wall_time_ms=request_wall_ms,
             input_preprocess_time_ms=input_preprocess_ms,
-            build_add_request_message_time_ms=build_msg_ms,
             engine_pipeline_time_ms=engine_pipeline_ms,
             e2e_total_tokens=total_tokens,
             transfers_total_time_ms=float(
@@ -565,7 +572,6 @@ def build_and_log_summary(self) -> dict[str, Any]:
             "e2e_requests": int(self.e2e_count),
             "request_wall_time_ms": float(wall_time_ms),
             "input_preprocess_time_ms": float(self.input_preprocess_total_ms),
-            "build_add_request_message_time_ms": float(self.build_add_request_message_total_ms),
             "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms),
             "e2e_total_tokens": int(self.e2e_total_tokens),
             "avg_request_wall_time_ms": float(e2e_avg_req),
@@ -607,23 +613,6 @@ def build_and_log_summary(self) -> dict[str, Any]:
                 e2e_data = _build_row(e2e_evt, E2E_FIELDS)
                 result_e2e_table.append({"request_id": rid, **e2e_data})
 
-                # filter out all-zero fields for logging
-                nonzero_e2e_fields = set()
-                for k, v in e2e_data.items():
-                    if v not in (0, 0.000, None, ""):
-                        nonzero_e2e_fields.add(k)
-                value_fields_e2e = sorted(nonzero_e2e_fields)
-
-                if value_fields_e2e:
-                    logger.info(
-                        "\n%s",
-                        _format_table(
-                            f"RequestE2EStats [request_id={rid}]",
-                            e2e_data,
-                            value_fields=value_fields_e2e,
-                        ),
-                    )
-
             # === Stage table (columns = stage_id) ===
             stage_evts = sorted(
                 self.stage_events.get(rid, []),

From 49337b8c668117ed37805027d5dfbf7ece478a8c Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Thu, 23 Apr 2026 20:50:48 +0800
Subject: [PATCH 04/11] Clarify Omni timing breakdown

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 docs/contributing/metrics.md     |  23 ++++-
 tests/metrics/test_stats.py      |  12 ++-
 vllm_omni/engine/orchestrator.py |  12 ++-
 vllm_omni/metrics/stats.py       | 164 ++++++++++++++++++++++++++++++-
 4 files changed, 203 insertions(+), 8 deletions(-)

diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md
index f1fc37cbddf..76978da9aa7 100644
--- a/docs/contributing/metrics.md
+++ b/docs/contributing/metrics.md
@@ -99,6 +99,8 @@ For **offline inference** (batch mode), the summary includes both system-level m
 
 For **online inference** (serving mode), the summary is always per-request. `e2e_requests` is always 1, and only request-level metrics are reported for each completion.
 
+When `e2e_requests` is 1, average fields are omitted from the printed `Overall Summary` because they are identical to the total/request-level values. They remain meaningful for offline batches with multiple completed requests.
+
 ---
 
 ## Parameter Details
@@ -111,13 +113,33 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 | `request_wall_time_ms`        | Wall-clock time span from request preparation start to final completion, in ms.          |
 | `input_preprocess_time_ms` | Time spent preparing and submitting requests before the engine pipeline starts.             |
 | `engine_pipeline_time_ms` | Time from engine request submission to final completion.                                     |
+| `stage_gen_total_time_ms` | Sum of all stage `stage_gen_time_ms` values.                                                 |
+| `stage_handoff_total_time_ms` | Sum of inter-stage handoff time, measured after an upstream stage finishes and before the downstream stage is submitted. |
+| `ar2diffusion_total_time_ms` | Subset of `stage_handoff_total_time_ms` spent converting AR outputs into diffusion inputs. |
+| `final_output_overhead_time_ms` | Final engine-side overhead not covered by stage generation or inter-stage handoff.       |
+| `stage_{i}_to_{j}_handoff_time_ms` | Handoff time for a specific stage edge, e.g. `stage_0_to_1_handoff_time_ms`.       |
+| `stage_{i}_to_{j}_ar2diffusion_time_ms` | AR-to-diffusion conversion time included in that edge's handoff time.           |
 | `e2e_total_tokens`        | Total tokens counted across all completed requests (stage0 input + all stage outputs).       |
 | `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / e2e_requests`.                     |
 | `avg_input_preprocess_time_ms` | Average pre-submit request preparation time per completed request.                    |
 | `avg_engine_pipeline_time_ms` | Average engine pipeline time per completed request.                                      |
+| `avg_stage_gen_total_time_ms` | Average summed stage generation time per completed request.                              |
+| `avg_stage_handoff_total_time_ms` | Average summed inter-stage handoff time per completed request.                    |
+| `avg_ar2diffusion_time_ms` | Average AR-to-diffusion conversion time per completed request.                             |
+| `avg_final_output_overhead_time_ms` | Average final engine-side overhead per completed request.                         |
 | `e2e_avg_tokens_per_s`    | Average token throughput over wall time: `e2e_total_tokens * 1000 / request_wall_time_ms`.  |
 | `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. |
 
+### Timing Breakdown
+
+The printed summary also includes a `Timing Breakdown` section that shows the containment relationship:
+
+`request_wall_time_ms = input_preprocess_time_ms + engine_pipeline_time_ms`
+
+`engine_pipeline_time_ms = stage_gen_total_time_ms + stage_handoff_total_time_ms + final_output_overhead_time_ms`
+
+`ar2diffusion_total_time_ms` is included in `stage_handoff_total_time_ms`. For a concrete AR-to-diffusion edge, `stage_0_to_1_ar2diffusion_time_ms` is included in `stage_0_to_1_handoff_time_ms`.
+
 ---
 
 ### E2E Table (per request)
@@ -146,7 +168,6 @@ For **online inference** (serving mode), the summary is always per-request. `e2e
 | `image_num`               | Number of images generated (for diffusion/image stages).                                        |
 | `resolution`              | Image resolution (for diffusion/image stages).                                                                  |
 | `postprocess_time_ms` | Diffusion/image: post-processing time in ms.                                                    |
-| `ar2diffusion_time_ms` | Time spent converting AR-stage output into the diffusion-stage prompt before diffusion request submission. |
 
 ---
 
diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py
index 5c60f02ca18..1f01d6f40c9 100644
--- a/tests/metrics/test_stats.py
+++ b/tests/metrics/test_stats.py
@@ -37,9 +37,11 @@ def test_orchestrator_aggregator_builds_summary() -> None:
             rx_decode_time_ms=0.0,
             rx_in_flight_time_ms=0.0,
             stage_stats=StageStats(),
+            handoff_to_stage_id=1,
+            stage_handoff_time_ms=20.0,
+            ar2diffusion_time_ms=12.5,
         ),
     )
-    agg.record_ar2diffusion_time(0, "r1", 12.5)
     agg.on_stage_metrics(
         1,
         "r1",
@@ -62,11 +64,17 @@ def test_orchestrator_aggregator_builds_summary() -> None:
     assert overall["e2e_requests"] == 1
     assert overall["request_wall_time_ms"] == 70.0
     assert overall["input_preprocess_time_ms"] == 0.0
+    assert overall["stage_gen_total_time_ms"] == 50.0
+    assert overall["stage_handoff_total_time_ms"] == 20.0
+    assert overall["stage_0_to_1_handoff_time_ms"] == 20.0
+    assert overall["ar2diffusion_total_time_ms"] == 12.5
+    assert overall["stage_0_to_1_ar2diffusion_time_ms"] == 12.5
 
     stage_entry = _get_request_entry(summary["stage_table"], "r1")
     stage_ids = [row["stage_id"] for row in stage_entry["stages"]]
     assert stage_ids == [0, 1]
-    assert stage_entry["stages"][0]["ar2diffusion_time_ms"] == 12.5
+    assert "stage_handoff_time_ms" not in stage_entry["stages"][0]
+    assert "ar2diffusion_time_ms" not in stage_entry["stages"][0]
 
     transfer_entry = _get_request_entry(summary["trans_table"], "r1")
     assert transfer_entry["transfers"][0]["edge"] == "0->1"
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index 24830b9b840..f61f6346a12 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -628,6 +628,7 @@ def _build_stage_metrics(
                 total_token=self._agg_total_tokens[stage_id],
                 total_gen_time_ms=self._agg_total_gen_time_ms[stage_id],
             ),
+            stage_end_ts=now,
         )
 
     def _build_kv_sender_info(self, sender_stage_ids: list[int]) -> dict[int, dict[str, Any]] | None:
@@ -676,7 +677,9 @@ async def _forward_to_next_stage(
         next_stage_resumable = is_streaming_session and not is_final_update
 
         if next_client.stage_type == "diffusion":
+            handoff_start_ts = getattr(stage_metrics, "stage_end_ts", None) or _time.time()
             self.stage_clients[stage_id].set_engine_outputs([output])
+            ar2diffusion_ms = 0.0
             if next_client.custom_process_input_func is not None:
                 _t_ar2d = _time.perf_counter()
                 diffusion_prompt = next_client.custom_process_input_func(
@@ -686,6 +689,7 @@ async def _forward_to_next_stage(
                     False,
                 )
                 _dt_ar2d = (_time.perf_counter() - _t_ar2d) * 1000
+                ar2diffusion_ms = _dt_ar2d
                 if stage_metrics is not None:
                     stage_metrics.ar2diffusion_time_ms = _dt_ar2d
                 if isinstance(diffusion_prompt, list):
@@ -737,7 +741,13 @@ async def _forward_to_next_stage(
                     params,
                     kv_sender_info=kv_sender_info,
                 )
-            req_state.stage_submit_ts[next_stage_id] = _time.time()
+            next_submit_ts = _time.time()
+            req_state.stage_submit_ts[next_stage_id] = next_submit_ts
+            if stage_metrics is not None:
+                stage_metrics.handoff_to_stage_id = next_stage_id
+                stage_metrics.stage_handoff_time_ms = max(0.0, (next_submit_ts - handoff_start_ts) * 1000.0)
+                if stage_metrics.ar2diffusion_time_ms == 0.0:
+                    stage_metrics.ar2diffusion_time_ms = ar2diffusion_ms
             return
 
         # PD disaggregation: prefill → decode routing uses original prompt + KV transfer params
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index 618c292c7e7..7ab31cc3b65 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -43,6 +43,9 @@ class StageRequestStats:
     postprocess_time_ms: float = 0.0
     diffusion_metrics: dict[str, int] = None
     audio_generated_frames: int = 0
+    stage_end_ts: float | None = None
+    handoff_to_stage_id: int | None = None
+    stage_handoff_time_ms: float = 0.0
     ar2diffusion_time_ms: float = 0.0
 
     @property
@@ -106,12 +109,34 @@ def e2e_tpt(self) -> float:
     "rx_decode_time_ms",
     "rx_in_flight_time_ms",
     "final_output_type",
+    "stage_end_ts",
+    "handoff_to_stage_id",
+    "stage_handoff_time_ms",
+    "ar2diffusion_time_ms",
 }
 TRANSFER_EXCLUDE = {"from_stage", "to_stage", "request_id", "used_shm"}
 E2E_EXCLUDE = {"request_id"}
 
 # Decide the order of overall summary fields, or None for auto
-OVERALL_FIELDS: list[str] | None = None
+OVERALL_FIELDS: list[str] | None = [
+    "e2e_requests",
+    "request_wall_time_ms",
+    "input_preprocess_time_ms",
+    "engine_pipeline_time_ms",
+    "stage_gen_total_time_ms",
+    "stage_handoff_total_time_ms",
+    "ar2diffusion_total_time_ms",
+    "final_output_overhead_time_ms",
+    "e2e_total_tokens",
+    "avg_request_wall_time_ms",
+    "avg_input_preprocess_time_ms",
+    "avg_engine_pipeline_time_ms",
+    "avg_stage_gen_total_time_ms",
+    "avg_stage_handoff_total_time_ms",
+    "avg_ar2diffusion_time_ms",
+    "avg_final_output_overhead_time_ms",
+    "e2e_avg_tokens_per_s",
+]
 STAGE_FIELDS = _build_field_defs(StageRequestStats, STAGE_EXCLUDE, FIELD_TRANSFORMS)
 TRANSFER_FIELDS = _build_field_defs(TransferEdgeStats, TRANSFER_EXCLUDE, FIELD_TRANSFORMS)
 E2E_FIELDS = _build_field_defs(RequestE2EStats, E2E_EXCLUDE, FIELD_TRANSFORMS)
@@ -185,9 +210,13 @@ def _stage_label(self, stage_id: int | None) -> str:
     def _format_seconds(self, ms: float) -> str:
         return f"{ms / 1000.0:.2f}s"
 
+    def _format_ms(self, ms: float) -> str:
+        return f"{ms:,.3f} ms"
+
     def _log_omni_timing(self, evt: RequestE2EStats) -> None:
         rid = evt.request_id
         stages = []
+        handoffs = []
         ar2diffusion_ms = 0.0
         for stage_evt in sorted(
             self.stage_events.get(rid, []),
@@ -197,6 +226,14 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None:
                 f"{self._stage_label(stage_evt.stage_id)}={self._format_seconds(stage_evt.stage_gen_time_ms)}"
             )
             ar2diffusion_ms += float(stage_evt.ar2diffusion_time_ms or 0.0)
+            if stage_evt.handoff_to_stage_id is not None and stage_evt.stage_handoff_time_ms > 0.0:
+                handoff = (
+                    f"{stage_evt.stage_id}->{stage_evt.handoff_to_stage_id}="
+                    f"{stage_evt.stage_handoff_time_ms:.3f}ms"
+                )
+                if stage_evt.ar2diffusion_time_ms > 0.0:
+                    handoff += f"(ar2diffusion={stage_evt.ar2diffusion_time_ms:.3f}ms)"
+                handoffs.append(handoff)
         transfers = []
         for transfer_evt in sorted(
             [e for e in self.transfer_events.values() if e.request_id == rid],
@@ -206,16 +243,60 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None:
 
         ar2diffusion = f" ar2diffusion={self._format_seconds(ar2diffusion_ms)}" if ar2diffusion_ms > 0.0 else ""
         logger.info(
-            "[OmniTiming] req=%s total=%s preprocess=%s engine=%s%s stages=[%s] transfers=[%s]",
+            "[OmniTiming] req=%s total=%s preprocess=%s engine=%s%s stages=[%s] handoffs=[%s] transfers=[%s]",
             rid,
             self._format_seconds(evt.request_wall_time_ms),
             self._format_seconds(evt.input_preprocess_time_ms),
             self._format_seconds(evt.engine_pipeline_time_ms),
             ar2diffusion,
             ",".join(stages),
+            ",".join(handoffs),
             ",".join(transfers),
         )
 
+    def _build_timing_breakdown_lines(self, summary: dict[str, Any]) -> list[str]:
+        lines = [
+            "[Timing Breakdown]",
+            (
+                "request_wall_time_ms = input_preprocess_time_ms + engine_pipeline_time_ms "
+                f"= {self._format_ms(summary.get('input_preprocess_time_ms', 0.0))} + "
+                f"{self._format_ms(summary.get('engine_pipeline_time_ms', 0.0))} "
+                f"= {self._format_ms(summary.get('request_wall_time_ms', 0.0))}"
+            ),
+            (
+                "engine_pipeline_time_ms = stage_gen_total_time_ms + "
+                "stage_handoff_total_time_ms + final_output_overhead_time_ms "
+                f"= {self._format_ms(summary.get('stage_gen_total_time_ms', 0.0))} + "
+                f"{self._format_ms(summary.get('stage_handoff_total_time_ms', 0.0))} + "
+                f"{self._format_ms(summary.get('final_output_overhead_time_ms', 0.0))} "
+                f"= {self._format_ms(summary.get('engine_pipeline_time_ms', 0.0))}"
+            ),
+        ]
+
+        ar2diffusion_total_ms = float(summary.get("ar2diffusion_total_time_ms", 0.0) or 0.0)
+        if ar2diffusion_total_ms > 0.0:
+            lines.append(
+                "ar2diffusion_total_time_ms is included in stage_handoff_total_time_ms: "
+                f"{self._format_ms(ar2diffusion_total_ms)} <= "
+                f"{self._format_ms(summary.get('stage_handoff_total_time_ms', 0.0))}"
+            )
+
+        edge_handoff_fields = sorted(
+            k
+            for k in summary
+            if k.startswith("stage_") and "_to_" in k and k.endswith("_handoff_time_ms")
+        )
+        for handoff_field in edge_handoff_fields:
+            edge_name = handoff_field.removesuffix("_handoff_time_ms")
+            ar2d_field = f"{edge_name}_ar2diffusion_time_ms"
+            if ar2d_field in summary and float(summary.get(ar2d_field, 0.0) or 0.0) > 0.0:
+                lines.append(
+                    f"{ar2d_field} is included in {handoff_field}: "
+                    f"{self._format_ms(summary[ar2d_field])} <= {self._format_ms(summary[handoff_field])}"
+                )
+
+        return lines
+
     def _get_or_create_transfer_event(
         self,
         from_stage: int,
@@ -426,7 +507,7 @@ def _update_stage_event_field(self, stage_id: int, req_id: Any, field_name: str,
         if rid_key in self.stage_events:
             for stats in self.stage_events[rid_key]:
                 if stats.stage_id == stage_id:
-                    setattr(stats, field_name, float(value))
+                    setattr(stats, field_name, value if field_name == "handoff_to_stage_id" else float(value))
                     break
         else:
             logger.warning(
@@ -442,6 +523,16 @@ def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_tim
     def record_ar2diffusion_time(self, stage_id: int, req_id: Any, ar2diffusion_time_ms: float) -> None:
         self._update_stage_event_field(stage_id, req_id, "ar2diffusion_time_ms", ar2diffusion_time_ms)
 
+    def record_stage_handoff_time(
+        self,
+        from_stage: int,
+        to_stage: int,
+        req_id: Any,
+        handoff_time_ms: float,
+    ) -> None:
+        self._update_stage_event_field(from_stage, req_id, "handoff_to_stage_id", to_stage)
+        self._update_stage_event_field(from_stage, req_id, "stage_handoff_time_ms", handoff_time_ms)
+
     @contextmanager
     def stage_postprocess_timer(self, stage_id: int, req_id: Any):
         """Context manager for measuring and recording stage postprocessing time.
@@ -567,12 +658,36 @@ def build_and_log_summary(self) -> dict[str, Any]:
             else 0.0
             for i in range(self.num_stages)
         ]
+        stage_gen_total_ms = 0.0
+        stage_handoff_total_ms = 0.0
+        ar2diffusion_total_ms = 0.0
+        handoff_edge_totals: defaultdict[str, float] = defaultdict(float)
+        handoff_edge_ar2diffusion: defaultdict[str, float] = defaultdict(float)
+        for stage_evts in self.stage_events.values():
+            for evt in stage_evts:
+                stage_gen_total_ms += float(evt.stage_gen_time_ms or 0.0)
+                handoff_ms = float(evt.stage_handoff_time_ms or 0.0)
+                ar2d_ms = float(evt.ar2diffusion_time_ms or 0.0)
+                stage_handoff_total_ms += handoff_ms
+                ar2diffusion_total_ms += ar2d_ms
+                if evt.stage_id is not None and evt.handoff_to_stage_id is not None:
+                    edge = f"stage_{evt.stage_id}_to_{evt.handoff_to_stage_id}"
+                    handoff_edge_totals[edge] += handoff_ms
+                    handoff_edge_ar2diffusion[edge] += ar2d_ms
+        final_output_overhead_ms = max(
+            0.0,
+            float(self.engine_pipeline_total_ms) - stage_gen_total_ms - stage_handoff_total_ms,
+        )
 
         overall_summary = {
             "e2e_requests": int(self.e2e_count),
             "request_wall_time_ms": float(wall_time_ms),
             "input_preprocess_time_ms": float(self.input_preprocess_total_ms),
             "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms),
+            "stage_gen_total_time_ms": float(stage_gen_total_ms),
+            "stage_handoff_total_time_ms": float(stage_handoff_total_ms),
+            "ar2diffusion_total_time_ms": float(ar2diffusion_total_ms),
+            "final_output_overhead_time_ms": float(final_output_overhead_ms),
             "e2e_total_tokens": int(self.e2e_total_tokens),
             "avg_request_wall_time_ms": float(e2e_avg_req),
             "avg_input_preprocess_time_ms": float(
@@ -581,8 +696,23 @@ def build_and_log_summary(self) -> dict[str, Any]:
             "avg_engine_pipeline_time_ms": float(
                 self.engine_pipeline_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
             ),
+            "avg_stage_gen_total_time_ms": float(stage_gen_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
+            "avg_stage_handoff_total_time_ms": float(
+                stage_handoff_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
+            ),
+            "avg_ar2diffusion_time_ms": float(
+                ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
+            ),
+            "avg_final_output_overhead_time_ms": float(
+                final_output_overhead_ms / self.e2e_count if self.e2e_count > 0 else 0.0
+            ),
             "e2e_avg_tokens_per_s": float(e2e_avg_tok),
         }
+        for edge, handoff_time in sorted(handoff_edge_totals.items()):
+            overall_summary[f"{edge}_handoff_time_ms"] = float(handoff_time)
+            ar2d_time = handoff_edge_ar2diffusion.get(edge, 0.0)
+            if ar2d_time > 0.0:
+                overall_summary[f"{edge}_ar2diffusion_time_ms"] = float(ar2d_time)
         # Add stage_wall_time_ms as separate fields for each stage
         for idx, wall_time in enumerate(stage_wall_time_ms):
             overall_summary[f"e2e_stage_{idx}_wall_time_ms"] = wall_time
@@ -590,7 +720,32 @@ def build_and_log_summary(self) -> dict[str, Any]:
         # Print overall summary
         # filter out all-zero fields for logging
         overall_fields = []
-        for k in OVERALL_FIELDS or list(overall_summary.keys()):
+        avg_fields = {
+            "avg_request_wall_time_ms",
+            "avg_input_preprocess_time_ms",
+            "avg_engine_pipeline_time_ms",
+            "avg_stage_gen_total_time_ms",
+            "avg_stage_handoff_total_time_ms",
+            "avg_ar2diffusion_time_ms",
+            "avg_final_output_overhead_time_ms",
+        }
+        dynamic_overall_fields = [
+            k
+            for k in overall_summary
+            if k.startswith("stage_")
+            and "_to_" in k
+            and (k.endswith("_handoff_time_ms") or k.endswith("_ar2diffusion_time_ms"))
+        ]
+        stage_wall_fields = [k for k in overall_summary if k.startswith("e2e_stage_") and k.endswith("_wall_time_ms")]
+        ordered_overall_fields = list(OVERALL_FIELDS or [])
+        insert_at = ordered_overall_fields.index("e2e_total_tokens")
+        ordered_overall_fields[insert_at:insert_at] = sorted(
+            dynamic_overall_fields,
+            key=lambda name: (name.replace("_ar2diffusion_time_ms", "_zz_ar2diffusion_time_ms")),
+        ) + sorted(stage_wall_fields)
+        for k in ordered_overall_fields or list(overall_summary.keys()):
+            if self.e2e_count <= 1 and k in avg_fields:
+                continue
             v = overall_summary.get(k, None)
             if v not in (0, 0.0, 0.000, None, ""):
                 overall_fields.append(k)
@@ -599,6 +754,7 @@ def build_and_log_summary(self) -> dict[str, Any]:
                 "\n%s",
                 _format_table("Overall Summary", overall_summary, overall_fields),
             )
+            logger.info("\n%s", "\n".join(self._build_timing_breakdown_lines(overall_summary)))
 
         all_request_ids = sorted(set(self.stage_events.keys()) | {e.request_id for e in self.e2e_events})
 

From 8323309af63df33467bb4642b9f261dabde8ec5e Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Thu, 23 Apr 2026 21:01:25 +0800
Subject: [PATCH 05/11] Apply pre-commit formatting

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 vllm_omni/engine/cfg_companion_tracker.py |  8 +++++++-
 vllm_omni/metrics/stats.py                | 11 +++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/vllm_omni/engine/cfg_companion_tracker.py b/vllm_omni/engine/cfg_companion_tracker.py
index 74cdf818775..919a842af1d 100644
--- a/vllm_omni/engine/cfg_companion_tracker.py
+++ b/vllm_omni/engine/cfg_companion_tracker.py
@@ -80,7 +80,13 @@ def on_companion_completed(self, companion_id: str) -> str | None:
             return parent_id
         return None
 
-    def defer_parent(self, parent_id: str, engine_outputs: Any, stage_id: int, stage_metrics: Any | None = None) -> None:
+    def defer_parent(
+        self,
+        parent_id: str,
+        engine_outputs: Any,
+        stage_id: int,
+        stage_metrics: Any | None = None,
+    ) -> None:
         """Hold parent result while waiting for companions to finish."""
         # TODO: Add timeout/error recovery when the orchestrator grows a
         # companion-failure path. Today deferred parents are released only when
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index 7ab31cc3b65..3e4f0f6bd7e 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -228,8 +228,7 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None:
             ar2diffusion_ms += float(stage_evt.ar2diffusion_time_ms or 0.0)
             if stage_evt.handoff_to_stage_id is not None and stage_evt.stage_handoff_time_ms > 0.0:
                 handoff = (
-                    f"{stage_evt.stage_id}->{stage_evt.handoff_to_stage_id}="
-                    f"{stage_evt.stage_handoff_time_ms:.3f}ms"
+                    f"{stage_evt.stage_id}->{stage_evt.handoff_to_stage_id}={stage_evt.stage_handoff_time_ms:.3f}ms"
                 )
                 if stage_evt.ar2diffusion_time_ms > 0.0:
                     handoff += f"(ar2diffusion={stage_evt.ar2diffusion_time_ms:.3f}ms)"
@@ -282,9 +281,7 @@ def _build_timing_breakdown_lines(self, summary: dict[str, Any]) -> list[str]:
             )
 
         edge_handoff_fields = sorted(
-            k
-            for k in summary
-            if k.startswith("stage_") and "_to_" in k and k.endswith("_handoff_time_ms")
+            k for k in summary if k.startswith("stage_") and "_to_" in k and k.endswith("_handoff_time_ms")
         )
         for handoff_field in edge_handoff_fields:
             edge_name = handoff_field.removesuffix("_handoff_time_ms")
@@ -700,9 +697,7 @@ def build_and_log_summary(self) -> dict[str, Any]:
             "avg_stage_handoff_total_time_ms": float(
                 stage_handoff_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
             ),
-            "avg_ar2diffusion_time_ms": float(
-                ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
-            ),
+            "avg_ar2diffusion_time_ms": float(ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
             "avg_final_output_overhead_time_ms": float(
                 final_output_overhead_ms / self.e2e_count if self.e2e_count > 0 else 0.0
             ),

From 35283eccfebba6901c151ce5fd23306837d0ae0b Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Fri, 24 Apr 2026 16:13:09 +0800
Subject: [PATCH 06/11] Address metrics review comments

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 docs/contributing/metrics.md       | 148 +++++------
 tests/metrics/test_stats.py        |  16 +-
 vllm_omni/engine/orchestrator.py   |  19 +-
 vllm_omni/entrypoints/omni_base.py |  30 ++-
 vllm_omni/metrics/stats.py         | 395 +++++++++++++++++++----------
 5 files changed, 386 insertions(+), 222 deletions(-)

diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md
index 76978da9aa7..ca7f596dda2 100644
--- a/docs/contributing/metrics.md
+++ b/docs/contributing/metrics.md
@@ -3,7 +3,7 @@
 
 You can use these metrics in production to monitor the health and performance of the vLLM-omni system. Typical scenarios include:
 
-- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `request_wall_time_ms` / `engine_pipeline_time_ms`), and resource utilization to verify that the system meets expected standards.
+- **Performance Monitoring**: Track throughput (e.g., `avg_tokens_per_s`), latency (e.g., `request_wall_time_ms` / `engine_pipeline_time_ms`), and resource utilization to verify that the system meets expected standards.
 
 - **Debugging and Troubleshooting**: Use detailed per-request metrics to diagnose issues, such as high transfer times or unexpected token counts.
 
@@ -32,74 +32,68 @@ For multi-stage pipelines, vLLM-Omni also emits a concise per-request timing lin
 ```
 
 
-#### Overall Summary
+#### Omni Metrics Summary
 
-| Field                       | Value        |
-|-----------------------------|--------------|
-| e2e_requests                | 1            |
-| request_wall_time_ms        | 41,299.190   |
-| input_preprocess_time_ms    |     57.000   |
-| engine_pipeline_time_ms     | 41,299.133   |
-| e2e_total_tokens            | 5,202        |
-| avg_request_wall_time_ms    | 41,299.190   |
-| e2e_avg_tokens_per_s        | 125.959      |
-| e2e_stage_0_wall_time_ms    | 10,192.289   |
-| e2e_stage_1_wall_time_ms    | 30,541.409   |
-| e2e_stage_2_wall_time_ms    |    207.496   |
-
-#### RequestE2EStats
-
-| Field                   | Value      |
-|-------------------------|------------|
-| request_wall_time_ms    | 41,356.133 |
-| input_preprocess_time_ms |    57.000 |
-| engine_pipeline_time_ms | 41,299.133 |
-| e2e_total_tokens        | 5,202      |
-| transfers_total_time_ms | 245.895    |
-| transfers_total_kbytes  | 138,089.939|
-
-#### StageRequestStats
-
-| Field                  | 0      | 1      | 2      |
-|------------------------|--------|--------|--------|
-| audio_generated_frames | 0      | 0      | 525,525|
-| batch_id               | 38     | 274    | 0      |
-| batch_size             | 1      | 1      | 1      |
-| num_tokens_in          | 4,860  | 4,826  | 4,384  |
-| num_tokens_out         | 67     | 275    | 0      |
-| postprocess_time_ms    | 256.158| 0.491  | 0.000  |
-| stage_gen_time_ms      | 9,910.007|30,379.198|160.745|
-
-#### TransferEdgeStats
-
-| Field               | 0->1        | 1->2       |
-|---------------------|-------------|------------|
-| size_kbytes         | 109,277.349 | 28,812.591 |
-| tx_time_ms          | 78.701      | 18.790     |
-| rx_decode_time_ms   | 111.865     | 31.706     |
-| in_flight_time_ms   | 2.015       | 2.819      |
-
-
-These logs include:
-
-- **Overall summary**: total requests, wall time, average tokens/sec, etc.
-
-- **E2E table**: per-request latency and token counts.
-
-- **Stage table**: per-stage batch and timing details.
+```text
+============ Omni Metrics Summary ============
+Successful requests:                                  1
+Total E2E time (ms):                         41,356.133
+Input preprocess time (ms):                      57.000
+Engine pipeline time (ms):                   41,299.133
+Sum check (ms):                              41,356.133
+
+------------ Overall Time Breakdown ------------
+Input preprocess time (ms):                      57.000
+Stage 0 generation time (ms):                 9,910.007
+Stage 0 output processor time (ms):               7.000
+Stage 0 -> Stage 1 handoff time (ms):           245.895
+Stage 1 generation time (ms):                30,379.198
+Stage 1 output processor time (ms):              10.000
+Final output time (ms):                         747.033
+Component sum (ms):                          41,356.133
+E2E - component sum (ms):                         0.000
+
+------------ Stage 0 Breakdown ------------
+Stage generation time (ms):                  9,910.007
+Output processor time (ms):                      7.000
+Stage sum check (ms):                        9,917.007
+
+Stage id:                                            0
+Stage name:                                         ar
+Stage type:                                        llm
+Final output type:                                text
+Batch id:                                           38
+Batch size:                                          1
+
+Input tokens:                                    4,860
+Output tokens:                                      67
+Output token throughput (tok/s):                 6.761
+Postprocess time (ms):                         256.158
+
+------------ Stage 0 -> Stage 1 Handoff ------------
+Handoff total time (ms):                       245.895
+AR to diffusion time (ms):                      53.314
+Other handoff processing time (ms):            192.581
+
+------------ Final Output Breakdown ------------
+Final output wrapping time (ms):                160.745
+Final orchestration overhead time (ms):         586.288
+Final output total time (ms):                   747.033
+Final output sum check (ms):                    747.033
+```
 
-- **Transfer table**: data transfer and timing for each edge.
+These logs include a high-level end-to-end summary, a non-overlapping stage/time breakdown, and per-stage details from `StageRequestStats`. The final output layer includes output wrapping plus the remaining orchestration overhead so the top-level component sum can be checked directly against E2E time.
 
 You can use these logs to monitor system health, debug performance, and analyze request-level metrics as described above.
 
 
 ## Metrics Scope: Offline vs Online Inference
 
-For **offline inference** (batch mode), the summary includes both system-level metrics (aggregated across all requests) and per-request metrics. In this case, `e2e_requests` can be greater than 1, reflecting multiple completed requests in a batch.
+For **offline inference** (batch mode), the summary includes both system-level metrics (aggregated across all requests) and per-request metrics. In this case, `num_of_requests` can be greater than 1, reflecting multiple completed requests in a batch.
 
-For **online inference** (serving mode), the summary is always per-request. `e2e_requests` is always 1, and only request-level metrics are reported for each completion.
+For **online inference** (serving mode), the summary is always per-request. `num_of_requests` is always 1, and only request-level metrics are reported for each completion.
 
-When `e2e_requests` is 1, average fields are omitted from the printed `Overall Summary` because they are identical to the total/request-level values. They remain meaningful for offline batches with multiple completed requests.
+When `num_of_requests` is 1, average fields are omitted from the returned overall summary because they are identical to the total/request-level values. They remain meaningful for offline batches with multiple completed requests.
 
 ---
 
@@ -109,34 +103,40 @@ When `e2e_requests` is 1, average fields are omitted from the printed `Overall S
 
 | Field                     | Meaning                                                                                       |
 |---------------------------|----------------------------------------------------------------------------------------------|
-| `e2e_requests`            | Number of completed requests.                                                                |
+| `num_of_requests`         | Number of completed requests.                                                                |
 | `request_wall_time_ms`        | Wall-clock time span from request preparation start to final completion, in ms.          |
 | `input_preprocess_time_ms` | Time spent preparing and submitting requests before the engine pipeline starts.             |
 | `engine_pipeline_time_ms` | Time from engine request submission to final completion.                                     |
 | `stage_gen_total_time_ms` | Sum of all stage `stage_gen_time_ms` values.                                                 |
+| `output_processor_total_time_ms` | Sum of time spent in per-stage output processors after raw engine outputs are returned. |
 | `stage_handoff_total_time_ms` | Sum of inter-stage handoff time, measured after an upstream stage finishes and before the downstream stage is submitted. |
 | `ar2diffusion_total_time_ms` | Subset of `stage_handoff_total_time_ms` spent converting AR outputs into diffusion inputs. |
-| `final_output_overhead_time_ms` | Final engine-side overhead not covered by stage generation or inter-stage handoff.       |
+| `final_output_total_time_ms` | Measured time spent wrapping final stage outputs into `OmniRequestOutput` objects before yielding them. |
+| `breakdown_delta_time_ms` | Difference between E2E wall time and the printed component sum; useful for detecting missing timers or overlap. |
 | `stage_{i}_to_{j}_handoff_time_ms` | Handoff time for a specific stage edge, e.g. `stage_0_to_1_handoff_time_ms`.       |
 | `stage_{i}_to_{j}_ar2diffusion_time_ms` | AR-to-diffusion conversion time included in that edge's handoff time.           |
-| `e2e_total_tokens`        | Total tokens counted across all completed requests (stage0 input + all stage outputs).       |
-| `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / e2e_requests`.                     |
+| `total_tokens`            | Total tokens counted across all completed requests (stage0 input + all stage outputs).       |
+| `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / num_of_requests`.                  |
 | `avg_input_preprocess_time_ms` | Average pre-submit request preparation time per completed request.                    |
 | `avg_engine_pipeline_time_ms` | Average engine pipeline time per completed request.                                      |
 | `avg_stage_gen_total_time_ms` | Average summed stage generation time per completed request.                              |
+| `avg_output_processor_time_ms` | Average output processor time per completed request.                                   |
 | `avg_stage_handoff_total_time_ms` | Average summed inter-stage handoff time per completed request.                    |
 | `avg_ar2diffusion_time_ms` | Average AR-to-diffusion conversion time per completed request.                             |
-| `avg_final_output_overhead_time_ms` | Average final engine-side overhead per completed request.                         |
-| `e2e_avg_tokens_per_s`    | Average token throughput over wall time: `e2e_total_tokens * 1000 / request_wall_time_ms`.  |
-| `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. |
+| `avg_final_output_time_ms` | Average final output wrapping time per completed request.                                  |
+| `avg_breakdown_delta_time_ms` | Average difference between E2E wall time and the printed component sum per completed request. |
+| `avg_tokens_per_s`        | Average token throughput over wall time: `total_tokens * 1000 / request_wall_time_ms`.      |
+| `stage_{i}_wall_time_ms`  | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `stage_0_wall_time_ms`, `stage_1_wall_time_ms`, etc. |
 
 ### Timing Breakdown
 
-The printed summary also includes a `Timing Breakdown` section that shows the containment relationship:
+The printed summary includes sum checks that show the containment relationship:
 
 `request_wall_time_ms = input_preprocess_time_ms + engine_pipeline_time_ms`
 
-`engine_pipeline_time_ms = stage_gen_total_time_ms + stage_handoff_total_time_ms + final_output_overhead_time_ms`
+`request_wall_time_ms ~= input_preprocess_time_ms + sum(stage_gen_time_ms) + sum(output_processor_time_ms) + sum(stage_handoff_time_ms) + final_output_total_time_ms`
+
+Any remaining difference is reported as `breakdown_delta_time_ms`.
 
 `ar2diffusion_total_time_ms` is included in `stage_handoff_total_time_ms`. For a concrete AR-to-diffusion edge, `stage_0_to_1_ar2diffusion_time_ms` is included in `stage_0_to_1_handoff_time_ms`.
 
@@ -149,9 +149,10 @@ The printed summary also includes a `Timing Breakdown` section that shows the co
 | `request_wall_time_ms`    | End-to-end latency in ms, including input preprocessing and engine pipeline time.            |
 | `input_preprocess_time_ms` | Time spent preparing and submitting the request before `engine_pipeline_time_ms` starts.    |
 | `engine_pipeline_time_ms` | Time from engine request submission to final completion.                                     |
-| `e2e_total_tokens`        | Total tokens for the request (stage0 input + all stage outputs).      |
+| `total_tokens`            | Total tokens for the request (stage0 input + all stage outputs).      |
 | `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request.                |
 | `transfers_total_kbytes`  | Sum of transfer kbytes for this request.                              |
+| `final_output_time_ms`    | Time spent wrapping the final engine output into an `OmniRequestOutput` before yielding it. |
 
 
 ---
@@ -164,7 +165,8 @@ The printed summary also includes a `Timing Breakdown` section that shows the co
 | `batch_size`              | Batch size.                                                                                     |
 | `num_tokens_in`           | Input tokens to the stage.                                                                      |
 | `num_tokens_out`          | Output tokens from the stage.                                                                   |
-| `stage_gen_time_ms`       | Stage compute time in ms, excluding postprocessing time (reported separately as `postprocess_time_ms`). |
+| `stage_gen_time_ms`       | Stage compute time in ms, excluding output processor time and postprocessing time.             |
+| `output_processor_time_ms` | Time spent in the stage output processor after raw engine outputs are returned.              |
 | `image_num`               | Number of images generated (for diffusion/image stages).                                        |
 | `resolution`              | Image resolution (for diffusion/image stages).                                                                  |
 | `postprocess_time_ms` | Diffusion/image: post-processing time in ms.                                                    |
@@ -185,20 +187,20 @@ The printed summary also includes a `Timing Breakdown` section that shows the co
 
 **Formulas:**
 
-- `e2e_total_tokens = Stage0's num_tokens_in + sum(all stages' num_tokens_out)`
+- `total_tokens = Stage0's num_tokens_in + sum(all stages' num_tokens_out)`
 
 - `transfers_total_time_ms = sum(tx_time_ms + rx_decode_time_ms + in_flight_time_ms)` for every edge
 
 **Using the example above:**
 
-**e2e_total_tokens**
+**total_tokens**
 
 - Stage0's `num_tokens_in`: **4,860**
 - Stage0's `num_tokens_out`: **67**
 - Stage1's `num_tokens_out`: **275**
 - Stage2's `num_tokens_out`: **0**
 
-so `e2e_total_tokens = 4,860 + 67 + 275 + 0 = 5,202`, which matches the table value `e2e_total_tokens`.
+so `total_tokens = 4,860 + 67 + 275 + 0 = 5,202`, which matches the table value `total_tokens`.
 
 **transfers_total_time_ms**
 
diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py
index 1f01d6f40c9..3bb722b732d 100644
--- a/tests/metrics/test_stats.py
+++ b/tests/metrics/test_stats.py
@@ -15,7 +15,7 @@ def _get_request_entry(table: list[dict], request_id: str) -> dict:
     raise AssertionError(f"request_id={request_id} not found")
 
 
-def test_orchestrator_aggregator_builds_summary() -> None:
+def test_orchestrator_aggregator_builds_summary(monkeypatch: pytest.MonkeyPatch) -> None:
     agg = OrchestratorAggregator(num_stages=2, log_stats=True, wall_start_ts=0.0, final_stage_id_for_e2e=1)
     agg.stage_first_ts[0] = 0.0
     agg.stage_last_ts[0] = 0.03
@@ -40,6 +40,7 @@ def test_orchestrator_aggregator_builds_summary() -> None:
             handoff_to_stage_id=1,
             stage_handoff_time_ms=20.0,
             ar2diffusion_time_ms=12.5,
+            output_processor_time_ms=7.0,
         ),
     )
     agg.on_stage_metrics(
@@ -57,15 +58,19 @@ def test_orchestrator_aggregator_builds_summary() -> None:
             stage_stats=StageStats(),
         ),
     )
+    agg.record_final_output_time("r1", 3.0)
+    monkeypatch.setattr("vllm_omni.metrics.stats.time.time", lambda: 0.07)
     agg.on_finalize_request(1, "r1", req_start_ts=0.0)
 
     summary = agg.build_and_log_summary()
     overall = summary["overall_summary"]
-    assert overall["e2e_requests"] == 1
+    assert overall["num_of_requests"] == 1
     assert overall["request_wall_time_ms"] == 70.0
     assert overall["input_preprocess_time_ms"] == 0.0
     assert overall["stage_gen_total_time_ms"] == 50.0
+    assert overall["output_processor_total_time_ms"] == 7.0
     assert overall["stage_handoff_total_time_ms"] == 20.0
+    assert overall["final_output_total_time_ms"] == 3.0
     assert overall["stage_0_to_1_handoff_time_ms"] == 20.0
     assert overall["ar2diffusion_total_time_ms"] == 12.5
     assert overall["stage_0_to_1_ar2diffusion_time_ms"] == 12.5
@@ -81,7 +86,8 @@ def test_orchestrator_aggregator_builds_summary() -> None:
     assert transfer_entry["transfers"][0]["size_kbytes"] == 1.0
 
     e2e_entry = _get_request_entry(summary["e2e_table"], "r1")
-    assert e2e_entry["e2e_total_tokens"] == 10
+    assert e2e_entry["total_tokens"] == 10
+    assert e2e_entry["final_output_time_ms"] == 3.0
 
 
 def test_build_and_log_summary_e2e_only() -> None:
@@ -92,7 +98,7 @@ def test_build_and_log_summary_e2e_only() -> None:
             request_wall_time_ms=10.0,
             input_preprocess_time_ms=0.0,
             engine_pipeline_time_ms=10.0,
-            e2e_total_tokens=5,
+            total_tokens=5,
             transfers_total_time_ms=0.0,
             transfers_total_bytes=0,
         )
@@ -102,7 +108,7 @@ def test_build_and_log_summary_e2e_only() -> None:
     e2e_entry = _get_request_entry(summary["e2e_table"], "r")
     assert e2e_entry["request_wall_time_ms"] == 10.0
     assert e2e_entry["engine_pipeline_time_ms"] == 10.0
-    assert e2e_entry["e2e_total_tokens"] == 5
+    assert e2e_entry["total_tokens"] == 5
     stage_entry = _get_request_entry(summary["stage_table"], "r")
     assert stage_entry["stages"] == []
 
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index f61f6346a12..76a3170c214 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -109,6 +109,7 @@ class OrchestratorRequestState:
 
     # Metrics: timestamp when request was submitted to each stage
     stage_submit_ts: dict[int, float] = field(default_factory=dict)
+    output_processor_time_ms: dict[int, float] = field(default_factory=dict)
     mm_processor_kwargs: dict | None = None
     mm_features: list | None = None
 
@@ -597,7 +598,9 @@ def _build_stage_metrics(
         """
         now = _time.time()
         submit_ts = req_state.stage_submit_ts.get(stage_id, now)
-        stage_gen_time_ms = (now - submit_ts) * 1000.0
+        output_processor_time_ms = float(req_state.output_processor_time_ms.get(stage_id, 0.0))
+        stage_wall_time_ms = (now - submit_ts) * 1000.0
+        stage_gen_time_ms = max(0.0, stage_wall_time_ms - output_processor_time_ms)
 
         num_tokens_out = count_tokens_from_outputs(request_outputs)
         num_tokens_in = 0
@@ -619,6 +622,7 @@ def _build_stage_metrics(
             num_tokens_in=num_tokens_in,
             num_tokens_out=num_tokens_out,
             stage_gen_time_ms=stage_gen_time_ms,
+            output_processor_time_ms=output_processor_time_ms,
             batch_id=batch_id,
             batch_size=1,
             rx_decode_time_ms=0.0,
@@ -862,11 +866,24 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut
         """
         processor = self.output_processors[stage_id]
 
+        output_processor_start = _time.perf_counter()
         processed = processor.process_outputs(
             raw_outputs.outputs,
             raw_outputs.timestamp,
             None,
         )
+        output_processor_time_ms = (_time.perf_counter() - output_processor_start) * 1000.0
+        processed_request_ids = {output.request_id for output in processed.request_outputs}
+        if processed_request_ids:
+            # process_outputs is batch-scoped, so attribute the batch cost evenly.
+            per_request_time_ms = output_processor_time_ms / len(processed_request_ids)
+            for request_id in processed_request_ids:
+                req_state = self.request_states.get(request_id)
+                if req_state is not None:
+                    req_state.output_processor_time_ms[stage_id] = (
+                        req_state.output_processor_time_ms.get(stage_id, 0.0) + per_request_time_ms
+                    )
+
         for eco in raw_outputs.outputs:
             if not hasattr(eco, "request_id"):
                 continue
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 1cf359f7c08..11cba2ee23c 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -396,20 +396,9 @@ def _process_single_result(
         if not stage_meta["final_output"]:
             return None
 
-        try:
-            rid_key = str(req_id)
-            if stage_id == final_stage_id_for_e2e and rid_key not in metrics.e2e_done and finished:
-                metrics.on_finalize_request(
-                    stage_id,
-                    req_id,
-                    req_start_ts.get(req_id, wall_start_ts),
-                    input_preprocess_time_ms=input_preprocess_time_ms.get(req_id, 0.0),
-                )
-        except Exception:
-            logger.exception("[%s] Finalize request handling error", self.__class__.__name__)
-
+        final_output_start = time.perf_counter()
         images = getattr(engine_outputs, "images", []) if stage_meta["final_output_type"] == "image" else []
-        return OmniRequestOutput(
+        output_to_yield = OmniRequestOutput(
             request_id=req_id or "",
             stage_id=stage_id,
             final_output_type=stage_meta["final_output_type"],
@@ -423,6 +412,21 @@ def _process_single_result(
             stage_durations=stage_durations,
             peak_memory_mb=peak_memory_mb,
         )
+        metrics.record_final_output_time(req_id, (time.perf_counter() - final_output_start) * 1000.0)
+
+        try:
+            rid_key = str(req_id)
+            if stage_id == final_stage_id_for_e2e and rid_key not in metrics.e2e_done and finished:
+                metrics.on_finalize_request(
+                    stage_id,
+                    req_id,
+                    req_start_ts.get(req_id, wall_start_ts),
+                    input_preprocess_time_ms=input_preprocess_time_ms.get(req_id, 0.0),
+                )
+        except Exception:
+            logger.exception("[%s] Finalize request handling error", self.__class__.__name__)
+
+        return output_to_yield
 
     def shutdown(self) -> None:
         logger.info("[%s] Shutting down", self.__class__.__name__)
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index 3e4f0f6bd7e..cb73949ff3a 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -9,7 +9,7 @@
 
 from vllm.logger import init_logger
 
-from vllm_omni.metrics.utils import _build_field_defs, _build_row, _format_table
+from vllm_omni.metrics.utils import _build_field_defs, _build_row
 
 logger = init_logger(__name__)
 
@@ -41,6 +41,7 @@ class StageRequestStats:
     final_output_type: str | None = None
     request_id: str | None = None
     postprocess_time_ms: float = 0.0
+    output_processor_time_ms: float = 0.0
     diffusion_metrics: dict[str, int] = None
     audio_generated_frames: int = 0
     stage_end_ts: float | None = None
@@ -83,13 +84,14 @@ class RequestE2EStats:
     request_wall_time_ms: float
     input_preprocess_time_ms: float
     engine_pipeline_time_ms: float
-    e2e_total_tokens: int
+    total_tokens: int
     transfers_total_time_ms: float
     transfers_total_bytes: int
+    final_output_time_ms: float = 0.0
 
     @property
     def e2e_tpt(self) -> float:
-        return (self.engine_pipeline_time_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0
+        return (self.engine_pipeline_time_ms / self.total_tokens) if self.total_tokens > 0 else 0.0
 
 
 # === Field Configuration ===
@@ -119,23 +121,27 @@ def e2e_tpt(self) -> float:
 
 # Decide the order of overall summary fields, or None for auto
 OVERALL_FIELDS: list[str] | None = [
-    "e2e_requests",
+    "num_of_requests",
     "request_wall_time_ms",
     "input_preprocess_time_ms",
     "engine_pipeline_time_ms",
     "stage_gen_total_time_ms",
+    "output_processor_total_time_ms",
     "stage_handoff_total_time_ms",
     "ar2diffusion_total_time_ms",
-    "final_output_overhead_time_ms",
-    "e2e_total_tokens",
+    "final_output_total_time_ms",
+    "breakdown_delta_time_ms",
+    "total_tokens",
     "avg_request_wall_time_ms",
     "avg_input_preprocess_time_ms",
     "avg_engine_pipeline_time_ms",
     "avg_stage_gen_total_time_ms",
+    "avg_output_processor_time_ms",
     "avg_stage_handoff_total_time_ms",
     "avg_ar2diffusion_time_ms",
-    "avg_final_output_overhead_time_ms",
-    "e2e_avg_tokens_per_s",
+    "avg_final_output_time_ms",
+    "avg_breakdown_delta_time_ms",
+    "avg_tokens_per_s",
 ]
 STAGE_FIELDS = _build_field_defs(StageRequestStats, STAGE_EXCLUDE, FIELD_TRANSFORMS)
 TRANSFER_FIELDS = _build_field_defs(TransferEdgeStats, TRANSFER_EXCLUDE, FIELD_TRANSFORMS)
@@ -167,7 +173,8 @@ def init_run_state(self, wall_start_ts: float) -> None:
         self.stage_total_tokens = [0 for _ in range(self.num_stages)]
         self.engine_pipeline_total_ms = 0.0
         self.input_preprocess_total_ms = 0.0
-        self.e2e_total_tokens = 0
+        self.final_output_total_ms = 0.0
+        self.total_tokens = 0
         self.e2e_count = 0
         self.e2e_done = set()
         self.wall_start_ts = float(wall_start_ts)
@@ -180,6 +187,7 @@ def init_run_state(self, wall_start_ts: float) -> None:
         self.diffusion_metrics: defaultdict[str, defaultdict[str, float]] = defaultdict(
             lambda: defaultdict(float)
         )  # {request_id: {diffusion_metrics_key: accumulated_metrics_data}}
+        self.final_output_time_ms: defaultdict[str, float] = defaultdict(float)
 
     def _get_stage_metadata(self, stage_id: int | None) -> dict[str, Any]:
         if stage_id is None or stage_id < 0 or stage_id >= len(self.stage_metadata):
@@ -253,43 +261,212 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None:
             ",".join(transfers),
         )
 
-    def _build_timing_breakdown_lines(self, summary: dict[str, Any]) -> list[str]:
-        lines = [
-            "[Timing Breakdown]",
-            (
-                "request_wall_time_ms = input_preprocess_time_ms + engine_pipeline_time_ms "
-                f"= {self._format_ms(summary.get('input_preprocess_time_ms', 0.0))} + "
-                f"{self._format_ms(summary.get('engine_pipeline_time_ms', 0.0))} "
-                f"= {self._format_ms(summary.get('request_wall_time_ms', 0.0))}"
-            ),
-            (
-                "engine_pipeline_time_ms = stage_gen_total_time_ms + "
-                "stage_handoff_total_time_ms + final_output_overhead_time_ms "
-                f"= {self._format_ms(summary.get('stage_gen_total_time_ms', 0.0))} + "
-                f"{self._format_ms(summary.get('stage_handoff_total_time_ms', 0.0))} + "
-                f"{self._format_ms(summary.get('final_output_overhead_time_ms', 0.0))} "
-                f"= {self._format_ms(summary.get('engine_pipeline_time_ms', 0.0))}"
-            ),
-        ]
+    @staticmethod
+    def _summary_line(label: str, value: Any) -> str:
+        if isinstance(value, bool):
+            text = str(value).lower()
+        elif isinstance(value, int):
+            text = f"{value:d}"
+        elif isinstance(value, float):
+            text = f"{value:,.3f}"
+        else:
+            text = str(value)
+        return f"{label:<42}{text:>16}"
 
-        ar2diffusion_total_ms = float(summary.get("ar2diffusion_total_time_ms", 0.0) or 0.0)
-        if ar2diffusion_total_ms > 0.0:
-            lines.append(
-                "ar2diffusion_total_time_ms is included in stage_handoff_total_time_ms: "
-                f"{self._format_ms(ar2diffusion_total_ms)} <= "
-                f"{self._format_ms(summary.get('stage_handoff_total_time_ms', 0.0))}"
+    @staticmethod
+    def _summary_header(title: str, char: str = "=") -> str:
+        return f"{char * 12} {title} {char * 12}"
+
+    def _append_stage_info_lines(self, lines: list[str], evt: StageRequestStats) -> None:
+        lines.extend(
+            [
+                self._summary_line("Stage id:", evt.stage_id if evt.stage_id is not None else "unknown"),
+                self._summary_line("Stage name:", evt.stage_name or ""),
+                self._summary_line("Stage type:", evt.stage_type or ""),
+                self._summary_line("Final output type:", evt.final_output_type or ""),
+                self._summary_line("Batch id:", evt.batch_id),
+                self._summary_line("Batch size:", evt.batch_size),
+            ]
+        )
+        if evt.num_tokens_in > 0 or evt.num_tokens_out > 0:
+            lines.extend(
+                [
+                    "",
+                    self._summary_line("Input tokens:", evt.num_tokens_in),
+                    self._summary_line("Output tokens:", evt.num_tokens_out),
+                    self._summary_line("Output token throughput (tok/s):", evt.tokens_per_s),
+                ]
+            )
+        if evt.postprocess_time_ms > 0.0:
+            lines.extend(
+                [
+                    "",
+                    self._summary_line("Postprocess time (ms):", evt.postprocess_time_ms),
+                ]
             )
+        if evt.diffusion_metrics:
+            lines.append("")
+            for key, value in sorted(evt.diffusion_metrics.items()):
+                lines.append(self._summary_line(f"{key}:", value))
+
+    def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> list[str]:
+        lines = [self._summary_header("Omni Metrics Summary")]
+        lines.extend(
+            [
+                self._summary_line("Successful requests:", int(overall_summary.get("num_of_requests", 0))),
+                self._summary_line("Total E2E time (ms):", float(overall_summary.get("request_wall_time_ms", 0.0))),
+                self._summary_line(
+                    "Input preprocess time (ms):",
+                    float(overall_summary.get("input_preprocess_time_ms", 0.0)),
+                ),
+                self._summary_line(
+                    "Engine pipeline time (ms):",
+                    float(overall_summary.get("engine_pipeline_time_ms", 0.0)),
+                ),
+                self._summary_line(
+                    "Sum check (ms):",
+                    float(overall_summary.get("input_preprocess_time_ms", 0.0))
+                    + float(overall_summary.get("engine_pipeline_time_ms", 0.0)),
+                ),
+                "",
+                self._summary_header("Overall Time Breakdown", "-"),
+                self._summary_line(
+                    "Input preprocess time (ms):",
+                    float(overall_summary.get("input_preprocess_time_ms", 0.0)),
+                ),
+            ]
+        )
 
-        edge_handoff_fields = sorted(
-            k for k in summary if k.startswith("stage_") and "_to_" in k and k.endswith("_handoff_time_ms")
+        component_sum = float(overall_summary.get("input_preprocess_time_ms", 0.0))
+        stage_ids = sorted(
+            {
+                evt.stage_id
+                for stage_evts in self.stage_events.values()
+                for evt in stage_evts
+                if evt.stage_id is not None
+            }
         )
-        for handoff_field in edge_handoff_fields:
-            edge_name = handoff_field.removesuffix("_handoff_time_ms")
-            ar2d_field = f"{edge_name}_ar2diffusion_time_ms"
-            if ar2d_field in summary and float(summary.get(ar2d_field, 0.0) or 0.0) > 0.0:
-                lines.append(
-                    f"{ar2d_field} is included in {handoff_field}: "
-                    f"{self._format_ms(summary[ar2d_field])} <= {self._format_ms(summary[handoff_field])}"
+        for stage_id in stage_ids:
+            stage_gen_ms = sum(
+                float(evt.stage_gen_time_ms or 0.0)
+                for stage_evts in self.stage_events.values()
+                for evt in stage_evts
+                if evt.stage_id == stage_id
+            )
+            output_processor_ms = sum(
+                float(evt.output_processor_time_ms or 0.0)
+                for stage_evts in self.stage_events.values()
+                for evt in stage_evts
+                if evt.stage_id == stage_id
+            )
+            lines.append(self._summary_line(f"Stage {stage_id} generation time (ms):", stage_gen_ms))
+            lines.append(self._summary_line(f"Stage {stage_id} output processor time (ms):", output_processor_ms))
+            component_sum += stage_gen_ms + output_processor_ms
+
+            handoff_ms = sum(
+                float(evt.stage_handoff_time_ms or 0.0)
+                for stage_evts in self.stage_events.values()
+                for evt in stage_evts
+                if evt.stage_id == stage_id
+            )
+            if handoff_ms > 0.0:
+                to_stage_ids = sorted(
+                    {
+                        evt.handoff_to_stage_id
+                        for stage_evts in self.stage_events.values()
+                        for evt in stage_evts
+                        if evt.stage_id == stage_id and evt.handoff_to_stage_id is not None
+                    }
+                )
+                to_stage = to_stage_ids[0] if len(to_stage_ids) == 1 else "next"
+                lines.append(self._summary_line(f"Stage {stage_id} -> Stage {to_stage} handoff time (ms):", handoff_ms))
+                component_sum += handoff_ms
+
+        final_output_ms = float(overall_summary.get("final_output_total_time_ms", 0.0))
+        final_orchestration_ms = float(overall_summary.get("breakdown_delta_time_ms", 0.0))
+        final_output_bucket_ms = final_output_ms + final_orchestration_ms
+        lines.extend(
+            [
+                self._summary_line("Final output time (ms):", final_output_bucket_ms),
+                self._summary_line("Component sum (ms):", component_sum + final_output_bucket_ms),
+                self._summary_line(
+                    "E2E - component sum (ms):",
+                    float(overall_summary.get("request_wall_time_ms", 0.0)) - component_sum - final_output_bucket_ms,
+                ),
+            ]
+        )
+
+        for rid in sorted(set(self.stage_events.keys()) | {evt.request_id for evt in self.e2e_events}):
+            stage_evts = sorted(
+                self.stage_events.get(rid, []),
+                key=lambda evt: evt.stage_id if evt.stage_id is not None else -1,
+            )
+            for evt in stage_evts:
+                stage_id = evt.stage_id if evt.stage_id is not None else "unknown"
+                lines.extend(
+                    [
+                        "",
+                        self._summary_header(f"Stage {stage_id} Breakdown", "-"),
+                        self._summary_line("Stage generation time (ms):", float(evt.stage_gen_time_ms or 0.0)),
+                        self._summary_line(
+                            "Output processor time (ms):",
+                            float(evt.output_processor_time_ms or 0.0),
+                        ),
+                        self._summary_line(
+                            "Stage sum check (ms):",
+                            float(evt.stage_gen_time_ms or 0.0) + float(evt.output_processor_time_ms or 0.0),
+                        ),
+                        "",
+                    ]
+                )
+                self._append_stage_info_lines(lines, evt)
+
+                if evt.handoff_to_stage_id is not None and evt.stage_handoff_time_ms > 0.0:
+                    ar2diffusion_ms = float(evt.ar2diffusion_time_ms or 0.0)
+                    lines.extend(
+                        [
+                            "",
+                            self._summary_header(
+                                f"Stage {stage_id} -> Stage {evt.handoff_to_stage_id} Handoff",
+                                "-",
+                            ),
+                            self._summary_line("Handoff total time (ms):", float(evt.stage_handoff_time_ms)),
+                            self._summary_line("AR to diffusion time (ms):", ar2diffusion_ms),
+                            self._summary_line(
+                                "Other handoff processing time (ms):",
+                                float(evt.stage_handoff_time_ms) - ar2diffusion_ms,
+                            ),
+                        ]
+                    )
+
+            final_output_ms = float(self.final_output_time_ms.get(rid, 0.0))
+            e2e_evt = next((evt for evt in self.e2e_events if evt.request_id == rid), None)
+            if e2e_evt is None:
+                final_orchestration_ms = 0.0
+            else:
+                stage_component_ms = sum(
+                    float(evt.stage_gen_time_ms or 0.0)
+                    + float(evt.output_processor_time_ms or 0.0)
+                    + float(evt.stage_handoff_time_ms or 0.0)
+                    for evt in stage_evts
+                )
+                final_orchestration_ms = (
+                    float(e2e_evt.request_wall_time_ms)
+                    - float(e2e_evt.input_preprocess_time_ms)
+                    - stage_component_ms
+                    - final_output_ms
+                )
+            final_output_bucket_ms = final_output_ms + final_orchestration_ms
+            if final_output_bucket_ms != 0.0:
+                lines.extend(
+                    [
+                        "",
+                        self._summary_header("Final Output Breakdown", "-"),
+                        self._summary_line("Final output wrapping time (ms):", final_output_ms),
+                        self._summary_line("Final orchestration overhead time (ms):", final_orchestration_ms),
+                        self._summary_line("Final output total time (ms):", final_output_bucket_ms),
+                        self._summary_line("Final output sum check (ms):", final_output_bucket_ms),
+                    ]
                 )
 
         return lines
@@ -520,6 +697,9 @@ def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_tim
     def record_ar2diffusion_time(self, stage_id: int, req_id: Any, ar2diffusion_time_ms: float) -> None:
         self._update_stage_event_field(stage_id, req_id, "ar2diffusion_time_ms", ar2diffusion_time_ms)
 
+    def record_final_output_time(self, req_id: Any, final_output_time_ms: float) -> None:
+        self.final_output_time_ms[str(req_id)] += float(final_output_time_ms)
+
     def record_stage_handoff_time(
         self,
         from_stage: int,
@@ -604,6 +784,7 @@ def on_finalize_request(
         engine_pipeline_ms = (_t1 - _t0) * 1000.0
         input_preprocess_ms = float(input_preprocess_time_ms)
         request_wall_ms = engine_pipeline_ms + input_preprocess_ms
+        final_output_ms = float(self.final_output_time_ms.get(rid_key, 0.0))
 
         # Sum tokens from all stages for this request
         # Include input tokens from stage 0 + output tokens from all stages
@@ -616,7 +797,8 @@ def on_finalize_request(
 
         self.engine_pipeline_total_ms += engine_pipeline_ms
         self.input_preprocess_total_ms += input_preprocess_ms
-        self.e2e_total_tokens += total_tokens
+        self.final_output_total_ms += final_output_ms
+        self.total_tokens += total_tokens
         self.e2e_count += 1
         self.e2e_done.add(rid_key)
         per_req_record = RequestE2EStats(
@@ -624,13 +806,14 @@ def on_finalize_request(
             request_wall_time_ms=request_wall_ms,
             input_preprocess_time_ms=input_preprocess_ms,
             engine_pipeline_time_ms=engine_pipeline_ms,
-            e2e_total_tokens=total_tokens,
+            total_tokens=total_tokens,
             transfers_total_time_ms=float(
                 sum(evt.total_time_ms for evt in self.transfer_events.values() if evt.request_id == rid_key)
             ),
             transfers_total_bytes=int(
                 sum(evt.size_bytes for evt in self.transfer_events.values() if evt.request_id == rid_key)
             ),
+            final_output_time_ms=final_output_ms,
         )
         self.e2e_events.append(per_req_record)
 
@@ -642,7 +825,7 @@ def build_and_log_summary(self) -> dict[str, Any]:
             return {}
         wall_time_ms = max(0.0, (self.last_finish_ts - self.wall_start_ts) * 1000.0)
         e2e_avg_req = (wall_time_ms / self.e2e_count) if self.e2e_count > 0 else 0.0
-        e2e_avg_tok = (self.e2e_total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0
+        avg_tok = (self.total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0
 
         if isinstance(self.final_stage_id_for_e2e, int):
             final_stage_id_map: dict[str, int] = {"*": int(self.final_stage_id_for_e2e)}
@@ -656,6 +839,7 @@ def build_and_log_summary(self) -> dict[str, Any]:
             for i in range(self.num_stages)
         ]
         stage_gen_total_ms = 0.0
+        output_processor_total_ms = 0.0
         stage_handoff_total_ms = 0.0
         ar2diffusion_total_ms = 0.0
         handoff_edge_totals: defaultdict[str, float] = defaultdict(float)
@@ -663,6 +847,7 @@ def build_and_log_summary(self) -> dict[str, Any]:
         for stage_evts in self.stage_events.values():
             for evt in stage_evts:
                 stage_gen_total_ms += float(evt.stage_gen_time_ms or 0.0)
+                output_processor_total_ms += float(evt.output_processor_time_ms or 0.0)
                 handoff_ms = float(evt.stage_handoff_time_ms or 0.0)
                 ar2d_ms = float(evt.ar2diffusion_time_ms or 0.0)
                 stage_handoff_total_ms += handoff_ms
@@ -671,21 +856,27 @@ def build_and_log_summary(self) -> dict[str, Any]:
                     edge = f"stage_{evt.stage_id}_to_{evt.handoff_to_stage_id}"
                     handoff_edge_totals[edge] += handoff_ms
                     handoff_edge_ar2diffusion[edge] += ar2d_ms
-        final_output_overhead_ms = max(
-            0.0,
-            float(self.engine_pipeline_total_ms) - stage_gen_total_ms - stage_handoff_total_ms,
+        breakdown_delta_ms = (
+            float(wall_time_ms)
+            - float(self.input_preprocess_total_ms)
+            - stage_gen_total_ms
+            - output_processor_total_ms
+            - stage_handoff_total_ms
+            - float(self.final_output_total_ms)
         )
 
         overall_summary = {
-            "e2e_requests": int(self.e2e_count),
+            "num_of_requests": int(self.e2e_count),
             "request_wall_time_ms": float(wall_time_ms),
             "input_preprocess_time_ms": float(self.input_preprocess_total_ms),
             "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms),
             "stage_gen_total_time_ms": float(stage_gen_total_ms),
+            "output_processor_total_time_ms": float(output_processor_total_ms),
             "stage_handoff_total_time_ms": float(stage_handoff_total_ms),
             "ar2diffusion_total_time_ms": float(ar2diffusion_total_ms),
-            "final_output_overhead_time_ms": float(final_output_overhead_ms),
-            "e2e_total_tokens": int(self.e2e_total_tokens),
+            "final_output_total_time_ms": float(self.final_output_total_ms),
+            "breakdown_delta_time_ms": float(breakdown_delta_ms),
+            "total_tokens": int(self.total_tokens),
             "avg_request_wall_time_ms": float(e2e_avg_req),
             "avg_input_preprocess_time_ms": float(
                 self.input_preprocess_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
@@ -694,14 +885,18 @@ def build_and_log_summary(self) -> dict[str, Any]:
                 self.engine_pipeline_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
             ),
             "avg_stage_gen_total_time_ms": float(stage_gen_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
+            "avg_output_processor_time_ms": float(
+                output_processor_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
+            ),
             "avg_stage_handoff_total_time_ms": float(
                 stage_handoff_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
             ),
             "avg_ar2diffusion_time_ms": float(ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
-            "avg_final_output_overhead_time_ms": float(
-                final_output_overhead_ms / self.e2e_count if self.e2e_count > 0 else 0.0
+            "avg_final_output_time_ms": float(
+                self.final_output_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
             ),
-            "e2e_avg_tokens_per_s": float(e2e_avg_tok),
+            "avg_breakdown_delta_time_ms": float(breakdown_delta_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
+            "avg_tokens_per_s": float(avg_tok),
         }
         for edge, handoff_time in sorted(handoff_edge_totals.items()):
             overall_summary[f"{edge}_handoff_time_ms"] = float(handoff_time)
@@ -710,19 +905,18 @@ def build_and_log_summary(self) -> dict[str, Any]:
                 overall_summary[f"{edge}_ar2diffusion_time_ms"] = float(ar2d_time)
         # Add stage_wall_time_ms as separate fields for each stage
         for idx, wall_time in enumerate(stage_wall_time_ms):
-            overall_summary[f"e2e_stage_{idx}_wall_time_ms"] = wall_time
+            overall_summary[f"stage_{idx}_wall_time_ms"] = wall_time
 
-        # Print overall summary
-        # filter out all-zero fields for logging
-        overall_fields = []
         avg_fields = {
             "avg_request_wall_time_ms",
             "avg_input_preprocess_time_ms",
             "avg_engine_pipeline_time_ms",
             "avg_stage_gen_total_time_ms",
+            "avg_output_processor_time_ms",
             "avg_stage_handoff_total_time_ms",
             "avg_ar2diffusion_time_ms",
-            "avg_final_output_overhead_time_ms",
+            "avg_final_output_time_ms",
+            "avg_breakdown_delta_time_ms",
         }
         dynamic_overall_fields = [
             k
@@ -731,25 +925,21 @@ def build_and_log_summary(self) -> dict[str, Any]:
             and "_to_" in k
             and (k.endswith("_handoff_time_ms") or k.endswith("_ar2diffusion_time_ms"))
         ]
-        stage_wall_fields = [k for k in overall_summary if k.startswith("e2e_stage_") and k.endswith("_wall_time_ms")]
+        stage_wall_fields = [
+            k for k in overall_summary if k.startswith("stage_") and k.endswith("_wall_time_ms") and "_to_" not in k
+        ]
         ordered_overall_fields = list(OVERALL_FIELDS or [])
-        insert_at = ordered_overall_fields.index("e2e_total_tokens")
+        insert_at = ordered_overall_fields.index("total_tokens")
         ordered_overall_fields[insert_at:insert_at] = sorted(
             dynamic_overall_fields,
             key=lambda name: (name.replace("_ar2diffusion_time_ms", "_zz_ar2diffusion_time_ms")),
         ) + sorted(stage_wall_fields)
-        for k in ordered_overall_fields or list(overall_summary.keys()):
-            if self.e2e_count <= 1 and k in avg_fields:
-                continue
-            v = overall_summary.get(k, None)
-            if v not in (0, 0.0, 0.000, None, ""):
-                overall_fields.append(k)
-        if overall_fields:
-            logger.info(
-                "\n%s",
-                _format_table("Overall Summary", overall_summary, overall_fields),
-            )
-            logger.info("\n%s", "\n".join(self._build_timing_breakdown_lines(overall_summary)))
+        overall_fields = [
+            k
+            for k in ordered_overall_fields or list(overall_summary.keys())
+            if not (self.e2e_count <= 1 and k in avg_fields)
+            and overall_summary.get(k, None) not in (0, 0.0, 0.000, None, "")
+        ]
 
         all_request_ids = sorted(set(self.stage_events.keys()) | {e.request_id for e in self.e2e_events})
 
@@ -793,35 +983,6 @@ def build_and_log_summary(self) -> dict[str, Any]:
 
             result_stage_table.append({"request_id": rid, "stages": stage_rows})
 
-            if stage_rows:
-                # filter out all-zero fields for logging
-                all_value_fields = set()
-                for row in stage_rows:
-                    for k in row.keys():
-                        if k not in ("stage", "stage_id"):
-                            all_value_fields.add(k)
-                value_fields_list = []
-                for field in sorted(all_value_fields):
-                    all_zero = True
-                    for row in stage_rows:
-                        v = row.get(field, None)
-                        if v not in (0, 0.0, 0.000, None, ""):
-                            all_zero = False
-                            break
-                    if not all_zero:
-                        value_fields_list.append(field)
-
-                if value_fields_list:
-                    logger.info(
-                        "\n%s",
-                        _format_table(
-                            f"StageRequestStats [request_id={rid}]",
-                            stage_rows,
-                            column_key="stage",
-                            value_fields=value_fields_list,
-                        ),
-                    )
-
             # === Transfer table (columns = edge) ===
             transfer_evts = sorted(
                 [e for e in self.transfer_events.values() if e.request_id == rid],
@@ -833,34 +994,8 @@ def build_and_log_summary(self) -> dict[str, Any]:
             ]
             result_trans_table.append({"request_id": rid, "transfers": transfer_rows})
 
-            if transfer_rows:
-                # filter out all-zero fields for logging
-                all_value_fields = set()
-                for row in transfer_rows:
-                    for k in row.keys():
-                        if k != "edge":
-                            all_value_fields.add(k)
-                value_fields_list = []
-                for field in sorted(all_value_fields):
-                    all_zero = True
-                    for row in transfer_rows:
-                        v = row.get(field, None)
-                        if v not in (0, 0.0, 0.000, None, ""):
-                            all_zero = False
-                            break
-                    if not all_zero:
-                        value_fields_list.append(field)
-
-                if value_fields_list:
-                    logger.info(
-                        "\n%s",
-                        _format_table(
-                            f"TransferEdgeStats [request_id={rid}]",
-                            transfer_rows,
-                            column_key="edge",
-                            value_fields=value_fields_list,
-                        ),
-                    )
+        if overall_fields:
+            logger.info("\n%s", "\n".join(self._build_omni_metrics_summary_lines(overall_summary)))
 
         return {
             "final_stage_id": final_stage_id_map,

From 9d93f453a754e34e11ff03f26a326c1cde605d9b Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Sat, 25 Apr 2026 10:35:13 +0800
Subject: [PATCH 07/11] Batch offline metrics summaries

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 tests/entrypoints/test_omni_entrypoints.py |  71 +++++++++
 tests/metrics/test_stats.py                |  95 +++++++++++
 vllm_omni/entrypoints/omni.py              |  36 +++--
 vllm_omni/entrypoints/omni_base.py         |   4 +-
 vllm_omni/metrics/stats.py                 | 177 +++++++++++++++------
 5 files changed, 320 insertions(+), 63 deletions(-)

diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py
index a96b4dd1df6..bac4897da5d 100644
--- a/tests/entrypoints/test_omni_entrypoints.py
+++ b/tests/entrypoints/test_omni_entrypoints.py
@@ -645,6 +645,77 @@ def test_omni_generate_returns_list_when_not_using_generator(monkeypatch: pytest
     assert [output.stage_id for output in outputs] == [0, 2, 0, 2]
 
 
+def test_omni_generate_batches_offline_metrics_summary(monkeypatch: pytest.MonkeyPatch):
+    class FakeMetrics:
+        instances: list[FakeMetrics] = []
+
+        def __init__(
+            self,
+            num_stages: int,
+            log_stats: bool,
+            wall_start_ts: float,
+            final_stage_id_for_e2e: dict[str, int],
+            **_: Any,
+        ) -> None:
+            self.num_stages = num_stages
+            self.log_stats = log_stats
+            self.wall_start_ts = wall_start_ts
+            self.final_stage_id_for_e2e = final_stage_id_for_e2e
+            self.stage_first_ts = [None] * num_stages
+            self.stage_last_ts = [None] * num_stages
+            self.e2e_done: set[str] = set()
+            self.final_output_time_ms: dict[str, float] = {}
+            self.build_count = 0
+            FakeMetrics.instances.append(self)
+
+        def on_stage_metrics(self, *_: Any, **__: Any) -> None:
+            pass
+
+        def record_final_output_time(self, req_id: str, final_output_time_ms: float) -> None:
+            self.final_output_time_ms[req_id] = final_output_time_ms
+
+        def on_finalize_request(
+            self,
+            stage_id: int,
+            req_id: str,
+            req_start_ts: float,
+            input_preprocess_time_ms: float = 0.0,
+        ) -> None:
+            del stage_id, req_start_ts, input_preprocess_time_ms
+            self.e2e_done.add(req_id)
+
+        def build_and_log_summary(self) -> dict[str, Any]:
+            self.build_count += 1
+            return {"num_requests": len(self.e2e_done)}
+
+    sampling_params = [SamplingParams(max_tokens=8) for _ in range(2)]
+    engine = FakeAsyncOmniEngine(
+        stage_metadata=LLM_DIFFUSION_META,
+        default_sampling_params_list=sampling_params,
+        on_add_request=_enqueue_omni_llm_diffusion_outputs,
+    )
+    _patch_engine(monkeypatch, engine)
+    monkeypatch.setattr("vllm_omni.entrypoints.omni.OrchestratorMetrics", FakeMetrics)
+
+    app = Omni("dummy-model", log_stats=True)
+    try:
+        outputs = list(app.generate(["p1", "p2"], py_generator=True, use_tqdm=False))
+    finally:
+        if not engine.shutdown_called:
+            app.shutdown()
+
+    assert len(outputs) == 4
+    assert len(FakeMetrics.instances) == 1
+    metrics = FakeMetrics.instances[0]
+    assert metrics.final_stage_id_for_e2e == {
+        engine.submitted[0]["request_id"]: 1,
+        engine.submitted[1]["request_id"]: 1,
+    }
+    assert metrics.e2e_done == {engine.submitted[0]["request_id"], engine.submitted[1]["request_id"]}
+    assert metrics.build_count == 1
+    assert app.request_states == {}
+
+
 def test_omni_generate_diffusion_only_yields_single_image_per_request(monkeypatch: pytest.MonkeyPatch):
     sampling_params = [SamplingParams(max_tokens=8)]
     engine = FakeAsyncOmniEngine(
diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py
index 3bb722b732d..bc998a77e92 100644
--- a/tests/metrics/test_stats.py
+++ b/tests/metrics/test_stats.py
@@ -178,3 +178,98 @@ def test_build_and_log_summary_multiple_requests() -> None:
     r2_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r2")
     assert len(r1_stage_entry["stages"]) == 2
     assert len(r2_stage_entry["stages"]) == 1
+
+
+def test_overall_engine_pipeline_time_uses_batch_wall_time() -> None:
+    agg = OrchestratorAggregator(num_stages=1, log_stats=True, wall_start_ts=0.0, final_stage_id_for_e2e=0)
+    agg.stage_first_ts[0] = 1.0
+    agg.last_finish_ts = 251.0
+    agg.engine_pipeline_total_ms = 1_336_711.0
+    agg.e2e_count = 10
+    agg.e2e_events = [
+        RequestE2EStats(
+            request_id=f"r{i}",
+            request_wall_time_ms=250_000.0,
+            input_preprocess_time_ms=1.0,
+            engine_pipeline_time_ms=249_999.0,
+            total_tokens=0,
+            transfers_total_time_ms=0.0,
+            transfers_total_bytes=0,
+        )
+        for i in range(10)
+    ]
+
+    summary = agg.build_and_log_summary()
+
+    assert summary["overall_summary"]["request_wall_time_ms"] == 251_000.0
+    assert summary["overall_summary"]["input_preprocess_wall_time_ms"] == 1_000.0
+    assert summary["overall_summary"]["engine_pipeline_time_ms"] == 250_000.0
+    assert summary["overall_summary"]["avg_engine_pipeline_time_ms"] == 249_999.0
+
+
+def test_multi_request_summary_includes_averages_and_request_input_preprocess() -> None:
+    agg = OrchestratorAggregator(
+        num_stages=1,
+        log_stats=True,
+        wall_start_ts=0.0,
+        final_stage_id_for_e2e={"r1": 0, "r2": 0},
+    )
+    for req_id in ("r1", "r2"):
+        agg.on_stage_metrics(
+            0,
+            req_id,
+            StageRequestStats(
+                batch_id=1,
+                batch_size=2,
+                num_tokens_in=1,
+                num_tokens_out=2,
+                stage_gen_time_ms=10.0,
+                rx_transfer_bytes=0,
+                rx_decode_time_ms=0.0,
+                rx_in_flight_time_ms=0.0,
+                stage_stats=StageStats(),
+            ),
+        )
+        agg.e2e_events.append(
+            RequestE2EStats(
+                request_id=req_id,
+                request_wall_time_ms=20.0,
+                input_preprocess_time_ms=2.0,
+                engine_pipeline_time_ms=18.0,
+                total_tokens=3,
+                transfers_total_time_ms=0.0,
+                transfers_total_bytes=0,
+            )
+        )
+    agg.e2e_count = 2
+    agg.input_preprocess_total_ms = 4.0
+
+    lines = agg._build_omni_metrics_summary_lines(
+        {
+            "num_of_requests": 2,
+            "request_wall_time_ms": 20.0,
+            "input_preprocess_time_ms": 4.0,
+            "input_preprocess_wall_time_ms": 2.0,
+            "engine_pipeline_time_ms": 18.0,
+            "final_output_total_time_ms": 0.0,
+            "breakdown_delta_time_ms": 12.0,
+            "avg_request_wall_time_ms": 20.0,
+            "avg_engine_pipeline_time_ms": 18.0,
+            "avg_input_preprocess_time_ms": 2.0,
+            "avg_final_output_time_ms": 0.0,
+            "avg_breakdown_delta_time_ms": 6.0,
+            "stage_0_wall_time_ms": 12.0,
+        }
+    )
+    rendered = "\n".join(lines)
+
+    assert "Sum check (ms):" in rendered
+    assert "Component total work time (ms):" in rendered
+    assert "Component sum (ms):" not in rendered
+    assert "Average E2E time" not in rendered
+    assert "Average engine pipeline time" not in rendered
+    assert "Average Time Breakdown" in rendered
+    assert "Average Stage 0 generation time" in rendered
+    assert "Request r1 Breakdown" in rendered
+    assert "Request r2 Breakdown" in rendered
+    assert "Input preprocess time (ms):" in rendered
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index fcff91df56d..c5e10044419 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -117,22 +117,23 @@ def _run_generation(
             input_preprocess_time_ms: dict[str, float] = {}
             wall_start_ts = time.time()
             req_final_stage_ids: dict[str, int] = {}
+            for req_id, prompt in zip(request_ids, request_prompts):
+                prompt_modalities = prompt.get("modalities", None) if isinstance(prompt, dict) else None
+                req_final_stage_ids[req_id] = self._compute_final_stage_id(prompt_modalities)
+            batch_metrics = OrchestratorMetrics(
+                self.num_stages,
+                self.log_stats,
+                wall_start_ts,
+                req_final_stage_ids,
+                stage_metadata=getattr(self.engine, "stage_metadata", None),
+            )
 
             for req_id, prompt in zip(request_ids, request_prompts):
                 request_prep_start_ts = time.time()
-                prompt_modalities = prompt.get("modalities", None) if isinstance(prompt, dict) else None
-                final_stage_id = self._compute_final_stage_id(prompt_modalities)
-                req_final_stage_ids[req_id] = final_stage_id
-
-                metrics = OrchestratorMetrics(
-                    self.num_stages,
-                    self.log_stats,
-                    wall_start_ts,
-                    final_stage_id,
-                    stage_metadata=getattr(self.engine, "stage_metadata", None),
-                )
+                final_stage_id = req_final_stage_ids[req_id]
+
                 req_state = ClientRequestState(req_id)
-                req_state.metrics = metrics
+                req_state.metrics = batch_metrics
                 self.request_states[req_id] = req_state
 
                 # PD disaggregation: modify stage-0 (prefill) sampling params per request
@@ -150,7 +151,10 @@ def _run_generation(
                 )
                 submit_ts = time.time()
                 input_preprocess_time_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0
-                req_state.metrics.stage_first_ts[0] = submit_ts
+                if batch_metrics.stage_first_ts[0] is None:
+                    batch_metrics.stage_first_ts[0] = submit_ts
+                else:
+                    batch_metrics.stage_first_ts[0] = min(batch_metrics.stage_first_ts[0], submit_ts)
                 req_start_ts[req_id] = submit_ts
 
             active_reqs = set(request_ids)
@@ -190,7 +194,11 @@ def _run_generation(
                     active_reqs.discard(req_id)
                     if pbar is not None:
                         pbar.update(1)
-                    self._log_summary_and_cleanup(req_id)
+                    self.request_states.pop(req_id, None)
+
+            summary = batch_metrics.build_and_log_summary()
+            if summary:
+                logger.debug("[Summary] %s", summary)
         except Exception:
             if "active_reqs" in locals() and active_reqs:
                 self.abort(list(active_reqs))
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 11cba2ee23c..d1c5f658749 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -396,6 +396,7 @@ def _process_single_result(
         if not stage_meta["final_output"]:
             return None
 
+        final_output_start_ts = time.time()
         final_output_start = time.perf_counter()
         images = getattr(engine_outputs, "images", []) if stage_meta["final_output_type"] == "image" else []
         output_to_yield = OmniRequestOutput(
@@ -412,7 +413,8 @@ def _process_single_result(
             stage_durations=stage_durations,
             peak_memory_mb=peak_memory_mb,
         )
-        metrics.record_final_output_time(req_id, (time.perf_counter() - final_output_start) * 1000.0)
+        final_output_time_ms = (time.perf_counter() - final_output_start) * 1000.0
+        metrics.record_final_output_time(req_id, final_output_time_ms, final_output_start_ts, time.time())
 
         try:
             rid_key = str(req_id)
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index cb73949ff3a..df1876e5eb9 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -309,6 +309,25 @@ def _append_stage_info_lines(self, lines: list[str], evt: StageRequestStats) ->
             for key, value in sorted(evt.diffusion_metrics.items()):
                 lines.append(self._summary_line(f"{key}:", value))
 
+    def _request_stage_component_time_ms(self, request_id: str) -> float:
+        return sum(
+            float(evt.stage_gen_time_ms or 0.0)
+            + float(evt.output_processor_time_ms or 0.0)
+            + float(evt.stage_handoff_time_ms or 0.0)
+            for evt in self.stage_events.get(request_id, [])
+        )
+
+    def _request_final_orchestration_time_ms(self, evt: RequestE2EStats) -> float:
+        return (
+            float(evt.request_wall_time_ms)
+            - float(evt.input_preprocess_time_ms)
+            - self._request_stage_component_time_ms(evt.request_id)
+            - float(self.final_output_time_ms.get(evt.request_id, 0.0))
+        )
+
+    def _get_overall_summary_time_ms(self, overall_summary: dict[str, Any], key: str) -> float:
+        return float(overall_summary.get(key, overall_summary.get(key.replace("_wall_", "_"), 0.0)))
+
     def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> list[str]:
         lines = [self._summary_header("Omni Metrics Summary")]
         lines.extend(
@@ -317,7 +336,7 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                 self._summary_line("Total E2E time (ms):", float(overall_summary.get("request_wall_time_ms", 0.0))),
                 self._summary_line(
                     "Input preprocess time (ms):",
-                    float(overall_summary.get("input_preprocess_time_ms", 0.0)),
+                    self._get_overall_summary_time_ms(overall_summary, "input_preprocess_wall_time_ms"),
                 ),
                 self._summary_line(
                     "Engine pipeline time (ms):",
@@ -325,19 +344,20 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                 ),
                 self._summary_line(
                     "Sum check (ms):",
-                    float(overall_summary.get("input_preprocess_time_ms", 0.0))
+                    self._get_overall_summary_time_ms(overall_summary, "input_preprocess_wall_time_ms")
                     + float(overall_summary.get("engine_pipeline_time_ms", 0.0)),
                 ),
                 "",
                 self._summary_header("Overall Time Breakdown", "-"),
                 self._summary_line(
                     "Input preprocess time (ms):",
-                    float(overall_summary.get("input_preprocess_time_ms", 0.0)),
+                    self._get_overall_summary_time_ms(overall_summary, "input_preprocess_wall_time_ms"),
                 ),
             ]
         )
 
-        component_sum = float(overall_summary.get("input_preprocess_time_ms", 0.0))
+        component_sum = self._get_overall_summary_time_ms(overall_summary, "input_preprocess_wall_time_ms")
+        stage_average_lines: list[str] = []
         stage_ids = sorted(
             {
                 evt.stage_id
@@ -353,15 +373,18 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                 for evt in stage_evts
                 if evt.stage_id == stage_id
             )
-            output_processor_ms = sum(
+            output_processor_total_ms = sum(
                 float(evt.output_processor_time_ms or 0.0)
                 for stage_evts in self.stage_events.values()
                 for evt in stage_evts
                 if evt.stage_id == stage_id
             )
-            lines.append(self._summary_line(f"Stage {stage_id} generation time (ms):", stage_gen_ms))
-            lines.append(self._summary_line(f"Stage {stage_id} output processor time (ms):", output_processor_ms))
-            component_sum += stage_gen_ms + output_processor_ms
+            stage_event_count = sum(
+                1 for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id
+            )
+            lines.append(self._summary_line(f"Stage {stage_id} total generation time (ms):", stage_gen_ms))
+            lines.append(self._summary_line(f"Stage {stage_id} output processor time (ms):", output_processor_total_ms))
+            component_sum += stage_gen_ms + output_processor_total_ms
 
             handoff_ms = sum(
                 float(evt.stage_handoff_time_ms or 0.0)
@@ -381,26 +404,83 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                 to_stage = to_stage_ids[0] if len(to_stage_ids) == 1 else "next"
                 lines.append(self._summary_line(f"Stage {stage_id} -> Stage {to_stage} handoff time (ms):", handoff_ms))
                 component_sum += handoff_ms
+            if int(overall_summary.get("num_of_requests", 0)) > 1 and stage_event_count > 0:
+                stage_average_lines.extend(
+                    [
+                        self._summary_line(
+                            f"Average Stage {stage_id} generation time (ms):",
+                            stage_gen_ms / stage_event_count,
+                        ),
+                        self._summary_line(
+                            f"Average Stage {stage_id} output processor time (ms):",
+                            output_processor_total_ms / stage_event_count,
+                        ),
+                    ]
+                )
+                if handoff_ms > 0.0:
+                    stage_average_lines.append(
+                        self._summary_line(
+                            f"Average Stage {stage_id} handoff time (ms):",
+                            handoff_ms / stage_event_count,
+                        )
+                    )
 
-        final_output_ms = float(overall_summary.get("final_output_total_time_ms", 0.0))
-        final_orchestration_ms = float(overall_summary.get("breakdown_delta_time_ms", 0.0))
-        final_output_bucket_ms = final_output_ms + final_orchestration_ms
+        final_output_bucket_ms = float(overall_summary.get("final_output_total_time_ms", 0.0))
         lines.extend(
             [
                 self._summary_line("Final output time (ms):", final_output_bucket_ms),
-                self._summary_line("Component sum (ms):", component_sum + final_output_bucket_ms),
-                self._summary_line(
-                    "E2E - component sum (ms):",
-                    float(overall_summary.get("request_wall_time_ms", 0.0)) - component_sum - final_output_bucket_ms,
-                ),
             ]
         )
+        if int(overall_summary.get("num_of_requests", 0)) == 1:
+            lines.extend(
+                [
+                    self._summary_line("Component sum (ms):", component_sum + final_output_bucket_ms),
+                    self._summary_line(
+                        "E2E - component sum (ms):",
+                        float(overall_summary.get("request_wall_time_ms", 0.0))
+                        - component_sum
+                        - final_output_bucket_ms,
+                    ),
+                ]
+            )
+        else:
+            lines.append(
+                self._summary_line(
+                    "Component total work time (ms):",
+                    component_sum + final_output_bucket_ms,
+                )
+            )
+            lines.extend(
+                [
+                    "",
+                    self._summary_header("Average Time Breakdown", "-"),
+                    self._summary_line(
+                        "Average input preprocess time (ms):",
+                        float(overall_summary.get("avg_input_preprocess_time_ms", 0.0)),
+                    ),
+                    *stage_average_lines,
+                    self._summary_line(
+                        "Average final output time (ms):",
+                        float(overall_summary.get("avg_final_output_time_ms", 0.0)),
+                    ),
+                ]
+            )
 
         for rid in sorted(set(self.stage_events.keys()) | {evt.request_id for evt in self.e2e_events}):
             stage_evts = sorted(
                 self.stage_events.get(rid, []),
                 key=lambda evt: evt.stage_id if evt.stage_id is not None else -1,
             )
+            e2e_evt = next((evt for evt in self.e2e_events if evt.request_id == rid), None)
+            if e2e_evt is not None:
+                lines.extend(
+                    [
+                        "",
+                        self._summary_header(f"Request {rid} Breakdown", "-"),
+                        self._summary_line("Input preprocess time (ms):", e2e_evt.input_preprocess_time_ms),
+                        self._summary_line("Input preprocess sum check (ms):", e2e_evt.input_preprocess_time_ms),
+                    ]
+                )
             for evt in stage_evts:
                 stage_id = evt.stage_id if evt.stage_id is not None else "unknown"
                 lines.extend(
@@ -440,34 +520,19 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                     )
 
             final_output_ms = float(self.final_output_time_ms.get(rid, 0.0))
-            e2e_evt = next((evt for evt in self.e2e_events if evt.request_id == rid), None)
-            if e2e_evt is None:
-                final_orchestration_ms = 0.0
-            else:
-                stage_component_ms = sum(
-                    float(evt.stage_gen_time_ms or 0.0)
-                    + float(evt.output_processor_time_ms or 0.0)
-                    + float(evt.stage_handoff_time_ms or 0.0)
-                    for evt in stage_evts
-                )
-                final_orchestration_ms = (
-                    float(e2e_evt.request_wall_time_ms)
-                    - float(e2e_evt.input_preprocess_time_ms)
-                    - stage_component_ms
-                    - final_output_ms
-                )
-            final_output_bucket_ms = final_output_ms + final_orchestration_ms
-            if final_output_bucket_ms != 0.0:
+            final_orchestration_ms = self._request_final_orchestration_time_ms(e2e_evt) if e2e_evt is not None else 0.0
+            if final_output_ms != 0.0:
                 lines.extend(
                     [
                         "",
                         self._summary_header("Final Output Breakdown", "-"),
                         self._summary_line("Final output wrapping time (ms):", final_output_ms),
-                        self._summary_line("Final orchestration overhead time (ms):", final_orchestration_ms),
-                        self._summary_line("Final output total time (ms):", final_output_bucket_ms),
-                        self._summary_line("Final output sum check (ms):", final_output_bucket_ms),
+                        self._summary_line("Final output total time (ms):", final_output_ms),
+                        self._summary_line("Final output sum check (ms):", final_output_ms),
                     ]
                 )
+            if final_orchestration_ms != 0.0:
+                lines.append(self._summary_line("Unattributed request time (ms):", final_orchestration_ms))
 
         return lines
 
@@ -697,7 +762,14 @@ def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_tim
     def record_ar2diffusion_time(self, stage_id: int, req_id: Any, ar2diffusion_time_ms: float) -> None:
         self._update_stage_event_field(stage_id, req_id, "ar2diffusion_time_ms", ar2diffusion_time_ms)
 
-    def record_final_output_time(self, req_id: Any, final_output_time_ms: float) -> None:
+    def record_final_output_time(
+        self,
+        req_id: Any,
+        final_output_time_ms: float,
+        start_ts: float | None = None,
+        end_ts: float | None = None,
+    ) -> None:
+        del start_ts, end_ts
         self.final_output_time_ms[str(req_id)] += float(final_output_time_ms)
 
     def record_stage_handoff_time(
@@ -824,8 +896,23 @@ def build_and_log_summary(self) -> dict[str, Any]:
         if not self.log_stats:
             return {}
         wall_time_ms = max(0.0, (self.last_finish_ts - self.wall_start_ts) * 1000.0)
-        e2e_avg_req = (wall_time_ms / self.e2e_count) if self.e2e_count > 0 else 0.0
+        e2e_avg_req = (
+            sum(float(evt.request_wall_time_ms) for evt in self.e2e_events) / self.e2e_count
+            if self.e2e_count > 0
+            else 0.0
+        )
         avg_tok = (self.total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0
+        first_engine_ts = min((ts for ts in self.stage_first_ts if ts is not None), default=None)
+        engine_pipeline_wall_ms = (
+            max(0.0, (self.last_finish_ts - first_engine_ts) * 1000.0)
+            if first_engine_ts is not None
+            else float(self.engine_pipeline_total_ms)
+        )
+        input_preprocess_wall_ms = (
+            max(0.0, (first_engine_ts - self.wall_start_ts) * 1000.0)
+            if first_engine_ts is not None
+            else float(self.input_preprocess_total_ms)
+        )
 
         if isinstance(self.final_stage_id_for_e2e, int):
             final_stage_id_map: dict[str, int] = {"*": int(self.final_stage_id_for_e2e)}
@@ -856,20 +943,14 @@ def build_and_log_summary(self) -> dict[str, Any]:
                     edge = f"stage_{evt.stage_id}_to_{evt.handoff_to_stage_id}"
                     handoff_edge_totals[edge] += handoff_ms
                     handoff_edge_ar2diffusion[edge] += ar2d_ms
-        breakdown_delta_ms = (
-            float(wall_time_ms)
-            - float(self.input_preprocess_total_ms)
-            - stage_gen_total_ms
-            - output_processor_total_ms
-            - stage_handoff_total_ms
-            - float(self.final_output_total_ms)
-        )
+        breakdown_delta_ms = sum(self._request_final_orchestration_time_ms(evt) for evt in self.e2e_events)
 
         overall_summary = {
             "num_of_requests": int(self.e2e_count),
             "request_wall_time_ms": float(wall_time_ms),
             "input_preprocess_time_ms": float(self.input_preprocess_total_ms),
-            "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms),
+            "input_preprocess_wall_time_ms": float(input_preprocess_wall_ms),
+            "engine_pipeline_time_ms": float(engine_pipeline_wall_ms),
             "stage_gen_total_time_ms": float(stage_gen_total_ms),
             "output_processor_total_time_ms": float(output_processor_total_ms),
             "stage_handoff_total_time_ms": float(stage_handoff_total_ms),

From ab42f7151141cd8b079270d06330be809dc1fb67 Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Sat, 25 Apr 2026 14:54:53 +0800
Subject: [PATCH 08/11] Split stage latency and execution metrics

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 tests/metrics/test_stats.py           |   7 +-
 vllm_omni/engine/async_omni_engine.py |   1 +
 vllm_omni/engine/orchestrator.py      |   9 ++
 vllm_omni/entrypoints/omni.py         |   1 +
 vllm_omni/metrics/stats.py            | 141 ++++++++++++++++++++++----
 5 files changed, 135 insertions(+), 24 deletions(-)

diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py
index bc998a77e92..357b6319319 100644
--- a/tests/metrics/test_stats.py
+++ b/tests/metrics/test_stats.py
@@ -264,12 +264,15 @@ def test_multi_request_summary_includes_averages_and_request_input_preprocess()
     rendered = "\n".join(lines)
 
     assert "Sum check (ms):" in rendered
-    assert "Component total work time (ms):" in rendered
+    assert "Component total work time (ms):" not in rendered
     assert "Component sum (ms):" not in rendered
+    assert "Stage sum check (ms):" not in rendered
     assert "Average E2E time" not in rendered
     assert "Average engine pipeline time" not in rendered
     assert "Average Time Breakdown" in rendered
-    assert "Average Stage 0 generation time" in rendered
+    assert "Average Stage 0 latency time" in rendered
+    assert "Average Stage 0 queue wait time" in rendered
+    assert "Average Stage 0 execution time" in rendered
     assert "Request r1 Breakdown" in rendered
     assert "Request r2 Breakdown" in rendered
     assert "Input preprocess time (ms):" in rendered
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 8299a577a41..e8fbfc6ee4e 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -1510,6 +1510,7 @@ def add_request(
         )
         if self.request_queue is None:
             raise RuntimeError("request_queue is not initialized")
+        msg["request_dispatch_start_ts"] = time.time()
         self.request_queue.sync_q.put_nowait(msg)
 
         # CFG companion expansion: create and enqueue companion requests
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index 76a3170c214..5b8260f20ab 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -632,7 +632,10 @@ def _build_stage_metrics(
                 total_token=self._agg_total_tokens[stage_id],
                 total_gen_time_ms=self._agg_total_gen_time_ms[stage_id],
             ),
+            stage_submit_ts=submit_ts,
             stage_end_ts=now,
+            request_dispatch_wait_time_ms=float(getattr(req_state, "request_dispatch_wait_time_ms", 0.0)),
+            stage_latency_time_ms=stage_wall_time_ms,
         )
 
     def _build_kv_sender_info(self, sender_stage_ids: list[int]) -> dict[int, dict[str, Any]] | None:
@@ -937,6 +940,12 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None:
         )
         req_state.streaming.enabled = is_streaming
         req_state.stage_submit_ts[stage_id] = _time.time()
+        request_dispatch_start_ts = msg.get("request_dispatch_start_ts")
+        if request_dispatch_start_ts is not None:
+            req_state.request_dispatch_wait_time_ms = max(
+                0.0,
+                (req_state.stage_submit_ts[stage_id] - request_dispatch_start_ts) * 1000.0,
+            )
         self.request_states[request_id] = req_state
 
         # Stage-0 prompt is already a fully-formed OmniEngineCoreRequest
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index c5e10044419..ebf1301c0a0 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -148,6 +148,7 @@ def _run_generation(
                     prompt=prompt,
                     sampling_params_list=req_sp_list,
                     final_stage_id=final_stage_id,
+                    arrival_time=request_prep_start_ts,
                 )
                 submit_ts = time.time()
                 input_preprocess_time_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index df1876e5eb9..ee204dcb28a 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -44,7 +44,12 @@ class StageRequestStats:
     output_processor_time_ms: float = 0.0
     diffusion_metrics: dict[str, int] = None
     audio_generated_frames: int = 0
+    stage_submit_ts: float | None = None
     stage_end_ts: float | None = None
+    request_dispatch_wait_time_ms: float = 0.0
+    stage_latency_time_ms: float = 0.0
+    stage_queue_wait_time_ms: float = 0.0
+    stage_execution_time_ms: float = 0.0
     handoff_to_stage_id: int | None = None
     stage_handoff_time_ms: float = 0.0
     ar2diffusion_time_ms: float = 0.0
@@ -111,7 +116,12 @@ def e2e_tpt(self) -> float:
     "rx_decode_time_ms",
     "rx_in_flight_time_ms",
     "final_output_type",
+    "stage_submit_ts",
     "stage_end_ts",
+    "request_dispatch_wait_time_ms",
+    "stage_latency_time_ms",
+    "stage_queue_wait_time_ms",
+    "stage_execution_time_ms",
     "handoff_to_stage_id",
     "stage_handoff_time_ms",
     "ar2diffusion_time_ms",
@@ -126,6 +136,9 @@ def e2e_tpt(self) -> float:
     "input_preprocess_time_ms",
     "engine_pipeline_time_ms",
     "stage_gen_total_time_ms",
+    "stage_latency_total_time_ms",
+    "stage_queue_wait_total_time_ms",
+    "stage_execution_total_time_ms",
     "output_processor_total_time_ms",
     "stage_handoff_total_time_ms",
     "ar2diffusion_total_time_ms",
@@ -271,7 +284,8 @@ def _summary_line(label: str, value: Any) -> str:
             text = f"{value:,.3f}"
         else:
             text = str(value)
-        return f"{label:<42}{text:>16}"
+        label_width = max(42, len(label) + 1)
+        return f"{label:<{label_width}}{text:>16}"
 
     @staticmethod
     def _summary_header(title: str, char: str = "=") -> str:
@@ -311,7 +325,8 @@ def _append_stage_info_lines(self, lines: list[str], evt: StageRequestStats) ->
 
     def _request_stage_component_time_ms(self, request_id: str) -> float:
         return sum(
-            float(evt.stage_gen_time_ms or 0.0)
+            float(evt.stage_queue_wait_time_ms or 0.0)
+            + float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0)
             + float(evt.output_processor_time_ms or 0.0)
             + float(evt.stage_handoff_time_ms or 0.0)
             for evt in self.stage_events.get(request_id, [])
@@ -325,9 +340,46 @@ def _request_final_orchestration_time_ms(self, evt: RequestE2EStats) -> float:
             - float(self.final_output_time_ms.get(evt.request_id, 0.0))
         )
 
+    def _request_dispatch_wait_time_ms(self, request_id: str) -> float:
+        return sum(
+            float(evt.request_dispatch_wait_time_ms or 0.0)
+            for evt in self.stage_events.get(request_id, [])
+            if evt.stage_id == 0
+        )
+
+    def _request_remaining_orchestration_time_ms(self, evt: RequestE2EStats) -> float:
+        return self._request_final_orchestration_time_ms(evt) - self._request_dispatch_wait_time_ms(evt.request_id)
+
     def _get_overall_summary_time_ms(self, overall_summary: dict[str, Any], key: str) -> float:
         return float(overall_summary.get(key, overall_summary.get(key.replace("_wall_", "_"), 0.0)))
 
+    def _estimate_stage_wait_and_execution_times(self) -> None:
+        for stage_id in range(self.num_stages):
+            stage_evts = sorted(
+                [
+                    evt
+                    for evts in self.stage_events.values()
+                    for evt in evts
+                    if evt.stage_id == stage_id and evt.stage_submit_ts is not None and evt.stage_end_ts is not None
+                ],
+                key=lambda evt: float(evt.stage_end_ts or 0.0),
+            )
+            previous_finish_ts: float | None = None
+            for evt in stage_evts:
+                submit_ts = float(evt.stage_submit_ts or 0.0)
+                end_ts = float(evt.stage_end_ts or submit_ts)
+                latency_ms = max(0.0, (end_ts - submit_ts) * 1000.0)
+                service_start_ts = submit_ts
+                if previous_finish_ts is not None:
+                    service_start_ts = max(submit_ts, previous_finish_ts)
+                queue_wait_ms = max(0.0, (service_start_ts - submit_ts) * 1000.0)
+                service_time_ms = max(0.0, (end_ts - service_start_ts) * 1000.0)
+                execution_ms = max(0.0, service_time_ms - float(evt.output_processor_time_ms or 0.0))
+                evt.stage_latency_time_ms = latency_ms
+                evt.stage_queue_wait_time_ms = queue_wait_ms
+                evt.stage_execution_time_ms = execution_ms
+                previous_finish_ts = end_ts if previous_finish_ts is None else max(previous_finish_ts, end_ts)
+
     def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> list[str]:
         lines = [self._summary_header("Omni Metrics Summary")]
         lines.extend(
@@ -367,8 +419,20 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
             }
         )
         for stage_id in stage_ids:
-            stage_gen_ms = sum(
-                float(evt.stage_gen_time_ms or 0.0)
+            stage_latency_ms = sum(
+                float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0)
+                for stage_evts in self.stage_events.values()
+                for evt in stage_evts
+                if evt.stage_id == stage_id
+            )
+            stage_queue_wait_ms = sum(
+                float(evt.stage_queue_wait_time_ms or 0.0)
+                for stage_evts in self.stage_events.values()
+                for evt in stage_evts
+                if evt.stage_id == stage_id
+            )
+            stage_execution_ms = sum(
+                float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0)
                 for stage_evts in self.stage_events.values()
                 for evt in stage_evts
                 if evt.stage_id == stage_id
@@ -382,9 +446,11 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
             stage_event_count = sum(
                 1 for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id
             )
-            lines.append(self._summary_line(f"Stage {stage_id} total generation time (ms):", stage_gen_ms))
+            lines.append(self._summary_line(f"Stage {stage_id} total latency time (ms):", stage_latency_ms))
+            lines.append(self._summary_line(f"Stage {stage_id} queue wait time (ms):", stage_queue_wait_ms))
+            lines.append(self._summary_line(f"Stage {stage_id} execution time (ms):", stage_execution_ms))
             lines.append(self._summary_line(f"Stage {stage_id} output processor time (ms):", output_processor_total_ms))
-            component_sum += stage_gen_ms + output_processor_total_ms
+            component_sum += stage_execution_ms + output_processor_total_ms
 
             handoff_ms = sum(
                 float(evt.stage_handoff_time_ms or 0.0)
@@ -408,8 +474,16 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                 stage_average_lines.extend(
                     [
                         self._summary_line(
-                            f"Average Stage {stage_id} generation time (ms):",
-                            stage_gen_ms / stage_event_count,
+                            f"Average Stage {stage_id} latency time (ms):",
+                            stage_latency_ms / stage_event_count,
+                        ),
+                        self._summary_line(
+                            f"Average Stage {stage_id} queue wait time (ms):",
+                            stage_queue_wait_ms / stage_event_count,
+                        ),
+                        self._summary_line(
+                            f"Average Stage {stage_id} execution time (ms):",
+                            stage_execution_ms / stage_event_count,
                         ),
                         self._summary_line(
                             f"Average Stage {stage_id} output processor time (ms):",
@@ -444,12 +518,6 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                 ]
             )
         else:
-            lines.append(
-                self._summary_line(
-                    "Component total work time (ms):",
-                    component_sum + final_output_bucket_ms,
-                )
-            )
             lines.extend(
                 [
                     "",
@@ -473,12 +541,14 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
             )
             e2e_evt = next((evt for evt in self.e2e_events if evt.request_id == rid), None)
             if e2e_evt is not None:
+                request_dispatch_wait_ms = self._request_dispatch_wait_time_ms(rid)
                 lines.extend(
                     [
                         "",
                         self._summary_header(f"Request {rid} Breakdown", "-"),
                         self._summary_line("Input preprocess time (ms):", e2e_evt.input_preprocess_time_ms),
                         self._summary_line("Input preprocess sum check (ms):", e2e_evt.input_preprocess_time_ms),
+                        self._summary_line("Request dispatch wait time (ms):", request_dispatch_wait_ms),
                     ]
                 )
             for evt in stage_evts:
@@ -487,14 +557,18 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                     [
                         "",
                         self._summary_header(f"Stage {stage_id} Breakdown", "-"),
-                        self._summary_line("Stage generation time (ms):", float(evt.stage_gen_time_ms or 0.0)),
                         self._summary_line(
-                            "Output processor time (ms):",
-                            float(evt.output_processor_time_ms or 0.0),
+                            "Stage latency time (ms):",
+                            float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0),
                         ),
+                        self._summary_line("Queue wait time (ms):", float(evt.stage_queue_wait_time_ms or 0.0)),
                         self._summary_line(
-                            "Stage sum check (ms):",
-                            float(evt.stage_gen_time_ms or 0.0) + float(evt.output_processor_time_ms or 0.0),
+                            "Execution time (ms):",
+                            float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0),
+                        ),
+                        self._summary_line(
+                            "Output processor time (ms):",
+                            float(evt.output_processor_time_ms or 0.0),
                         ),
                         "",
                     ]
@@ -520,7 +594,9 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                     )
 
             final_output_ms = float(self.final_output_time_ms.get(rid, 0.0))
-            final_orchestration_ms = self._request_final_orchestration_time_ms(e2e_evt) if e2e_evt is not None else 0.0
+            remaining_orchestration_ms = (
+                self._request_remaining_orchestration_time_ms(e2e_evt) if e2e_evt is not None else 0.0
+            )
             if final_output_ms != 0.0:
                 lines.extend(
                     [
@@ -531,8 +607,13 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                         self._summary_line("Final output sum check (ms):", final_output_ms),
                     ]
                 )
-            if final_orchestration_ms != 0.0:
-                lines.append(self._summary_line("Unattributed request time (ms):", final_orchestration_ms))
+            if remaining_orchestration_ms != 0.0:
+                lines.append(
+                    self._summary_line(
+                        "Remaining orchestration overhead time (ms):",
+                        remaining_orchestration_ms,
+                    )
+                )
 
         return lines
 
@@ -719,6 +800,12 @@ def _as_stage_request_stats(
         stats.stage_name = self._get_stage_name(stage_id, stage_meta)
         stats.request_id = req_id
         stats.final_output_type = final_output_type
+        if stats.stage_latency_time_ms == 0.0:
+            stats.stage_latency_time_ms = (
+                float(stats.stage_end_ts - stats.stage_submit_ts) * 1000.0
+                if stats.stage_submit_ts is not None and stats.stage_end_ts is not None
+                else float(stats.stage_gen_time_ms or 0.0) + float(stats.output_processor_time_ms or 0.0)
+            )
         stats.diffusion_metrics = (
             {k: int(v) for k, v in self.diffusion_metrics.pop(req_id, {}).items()}
             if req_id in self.diffusion_metrics
@@ -895,6 +982,7 @@ def on_finalize_request(
     def build_and_log_summary(self) -> dict[str, Any]:
         if not self.log_stats:
             return {}
+        self._estimate_stage_wait_and_execution_times()
         wall_time_ms = max(0.0, (self.last_finish_ts - self.wall_start_ts) * 1000.0)
         e2e_avg_req = (
             sum(float(evt.request_wall_time_ms) for evt in self.e2e_events) / self.e2e_count
@@ -926,6 +1014,9 @@ def build_and_log_summary(self) -> dict[str, Any]:
             for i in range(self.num_stages)
         ]
         stage_gen_total_ms = 0.0
+        stage_latency_total_ms = 0.0
+        stage_queue_wait_total_ms = 0.0
+        stage_execution_total_ms = 0.0
         output_processor_total_ms = 0.0
         stage_handoff_total_ms = 0.0
         ar2diffusion_total_ms = 0.0
@@ -934,6 +1025,9 @@ def build_and_log_summary(self) -> dict[str, Any]:
         for stage_evts in self.stage_events.values():
             for evt in stage_evts:
                 stage_gen_total_ms += float(evt.stage_gen_time_ms or 0.0)
+                stage_latency_total_ms += float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0)
+                stage_queue_wait_total_ms += float(evt.stage_queue_wait_time_ms or 0.0)
+                stage_execution_total_ms += float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0)
                 output_processor_total_ms += float(evt.output_processor_time_ms or 0.0)
                 handoff_ms = float(evt.stage_handoff_time_ms or 0.0)
                 ar2d_ms = float(evt.ar2diffusion_time_ms or 0.0)
@@ -952,6 +1046,9 @@ def build_and_log_summary(self) -> dict[str, Any]:
             "input_preprocess_wall_time_ms": float(input_preprocess_wall_ms),
             "engine_pipeline_time_ms": float(engine_pipeline_wall_ms),
             "stage_gen_total_time_ms": float(stage_gen_total_ms),
+            "stage_latency_total_time_ms": float(stage_latency_total_ms),
+            "stage_queue_wait_total_time_ms": float(stage_queue_wait_total_ms),
+            "stage_execution_total_time_ms": float(stage_execution_total_ms),
             "output_processor_total_time_ms": float(output_processor_total_ms),
             "stage_handoff_total_time_ms": float(stage_handoff_total_ms),
             "ar2diffusion_total_time_ms": float(ar2diffusion_total_ms),

From 44c261fa9fd85635a2d52f348e96c0093442cac2 Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Mon, 27 Apr 2026 15:10:55 +0800
Subject: [PATCH 09/11] Fix offline metrics UT regressions

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 vllm_omni/entrypoints/omni_base.py | 5 +++--
 vllm_omni/metrics/stats.py         | 9 ++++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 687d3951c65..dc1e365f85a 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -388,8 +388,9 @@ def _process_single_result(
         peak_memory_mb = getattr(result["engine_outputs"], "peak_memory_mb", 0.0)
 
         # Merge AR stage timing from OrchestratorAggregator.stage_events
+        stage_events = getattr(metrics, "stage_events", {})
         if self._enable_ar_profiler:
-            ar_events = metrics.stage_events.get(str(req_id), [])
+            ar_events = stage_events.get(str(req_id), [])
             for evt in ar_events:
                 if evt.stage_id != stage_id:
                     stage_durations[f"ar_stage_{evt.stage_id}"] = evt.stage_gen_time_ms / 1000.0
@@ -402,7 +403,7 @@ def _process_single_result(
                     stage_durations[key] = value
 
         # Merge per-stage gen times into stage_durations
-        for evt in metrics.stage_events.get(str(req_id), []):
+        for evt in stage_events.get(str(req_id), []):
             key = f"stage_{evt.stage_id}_gen_ms"
             if key not in stage_durations:
                 stage_durations[key] = evt.stage_gen_time_ms
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index f7347f6e0e9..4c26649a96d 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -991,6 +991,11 @@ def build_and_log_summary(self) -> dict[str, Any]:
             if self.e2e_count > 0
             else 0.0
         )
+        avg_engine_pipeline_ms = (
+            sum(float(evt.engine_pipeline_time_ms) for evt in self.e2e_events) / self.e2e_count
+            if self.e2e_count > 0
+            else 0.0
+        )
         avg_tok = (self.total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0
         first_engine_ts = min((ts for ts in self.stage_first_ts if ts is not None), default=None)
         engine_pipeline_wall_ms = (
@@ -1061,9 +1066,7 @@ def build_and_log_summary(self) -> dict[str, Any]:
             "avg_input_preprocess_time_ms": float(
                 self.input_preprocess_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
             ),
-            "avg_engine_pipeline_time_ms": float(
-                self.engine_pipeline_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
-            ),
+            "avg_engine_pipeline_time_ms": float(avg_engine_pipeline_ms),
             "avg_stage_gen_total_time_ms": float(stage_gen_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
             "avg_output_processor_time_ms": float(
                 output_processor_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0

From d9395784e2e947908ded815b2f81360aedfad092 Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Mon, 27 Apr 2026 15:46:10 +0800
Subject: [PATCH 10/11] Address metrics review feedback

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 docs/contributing/metrics.md               |  6 ++
 tests/entrypoints/test_omni_entrypoints.py |  9 ++-
 vllm_omni/metrics/stats.py                 | 86 ++++++++++++----------
 3 files changed, 60 insertions(+), 41 deletions(-)

diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md
index ca7f596dda2..c854a088603 100644
--- a/docs/contributing/metrics.md
+++ b/docs/contributing/metrics.md
@@ -140,6 +140,12 @@ Any remaining difference is reported as `breakdown_delta_time_ms`.
 
 `ar2diffusion_total_time_ms` is included in `stage_handoff_total_time_ms`. For a concrete AR-to-diffusion edge, `stage_0_to_1_ar2diffusion_time_ms` is included in `stage_0_to_1_handoff_time_ms`.
 
+For offline batches, average component fields such as `avg_stage_gen_total_time_ms`,
+`avg_output_processor_time_ms`, `avg_stage_handoff_total_time_ms`, and
+`avg_ar2diffusion_time_ms` are computed by dividing the batch aggregate by
+`num_of_requests`. They are not individually timed per request and then averaged.
+Per-request timers remain available in the E2E table where they are measured directly.
+
 ---
 
 ### E2E Table (per request)
diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py
index bac4897da5d..052cb4fe07a 100644
--- a/tests/entrypoints/test_omni_entrypoints.py
+++ b/tests/entrypoints/test_omni_entrypoints.py
@@ -671,7 +671,14 @@ def __init__(
         def on_stage_metrics(self, *_: Any, **__: Any) -> None:
             pass
 
-        def record_final_output_time(self, req_id: str, final_output_time_ms: float) -> None:
+        def record_final_output_time(
+            self,
+            req_id: str,
+            final_output_time_ms: float,
+            start_ts: float | None = None,
+            end_ts: float | None = None,
+        ) -> None:
+            del start_ts, end_ts
             self.final_output_time_ms[req_id] = final_output_time_ms
 
         def on_finalize_request(
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index 4c26649a96d..90f575b54be 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -47,9 +47,9 @@ class StageRequestStats:
     stage_submit_ts: float | None = None
     stage_end_ts: float | None = None
     request_dispatch_wait_time_ms: float = 0.0
-    stage_latency_time_ms: float = 0.0
-    stage_queue_wait_time_ms: float = 0.0
-    stage_execution_time_ms: float = 0.0
+    stage_latency_time_ms: float | None = None
+    stage_queue_wait_time_ms: float | None = None
+    stage_execution_time_ms: float | None = None
     handoff_to_stage_id: int | None = None
     stage_handoff_time_ms: float = 0.0
     ar2diffusion_time_ms: float = 0.0
@@ -327,8 +327,8 @@ def _append_stage_info_lines(self, lines: list[str], evt: StageRequestStats) ->
 
     def _request_stage_component_time_ms(self, request_id: str) -> float:
         return sum(
-            float(evt.stage_queue_wait_time_ms or 0.0)
-            + float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0)
+            self._optional_ms(evt.stage_queue_wait_time_ms)
+            + self._stage_execution_ms(evt)
             + float(evt.output_processor_time_ms or 0.0)
             + float(evt.stage_handoff_time_ms or 0.0)
             for evt in self.stage_events.get(request_id, [])
@@ -355,7 +355,29 @@ def _request_remaining_orchestration_time_ms(self, evt: RequestE2EStats) -> floa
     def _get_overall_summary_time_ms(self, overall_summary: dict[str, Any], key: str) -> float:
         return float(overall_summary.get(key, overall_summary.get(key.replace("_wall_", "_"), 0.0)))
 
+    @staticmethod
+    def _optional_ms(value: float | None, default: float = 0.0) -> float:
+        return float(default if value is None else value)
+
+    @classmethod
+    def _stage_latency_ms(cls, evt: StageRequestStats) -> float:
+        return cls._optional_ms(evt.stage_latency_time_ms, float(evt.stage_gen_time_ms or 0.0))
+
+    @classmethod
+    def _stage_execution_ms(cls, evt: StageRequestStats) -> float:
+        return cls._optional_ms(evt.stage_execution_time_ms, float(evt.stage_gen_time_ms or 0.0))
+
+    @staticmethod
+    def _average(total: float, count: int) -> float:
+        return float(total / count) if count > 0 else 0.0
+
+    def _average_e2e_field(self, field_name: str) -> float:
+        if self.e2e_count <= 0:
+            return 0.0
+        return float(sum(float(getattr(evt, field_name)) for evt in self.e2e_events) / self.e2e_count)
+
     def _estimate_stage_wait_and_execution_times(self) -> None:
+        """Populate derived latency, queue wait, and execution times on stage events."""
         for stage_id in range(self.num_stages):
             stage_evts = sorted(
                 [
@@ -422,19 +444,19 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
         )
         for stage_id in stage_ids:
             stage_latency_ms = sum(
-                float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0)
+                self._stage_latency_ms(evt)
                 for stage_evts in self.stage_events.values()
                 for evt in stage_evts
                 if evt.stage_id == stage_id
             )
             stage_queue_wait_ms = sum(
-                float(evt.stage_queue_wait_time_ms or 0.0)
+                self._optional_ms(evt.stage_queue_wait_time_ms)
                 for stage_evts in self.stage_events.values()
                 for evt in stage_evts
                 if evt.stage_id == stage_id
             )
             stage_execution_ms = sum(
-                float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0)
+                self._stage_execution_ms(evt)
                 for stage_evts in self.stage_events.values()
                 for evt in stage_evts
                 if evt.stage_id == stage_id
@@ -561,12 +583,12 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                         self._summary_header(f"Stage {stage_id} Breakdown", "-"),
                         self._summary_line(
                             "Stage latency time (ms):",
-                            float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0),
+                            self._stage_latency_ms(evt),
                         ),
-                        self._summary_line("Queue wait time (ms):", float(evt.stage_queue_wait_time_ms or 0.0)),
+                        self._summary_line("Queue wait time (ms):", self._optional_ms(evt.stage_queue_wait_time_ms)),
                         self._summary_line(
                             "Execution time (ms):",
-                            float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0),
+                            self._stage_execution_ms(evt),
                         ),
                         self._summary_line(
                             "Output processor time (ms):",
@@ -802,7 +824,7 @@ def _as_stage_request_stats(
         stats.stage_name = self._get_stage_name(stage_id, stage_meta)
         stats.request_id = req_id
         stats.final_output_type = final_output_type
-        if stats.stage_latency_time_ms == 0.0:
+        if stats.stage_latency_time_ms is None:
             stats.stage_latency_time_ms = (
                 float(stats.stage_end_ts - stats.stage_submit_ts) * 1000.0
                 if stats.stage_submit_ts is not None and stats.stage_end_ts is not None
@@ -986,16 +1008,8 @@ def build_and_log_summary(self) -> dict[str, Any]:
             return {}
         self._estimate_stage_wait_and_execution_times()
         wall_time_ms = max(0.0, (self.last_finish_ts - self.wall_start_ts) * 1000.0)
-        e2e_avg_req = (
-            sum(float(evt.request_wall_time_ms) for evt in self.e2e_events) / self.e2e_count
-            if self.e2e_count > 0
-            else 0.0
-        )
-        avg_engine_pipeline_ms = (
-            sum(float(evt.engine_pipeline_time_ms) for evt in self.e2e_events) / self.e2e_count
-            if self.e2e_count > 0
-            else 0.0
-        )
+        e2e_avg_req = self._average_e2e_field("request_wall_time_ms")
+        avg_engine_pipeline_ms = self._average_e2e_field("engine_pipeline_time_ms")
         avg_tok = (self.total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0
         first_engine_ts = min((ts for ts in self.stage_first_ts if ts is not None), default=None)
         engine_pipeline_wall_ms = (
@@ -1032,9 +1046,9 @@ def build_and_log_summary(self) -> dict[str, Any]:
         for stage_evts in self.stage_events.values():
             for evt in stage_evts:
                 stage_gen_total_ms += float(evt.stage_gen_time_ms or 0.0)
-                stage_latency_total_ms += float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0)
-                stage_queue_wait_total_ms += float(evt.stage_queue_wait_time_ms or 0.0)
-                stage_execution_total_ms += float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0)
+                stage_latency_total_ms += self._stage_latency_ms(evt)
+                stage_queue_wait_total_ms += self._optional_ms(evt.stage_queue_wait_time_ms)
+                stage_execution_total_ms += self._stage_execution_ms(evt)
                 output_processor_total_ms += float(evt.output_processor_time_ms or 0.0)
                 handoff_ms = float(evt.stage_handoff_time_ms or 0.0)
                 ar2d_ms = float(evt.ar2diffusion_time_ms or 0.0)
@@ -1063,22 +1077,14 @@ def build_and_log_summary(self) -> dict[str, Any]:
             "breakdown_delta_time_ms": float(breakdown_delta_ms),
             "total_tokens": int(self.total_tokens),
             "avg_request_wall_time_ms": float(e2e_avg_req),
-            "avg_input_preprocess_time_ms": float(
-                self.input_preprocess_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
-            ),
+            "avg_input_preprocess_time_ms": self._average(self.input_preprocess_total_ms, self.e2e_count),
             "avg_engine_pipeline_time_ms": float(avg_engine_pipeline_ms),
-            "avg_stage_gen_total_time_ms": float(stage_gen_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
-            "avg_output_processor_time_ms": float(
-                output_processor_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
-            ),
-            "avg_stage_handoff_total_time_ms": float(
-                stage_handoff_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
-            ),
-            "avg_ar2diffusion_time_ms": float(ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
-            "avg_final_output_time_ms": float(
-                self.final_output_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0
-            ),
-            "avg_breakdown_delta_time_ms": float(breakdown_delta_ms / self.e2e_count if self.e2e_count > 0 else 0.0),
+            "avg_stage_gen_total_time_ms": self._average(stage_gen_total_ms, self.e2e_count),
+            "avg_output_processor_time_ms": self._average(output_processor_total_ms, self.e2e_count),
+            "avg_stage_handoff_total_time_ms": self._average(stage_handoff_total_ms, self.e2e_count),
+            "avg_ar2diffusion_time_ms": self._average(ar2diffusion_total_ms, self.e2e_count),
+            "avg_final_output_time_ms": self._average(self.final_output_total_ms, self.e2e_count),
+            "avg_breakdown_delta_time_ms": self._average(breakdown_delta_ms, self.e2e_count),
             "avg_tokens_per_s": float(avg_tok),
         }
         for edge, handoff_time in sorted(handoff_edge_totals.items()):

From d054cbc92d940143ea04a0eff1c924253a30a44c Mon Sep 17 00:00:00 2001
From: bjf-frz <frz123db@gmail.com>
Date: Tue, 12 May 2026 02:47:55 +0000
Subject: [PATCH 11/11] Limit logged request breakdowns

Signed-off-by: bjf-frz <frz123db@gmail.com>
---
 tests/entrypoints/test_serve.py     | 17 +++++++++++
 tests/metrics/test_stats.py         | 46 +++++++++++++++++++++++++++++
 vllm_omni/entrypoints/async_omni.py |  1 +
 vllm_omni/entrypoints/cli/serve.py  |  7 +++++
 vllm_omni/entrypoints/omni.py       |  1 +
 vllm_omni/entrypoints/omni_base.py  |  1 +
 vllm_omni/metrics/stats.py          | 27 ++++++++++++++++-
 7 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py
index 1267a189a32..e02b79d9b58 100644
--- a/tests/entrypoints/test_serve.py
+++ b/tests/entrypoints/test_serve.py
@@ -31,6 +31,23 @@ def test_serve_parser_accepts_no_async_chunk() -> None:
     assert args.async_chunk is False
 
 
+def test_serve_parser_accepts_log_stats_request_breakdown_limit() -> None:
+    try:
+        from vllm.utils.argparse_utils import FlexibleArgumentParser
+    except Exception as exc:
+        pytest.skip(f"Cannot build parser in this environment: {exc}")
+
+    root = FlexibleArgumentParser()
+    subparsers = root.add_subparsers(dest="subcommand")
+    cmd = OmniServeCommand()
+    cmd.subparser_init(subparsers)
+
+    argv = ["serve", "fake-model", "--omni", "--log-stats-request-breakdown-limit", "5"]
+    args = root.parse_args(argv)
+
+    assert args.log_stats_request_breakdown_limit == 5
+
+
 def _make_headless_args() -> argparse.Namespace:
     return argparse.Namespace(
         model="fake-model",
diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py
index 357b6319319..b273184cef8 100644
--- a/tests/metrics/test_stats.py
+++ b/tests/metrics/test_stats.py
@@ -113,6 +113,52 @@ def test_build_and_log_summary_e2e_only() -> None:
     assert stage_entry["stages"] == []
 
 
+def test_request_breakdown_log_limit_does_not_truncate_summary_tables() -> None:
+    agg = OrchestratorAggregator(
+        num_stages=1,
+        log_stats=True,
+        wall_start_ts=0.0,
+        final_stage_id_for_e2e=0,
+        request_breakdown_limit=2,
+    )
+    for idx in range(4):
+        agg.e2e_events.append(
+            RequestE2EStats(
+                request_id=f"r{idx}",
+                request_wall_time_ms=10.0,
+                input_preprocess_time_ms=1.0,
+                engine_pipeline_time_ms=9.0,
+                total_tokens=1,
+                transfers_total_time_ms=0.0,
+                transfers_total_bytes=0,
+            )
+        )
+    agg.e2e_count = 4
+
+    lines = agg._build_omni_metrics_summary_lines(
+        {
+            "num_of_requests": 4,
+            "request_wall_time_ms": 40.0,
+            "input_preprocess_wall_time_ms": 4.0,
+            "engine_pipeline_time_ms": 36.0,
+            "final_output_total_time_ms": 0.0,
+            "avg_input_preprocess_time_ms": 1.0,
+            "avg_final_output_time_ms": 0.0,
+        }
+    )
+    rendered = "\n".join(lines)
+
+    assert "Request r0 Breakdown" in rendered
+    assert "Request r1 Breakdown" in rendered
+    assert "Request r2 Breakdown" not in rendered
+    assert "Request r3 Breakdown" not in rendered
+    assert "Request breakdowns shown:" in rendered
+    assert "Request breakdowns omitted:" in rendered
+
+    summary = agg.build_and_log_summary()
+    assert {entry["request_id"] for entry in summary["e2e_table"]} == {"r0", "r1", "r2", "r3"}
+
+
 def test_build_and_log_summary_multiple_requests() -> None:
     agg = OrchestratorAggregator(
         num_stages=2, log_stats=True, wall_start_ts=0.0, final_stage_id_for_e2e={"r1": 1, "r2": 0}
diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py
index a57c2672e94..747a77f3717 100644
--- a/vllm_omni/entrypoints/async_omni.py
+++ b/vllm_omni/entrypoints/async_omni.py
@@ -302,6 +302,7 @@ async def generate(
                 wall_start_ts,
                 final_stage_id_for_e2e,
                 stage_metadata=getattr(self.engine, "stage_metadata", None),
+                request_breakdown_limit=self.log_stats_request_breakdown_limit,
             )
             req_state = ClientRequestState(request_id)
             req_state.metrics = metrics
diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py
index b4293d59fd7..9a28ff8076e 100644
--- a/vllm_omni/entrypoints/cli/serve.py
+++ b/vllm_omni/entrypoints/cli/serve.py
@@ -216,6 +216,13 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu
             action="store_true",
             help="Enable logging the stats.",
         )
+        omni_config_group.add_argument(
+            "--log-stats-request-breakdown-limit",
+            type=int,
+            default=5,
+            help="Maximum number of per-request breakdowns to print in the final stats summary. "
+            "Use a negative value to print all request breakdowns.",
+        )
         omni_config_group.add_argument(
             "--log-file",
             type=str,
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index ebf1301c0a0..8485d852eec 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -126,6 +126,7 @@ def _run_generation(
                 wall_start_ts,
                 req_final_stage_ids,
                 stage_metadata=getattr(self.engine, "stage_metadata", None),
+                request_breakdown_limit=self.log_stats_request_breakdown_limit,
             )
 
             for req_id, prompt in zip(request_ids, request_prompts):
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 10b64deab76..0d0b230b188 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -143,6 +143,7 @@ def __init__(
         stage_init_timeout = kwargs.pop("stage_init_timeout", 300)
         init_timeout = kwargs.pop("init_timeout", 600)
         log_stats = kwargs.pop("log_stats", False)
+        self.log_stats_request_breakdown_limit = kwargs.pop("log_stats_request_breakdown_limit", 5)
         self._enable_ar_profiler = kwargs.pop("enable_ar_profiler", False)
         # NOTE: read-only lookup — must NOT pop. Popping here drops the key
         # before it reaches ``StageConfigFactory._create_from_registry``, so
diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py
index 90f575b54be..f3f7d5c9074 100644
--- a/vllm_omni/metrics/stats.py
+++ b/vllm_omni/metrics/stats.py
@@ -161,6 +161,7 @@ def e2e_tpt(self) -> float:
 STAGE_FIELDS = _build_field_defs(StageRequestStats, STAGE_EXCLUDE, FIELD_TRANSFORMS)
 TRANSFER_FIELDS = _build_field_defs(TransferEdgeStats, TRANSFER_EXCLUDE, FIELD_TRANSFORMS)
 E2E_FIELDS = _build_field_defs(RequestE2EStats, E2E_EXCLUDE, FIELD_TRANSFORMS)
+DEFAULT_REQUEST_BREAKDOWN_LIMIT = 5
 
 
 class OrchestratorAggregator:
@@ -171,11 +172,13 @@ def __init__(
         wall_start_ts: float,
         final_stage_id_for_e2e: dict[str, int] | int,
         stage_metadata: list[dict[str, Any]] | None = None,
+        request_breakdown_limit: int | None = DEFAULT_REQUEST_BREAKDOWN_LIMIT,
     ) -> None:
         self.num_stages = int(num_stages)
         self.log_stats = bool(log_stats)
         self.final_stage_id_for_e2e = final_stage_id_for_e2e
         self.stage_metadata = list(stage_metadata or [])
+        self.request_breakdown_limit = self._normalize_request_breakdown_limit(request_breakdown_limit)
         self.init_run_state(wall_start_ts)
         self.stage_events: dict[str, list[StageRequestStats]] = {}
         self.transfer_events: dict[
@@ -236,6 +239,13 @@ def _format_seconds(self, ms: float) -> str:
     def _format_ms(self, ms: float) -> str:
         return f"{ms:,.3f} ms"
 
+    @staticmethod
+    def _normalize_request_breakdown_limit(limit: int | None) -> int | None:
+        if limit is None:
+            return None
+        limit = int(limit)
+        return None if limit < 0 else limit
+
     def _log_omni_timing(self, evt: RequestE2EStats) -> None:
         rid = evt.request_id
         stages = []
@@ -558,7 +568,22 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) ->
                 ]
             )
 
-        for rid in sorted(set(self.stage_events.keys()) | {evt.request_id for evt in self.e2e_events}):
+        all_breakdown_request_ids = sorted(set(self.stage_events.keys()) | {evt.request_id for evt in self.e2e_events})
+        if self.request_breakdown_limit is None:
+            breakdown_request_ids = all_breakdown_request_ids
+        else:
+            breakdown_request_ids = all_breakdown_request_ids[: self.request_breakdown_limit]
+        omitted_breakdowns = len(all_breakdown_request_ids) - len(breakdown_request_ids)
+        if omitted_breakdowns > 0:
+            lines.extend(
+                [
+                    "",
+                    self._summary_line("Request breakdowns shown:", len(breakdown_request_ids)),
+                    self._summary_line("Request breakdowns omitted:", omitted_breakdowns),
+                ]
+            )
+
+        for rid in breakdown_request_ids:
             stage_evts = sorted(
                 self.stage_events.get(rid, []),
                 key=lambda evt: evt.stage_id if evt.stage_id is not None else -1,