From a740028334c042063bb11961c9689fd6158fafab Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Thu, 23 Apr 2026 11:01:44 +0800 Subject: [PATCH 01/11] modify-log-stats Signed-off-by: bjf-frz --- docs/contributing/metrics.md | 22 ++++++++++++------- tests/entrypoints/test_async_omni_abort.py | 11 ++++++++-- tests/entrypoints/test_pd_disaggregation.py | 6 ++++-- tests/metrics/test_stats.py | 7 +++++- vllm_omni/engine/async_omni_engine.py | 7 ++++++ vllm_omni/entrypoints/async_omni.py | 6 ++++++ vllm_omni/entrypoints/omni.py | 4 ++++ vllm_omni/entrypoints/omni_base.py | 2 ++ vllm_omni/metrics/stats.py | 24 +++++++++++++++------ 9 files changed, 69 insertions(+), 20 deletions(-) diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md index 92dcd92ccac..b021d05b445 100644 --- a/docs/contributing/metrics.md +++ b/docs/contributing/metrics.md @@ -3,7 +3,7 @@ You can use these metrics in production to monitor the health and performance of the vLLM-omni system. Typical scenarios include: -- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `e2e_total_ms`), and resource utilization to verify that the system meets expected standards. +- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `request_latency_ms`), and resource utilization to verify that the system meets expected standards. - **Debugging and Troubleshooting**: Use detailed per-request metrics to diagnose issues, such as high transfer times or unexpected token counts. @@ -31,9 +31,10 @@ With `--log-stats` enabled, the server will output detailed metrics logs after e | Field | Value | |-----------------------------|--------------| | e2e_requests | 1 | -| e2e_wall_time_ms | 41,299.190 | +| run_wall_time_ms | 41,299.190 | +| request_latency_total_ms | 41,299.133 | | e2e_total_tokens | 5,202 | -| e2e_avg_time_per_request_ms | 41,299.190 | +| avg_run_wall_time_per_request_ms | 41,299.190 | | e2e_avg_tokens_per_s | 125.959 | | e2e_stage_0_wall_time_ms | 10,192.289 | | e2e_stage_1_wall_time_ms | 30,541.409 | @@ -43,7 +44,8 @@ With `--log-stats` enabled, the server will output detailed metrics logs after e | Field | Value | |-------------------------|------------| -| e2e_total_ms | 41,299.133 | +| request_latency_ms | 41,299.133 | +| request_submit_prep_ms | 0.000 | | e2e_total_tokens | 5,202 | | transfers_total_time_ms | 245.895 | | transfers_total_kbytes | 138,089.939| @@ -98,10 +100,13 @@ For **online inference** (serving mode), the summary is always per-request. `e2e | Field | Meaning | |---------------------------|----------------------------------------------------------------------------------------------| | `e2e_requests` | Number of completed requests. | -| `e2e_wall_time_ms` | Wall-clock time span from run start to last completion, in ms. | +| `run_wall_time_ms` | Wall-clock time span from run start to last completion, in ms. | +| `request_latency_total_ms` | Sum of per-request `request_latency_ms` across completed requests. | +| `request_submit_prep_total_ms` | Sum of pre-submit request preparation time across completed requests. | | `e2e_total_tokens` | Total tokens counted across all completed requests (stage0 input + all stage outputs). | -| `e2e_avg_time_per_request_ms` | Average wall time per request: `e2e_wall_time_ms / e2e_requests`. | -| `e2e_avg_tokens_per_s` | Average token throughput over wall time: `e2e_total_tokens * 1000 / e2e_wall_time_ms`. | +| `avg_run_wall_time_per_request_ms` | Average wall time per request: `run_wall_time_ms / e2e_requests`. | +| `avg_request_submit_prep_ms` | Average pre-submit request preparation time per completed request. | +| `e2e_avg_tokens_per_s` | Average token throughput over wall time: `e2e_total_tokens * 1000 / run_wall_time_ms`. | | `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. | --- @@ -110,7 +115,8 @@ For **online inference** (serving mode), the summary is always per-request. `e2e | Field | Meaning | |---------------------------|-----------------------------------------------------------------------| -| `e2e_total_ms` | End-to-end latency in ms. | +| `request_latency_ms` | End-to-end latency in ms, starting after request preparation and ending at final completion. | +| `request_submit_prep_ms` | Time spent preparing and submitting the request before `request_latency_ms` starts. | | `e2e_total_tokens` | Total tokens for the request (stage0 input + all stage outputs). | | `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request. | | `transfers_total_kbytes` | Sum of transfer kbytes for this request. | diff --git a/tests/entrypoints/test_async_omni_abort.py b/tests/entrypoints/test_async_omni_abort.py index b34652162d0..6ee8a6ef112 100644 --- a/tests/entrypoints/test_async_omni_abort.py +++ b/tests/entrypoints/test_async_omni_abort.py @@ -20,8 +20,15 @@ async def fake_add_request_async(*, request_id, prompt, sampling_params_list, fi async def fake_abort_async(request_ids): aborted_request_batches.append(list(request_ids)) - async def fake_process_results(request_id, metrics, final_stage_id_for_e2e, req_start_ts, wall_start_ts): - del metrics, final_stage_id_for_e2e, req_start_ts, wall_start_ts + async def fake_process_results( + request_id, + metrics, + final_stage_id_for_e2e, + req_start_ts, + req_submit_prep_ms, + wall_start_ts, + ): + del metrics, final_stage_id_for_e2e, req_start_ts, req_submit_prep_ms, wall_start_ts if request_id.startswith("cancel-"): await asyncio.Future() return diff --git a/tests/entrypoints/test_pd_disaggregation.py b/tests/entrypoints/test_pd_disaggregation.py index 5ffabfbf2af..6deb12b0c4a 100644 --- a/tests/entrypoints/test_pd_disaggregation.py +++ b/tests/entrypoints/test_pd_disaggregation.py @@ -255,12 +255,14 @@ def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e self.accumulated_gen_time_ms = {} self.e2e_done = set() self.e2e_count = 0 - self.e2e_total_ms = 0.0 + self.request_latency_total_ms = 0.0 + self.request_submit_prep_total_ms = 0.0 def on_stage_metrics(self, stage_id, req_id, metrics, final_output_type=None): pass - def on_finalize_request(self, stage_id, req_id, start_ts): + def on_finalize_request(self, stage_id, req_id, start_ts, request_submit_prep_ms=0.0): + del start_ts, request_submit_prep_ms self.e2e_done.add(req_id) def on_forward(self, from_stage, to_stage, req_id, size_bytes, tx_ms, use_shm): diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py index 8b18c5dffa1..6c38bffd0e2 100644 --- a/tests/metrics/test_stats.py +++ b/tests/metrics/test_stats.py @@ -59,6 +59,8 @@ def test_orchestrator_aggregator_builds_summary() -> None: summary = agg.build_and_log_summary() overall = summary["overall_summary"] assert overall["e2e_requests"] == 1 + assert overall["run_wall_time_ms"] == 70.0 + assert overall["request_submit_prep_total_ms"] == 0.0 stage_entry = _get_request_entry(summary["stage_table"], "r1") stage_ids = [row["stage_id"] for row in stage_entry["stages"]] @@ -77,7 +79,8 @@ def test_build_and_log_summary_e2e_only() -> None: agg.e2e_events.append( RequestE2EStats( request_id="r", - e2e_total_ms=10.0, + request_latency_ms=10.0, + request_submit_prep_ms=0.0, e2e_total_tokens=5, transfers_total_time_ms=0.0, transfers_total_bytes=0, @@ -86,6 +89,7 @@ def test_build_and_log_summary_e2e_only() -> None: summary = agg.build_and_log_summary() e2e_entry = _get_request_entry(summary["e2e_table"], "r") + assert e2e_entry["request_latency_ms"] == 10.0 assert e2e_entry["e2e_total_tokens"] == 5 stage_entry = _get_request_entry(summary["stage_table"], "r") assert stage_entry["stages"] == [] @@ -150,6 +154,7 @@ def test_build_and_log_summary_multiple_requests() -> None: summary = agg.build_and_log_summary() assert len(summary["stage_table"]) == 2 assert {entry["request_id"] for entry in summary["e2e_table"]} == {"r1", "r2"} + assert summary["overall_summary"]["request_latency_total_ms"] > 0.0 # Check that r1 has two stages and r2 has one r1_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r1") r2_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r2") diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 8299a577a41..cea726054c9 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1493,6 +1493,7 @@ def add_request( a queue + coroutine-switch round-trip. The Orchestrator receives a ready-to-submit OmniEngineCoreRequest. """ + build_msg_t0 = time.perf_counter() msg = self._build_add_request_message( request_id=request_id, prompt=prompt, @@ -1508,6 +1509,12 @@ def add_request( reasoning_ended=reasoning_ended, resumable=resumable, ) + build_msg_ms = (time.perf_counter() - build_msg_t0) * 1000.0 + logger.info( + "[AsyncOmniEngine] req=%s _build_add_request_message took %.3f ms", + request_id, + build_msg_ms, + ) if self.request_queue is None: raise RuntimeError("request_queue is not initialized") self.request_queue.sync_q.put_nowait(msg) diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 119232ce767..ea0e56b6594 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -282,7 +282,9 @@ async def generate( # Track per-request metrics wall_start_ts = time.time() + request_prep_start_ts = wall_start_ts req_start_ts: dict[str, float] = {} + req_submit_prep_ms: dict[str, float] = {} # Determine the final stage for E2E stats final_stage_id_for_e2e = self._compute_final_stage_id(output_modalities) @@ -321,6 +323,7 @@ async def generate( final_stage_id=final_stage_id_for_e2e, ) submit_ts = time.time() + req_submit_prep_ms[request_id] = (submit_ts - request_prep_start_ts) * 1000.0 req_state.metrics.stage_first_ts[0] = submit_ts req_start_ts[request_id] = submit_ts @@ -333,6 +336,7 @@ async def generate( metrics, final_stage_id_for_e2e, req_start_ts, + req_submit_prep_ms, wall_start_ts, ): yield output @@ -479,6 +483,7 @@ async def _process_orchestrator_results( metrics: OrchestratorMetrics, final_stage_id_for_e2e: int, req_start_ts: dict[str, float], + req_submit_prep_ms: dict[str, float], wall_start_ts: float, ) -> AsyncGenerator[OmniRequestOutput, None]: """Read results from the Orchestrator (via the request's asyncio.Queue) @@ -520,6 +525,7 @@ async def _process_orchestrator_results( stage_id, metrics, req_start_ts, + req_submit_prep_ms, wall_start_ts, final_stage_id_for_e2e, ) diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py index 223c208af98..e31a9da6726 100644 --- a/vllm_omni/entrypoints/omni.py +++ b/vllm_omni/entrypoints/omni.py @@ -114,10 +114,12 @@ def _run_generation( request_ids = [f"{i}_{uuid.uuid4()}" for i in range(len(request_prompts))] req_start_ts: dict[str, float] = {} + req_submit_prep_ms: dict[str, float] = {} wall_start_ts = time.time() req_final_stage_ids: dict[str, int] = {} for req_id, prompt in zip(request_ids, request_prompts): + request_prep_start_ts = time.time() prompt_modalities = prompt.get("modalities", None) if isinstance(prompt, dict) else None final_stage_id = self._compute_final_stage_id(prompt_modalities) req_final_stage_ids[req_id] = final_stage_id @@ -146,6 +148,7 @@ def _run_generation( final_stage_id=final_stage_id, ) submit_ts = time.time() + req_submit_prep_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0 req_state.metrics.stage_first_ts[0] = submit_ts req_start_ts[req_id] = submit_ts @@ -175,6 +178,7 @@ def _run_generation( stage_id=stage_id, metrics=req_state.metrics, req_start_ts=req_start_ts, + req_submit_prep_ms=req_submit_prep_ms, wall_start_ts=wall_start_ts, final_stage_id_for_e2e=req_final_stage_ids[req_id], ) diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 51b816fb796..b27dc3ece9e 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -368,6 +368,7 @@ def _process_single_result( stage_id: int, metrics: OrchestratorMetrics, req_start_ts: dict[str, float], + req_submit_prep_ms: dict[str, float], wall_start_ts: float, final_stage_id_for_e2e: int, ) -> OmniRequestOutput | None: @@ -398,6 +399,7 @@ def _process_single_result( stage_id, req_id, req_start_ts.get(req_id, wall_start_ts), + req_submit_prep_ms=req_submit_prep_ms.get(req_id, 0.0), ) except Exception: logger.exception("[%s] Finalize request handling error", self.__class__.__name__) diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index 46fec6fcc1f..7be2bbf21c4 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -74,14 +74,15 @@ def total_time_ms(self) -> float: @dataclass class RequestE2EStats: request_id: str - e2e_total_ms: float + request_latency_ms: float + request_submit_prep_ms: float e2e_total_tokens: int transfers_total_time_ms: float transfers_total_bytes: int @property def e2e_tpt(self) -> float: - return (self.e2e_total_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0 + return (self.request_latency_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0 # === Field Configuration === @@ -133,7 +134,8 @@ def __init__( def init_run_state(self, wall_start_ts: float) -> None: # Per-run aggregates and timing state self.stage_total_tokens = [0 for _ in range(self.num_stages)] - self.e2e_total_ms = 0.0 + self.request_latency_total_ms = 0.0 + self.request_submit_prep_total_ms = 0.0 self.e2e_total_tokens = 0 self.e2e_count = 0 self.e2e_done = set() @@ -423,6 +425,7 @@ def on_finalize_request( stage_id: int, req_id: Any, req_start_ts: float, + request_submit_prep_ms: float = 0.0, ) -> None: rid_key = str(req_id) if rid_key in self.e2e_done: @@ -444,13 +447,15 @@ def on_finalize_request( total_tokens += int(evt.num_tokens_in) total_tokens += int(evt.num_tokens_out) - self.e2e_total_ms += e2e_ms + self.request_latency_total_ms += e2e_ms + self.request_submit_prep_total_ms += float(request_submit_prep_ms) self.e2e_total_tokens += total_tokens self.e2e_count += 1 self.e2e_done.add(rid_key) per_req_record = RequestE2EStats( request_id=rid_key, - e2e_total_ms=e2e_ms, + request_latency_ms=e2e_ms, + request_submit_prep_ms=float(request_submit_prep_ms), e2e_total_tokens=total_tokens, transfers_total_time_ms=float( sum(evt.total_time_ms for evt in self.transfer_events.values() if evt.request_id == rid_key) @@ -482,9 +487,14 @@ def build_and_log_summary(self) -> dict[str, Any]: overall_summary = { "e2e_requests": int(self.e2e_count), - "e2e_wall_time_ms": float(wall_time_ms), + "run_wall_time_ms": float(wall_time_ms), + "request_latency_total_ms": float(self.request_latency_total_ms), + "request_submit_prep_total_ms": float(self.request_submit_prep_total_ms), "e2e_total_tokens": int(self.e2e_total_tokens), - "e2e_avg_time_per_request_ms": float(e2e_avg_req), + "avg_run_wall_time_per_request_ms": float(e2e_avg_req), + "avg_request_submit_prep_ms": float( + self.request_submit_prep_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 + ), "e2e_avg_tokens_per_s": float(e2e_avg_tok), } # Add stage_wall_time_ms as separate fields for each stage From 31f9e4c2c826545c41cc231c1be20743d76e8b0e Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Thu, 23 Apr 2026 19:05:55 +0800 Subject: [PATCH 02/11] Improve Omni metrics timing visibility Signed-off-by: bjf-frz --- docs/contributing/metrics.md | 40 ++++--- tests/entrypoints/test_async_omni_abort.py | 12 +- tests/entrypoints/test_pd_disaggregation.py | 19 ++- tests/metrics/test_stats.py | 15 ++- vllm_omni/engine/async_omni_engine.py | 15 ++- vllm_omni/engine/stage_init_utils.py | 1 + vllm_omni/entrypoints/async_omni.py | 26 ++-- vllm_omni/entrypoints/omni.py | 16 ++- vllm_omni/entrypoints/omni_base.py | 10 +- vllm_omni/metrics/stats.py | 124 ++++++++++++++++---- 10 files changed, 209 insertions(+), 69 deletions(-) diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md index b021d05b445..fa148366e66 100644 --- a/docs/contributing/metrics.md +++ b/docs/contributing/metrics.md @@ -3,7 +3,7 @@ You can use these metrics in production to monitor the health and performance of the vLLM-omni system. Typical scenarios include: -- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `request_latency_ms`), and resource utilization to verify that the system meets expected standards. +- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `request_wall_time_ms` / `engine_pipeline_time_ms`), and resource utilization to verify that the system meets expected standards. - **Debugging and Troubleshooting**: Use detailed per-request metrics to diagnose issues, such as high transfer times or unexpected token counts. @@ -25,16 +25,23 @@ python openai_chat_completion_client_for_multimodal_generation.py --query-type u With `--log-stats` enabled, the server will output detailed metrics logs after each request. Example output: +For multi-stage pipelines, vLLM-Omni also emits a concise per-request timing line: + +```text +[OmniTiming] req=chatcmpl-a0edd05 total=32.99s preprocess=2.80s engine=30.19s stages=[0:ar=16.23s,1:diffusion=13.96s] transfers=[0->1=0.780ms] +``` + #### Overall Summary | Field | Value | |-----------------------------|--------------| | e2e_requests | 1 | -| run_wall_time_ms | 41,299.190 | -| request_latency_total_ms | 41,299.133 | +| request_wall_time_ms | 41,299.190 | +| input_preprocess_time_ms | 57.000 | +| engine_pipeline_time_ms | 41,299.133 | | e2e_total_tokens | 5,202 | -| avg_run_wall_time_per_request_ms | 41,299.190 | +| avg_request_wall_time_ms | 41,299.190 | | e2e_avg_tokens_per_s | 125.959 | | e2e_stage_0_wall_time_ms | 10,192.289 | | e2e_stage_1_wall_time_ms | 30,541.409 | @@ -44,8 +51,9 @@ With `--log-stats` enabled, the server will output detailed metrics logs after e | Field | Value | |-------------------------|------------| -| request_latency_ms | 41,299.133 | -| request_submit_prep_ms | 0.000 | +| request_wall_time_ms | 41,356.133 | +| input_preprocess_time_ms | 57.000 | +| engine_pipeline_time_ms | 41,299.133 | | e2e_total_tokens | 5,202 | | transfers_total_time_ms | 245.895 | | transfers_total_kbytes | 138,089.939| @@ -100,13 +108,15 @@ For **online inference** (serving mode), the summary is always per-request. `e2e | Field | Meaning | |---------------------------|----------------------------------------------------------------------------------------------| | `e2e_requests` | Number of completed requests. | -| `run_wall_time_ms` | Wall-clock time span from run start to last completion, in ms. | -| `request_latency_total_ms` | Sum of per-request `request_latency_ms` across completed requests. | -| `request_submit_prep_total_ms` | Sum of pre-submit request preparation time across completed requests. | +| `request_wall_time_ms` | Wall-clock time span from request preparation start to final completion, in ms. | +| `input_preprocess_time_ms` | Time spent preparing and submitting requests before the engine pipeline starts. | +| `build_add_request_message_time_ms` | Time spent inside `AsyncOmniEngine._build_add_request_message()`. | +| `engine_pipeline_time_ms` | Time from engine request submission to final completion. | | `e2e_total_tokens` | Total tokens counted across all completed requests (stage0 input + all stage outputs). | -| `avg_run_wall_time_per_request_ms` | Average wall time per request: `run_wall_time_ms / e2e_requests`. | -| `avg_request_submit_prep_ms` | Average pre-submit request preparation time per completed request. | -| `e2e_avg_tokens_per_s` | Average token throughput over wall time: `e2e_total_tokens * 1000 / run_wall_time_ms`. | +| `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / e2e_requests`. | +| `avg_input_preprocess_time_ms` | Average pre-submit request preparation time per completed request. | +| `avg_engine_pipeline_time_ms` | Average engine pipeline time per completed request. | +| `e2e_avg_tokens_per_s` | Average token throughput over wall time: `e2e_total_tokens * 1000 / request_wall_time_ms`. | | `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. | --- @@ -115,8 +125,10 @@ For **online inference** (serving mode), the summary is always per-request. `e2e | Field | Meaning | |---------------------------|-----------------------------------------------------------------------| -| `request_latency_ms` | End-to-end latency in ms, starting after request preparation and ending at final completion. | -| `request_submit_prep_ms` | Time spent preparing and submitting the request before `request_latency_ms` starts. | +| `request_wall_time_ms` | End-to-end latency in ms, including input preprocessing and engine pipeline time. | +| `input_preprocess_time_ms` | Time spent preparing and submitting the request before `engine_pipeline_time_ms` starts. | +| `build_add_request_message_time_ms` | Time spent inside `_build_add_request_message()` for this request. | +| `engine_pipeline_time_ms` | Time from engine request submission to final completion. | | `e2e_total_tokens` | Total tokens for the request (stage0 input + all stage outputs). | | `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request. | | `transfers_total_kbytes` | Sum of transfer kbytes for this request. | diff --git a/tests/entrypoints/test_async_omni_abort.py b/tests/entrypoints/test_async_omni_abort.py index 6ee8a6ef112..f7144086689 100644 --- a/tests/entrypoints/test_async_omni_abort.py +++ b/tests/entrypoints/test_async_omni_abort.py @@ -25,10 +25,18 @@ async def fake_process_results( metrics, final_stage_id_for_e2e, req_start_ts, - req_submit_prep_ms, + input_preprocess_time_ms, + build_add_request_message_time_ms, wall_start_ts, ): - del metrics, final_stage_id_for_e2e, req_start_ts, req_submit_prep_ms, wall_start_ts + del ( + metrics, + final_stage_id_for_e2e, + req_start_ts, + input_preprocess_time_ms, + build_add_request_message_time_ms, + wall_start_ts, + ) if request_id.startswith("cancel-"): await asyncio.Future() return diff --git a/tests/entrypoints/test_pd_disaggregation.py b/tests/entrypoints/test_pd_disaggregation.py index 6deb12b0c4a..4b220fbcb8c 100644 --- a/tests/entrypoints/test_pd_disaggregation.py +++ b/tests/entrypoints/test_pd_disaggregation.py @@ -246,7 +246,8 @@ def _setup_ipc_mocks(monkeypatch): def _setup_log_mocks(monkeypatch): class _FakeOrchestratorAggregator: - def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e2e=None): + def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e2e=None, **kwargs): + del kwargs self.num_stages = num_stages self.enable_stats = enable_stats self.stage_first_ts = [None] * num_stages @@ -255,14 +256,22 @@ def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e self.accumulated_gen_time_ms = {} self.e2e_done = set() self.e2e_count = 0 - self.request_latency_total_ms = 0.0 - self.request_submit_prep_total_ms = 0.0 + self.engine_pipeline_total_ms = 0.0 + self.input_preprocess_total_ms = 0.0 + self.build_add_request_message_total_ms = 0.0 def on_stage_metrics(self, stage_id, req_id, metrics, final_output_type=None): pass - def on_finalize_request(self, stage_id, req_id, start_ts, request_submit_prep_ms=0.0): - del start_ts, request_submit_prep_ms + def on_finalize_request( + self, + stage_id, + req_id, + start_ts, + input_preprocess_time_ms=0.0, + build_add_request_message_time_ms=0.0, + ): + del start_ts, input_preprocess_time_ms, build_add_request_message_time_ms self.e2e_done.add(req_id) def on_forward(self, from_stage, to_stage, req_id, size_bytes, tx_ms, use_shm): diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py index 6c38bffd0e2..aa81b09421f 100644 --- a/tests/metrics/test_stats.py +++ b/tests/metrics/test_stats.py @@ -59,8 +59,8 @@ def test_orchestrator_aggregator_builds_summary() -> None: summary = agg.build_and_log_summary() overall = summary["overall_summary"] assert overall["e2e_requests"] == 1 - assert overall["run_wall_time_ms"] == 70.0 - assert overall["request_submit_prep_total_ms"] == 0.0 + assert overall["request_wall_time_ms"] == 70.0 + assert overall["input_preprocess_time_ms"] == 0.0 stage_entry = _get_request_entry(summary["stage_table"], "r1") stage_ids = [row["stage_id"] for row in stage_entry["stages"]] @@ -79,8 +79,10 @@ def test_build_and_log_summary_e2e_only() -> None: agg.e2e_events.append( RequestE2EStats( request_id="r", - request_latency_ms=10.0, - request_submit_prep_ms=0.0, + request_wall_time_ms=10.0, + input_preprocess_time_ms=0.0, + build_add_request_message_time_ms=0.0, + engine_pipeline_time_ms=10.0, e2e_total_tokens=5, transfers_total_time_ms=0.0, transfers_total_bytes=0, @@ -89,7 +91,8 @@ def test_build_and_log_summary_e2e_only() -> None: summary = agg.build_and_log_summary() e2e_entry = _get_request_entry(summary["e2e_table"], "r") - assert e2e_entry["request_latency_ms"] == 10.0 + assert e2e_entry["request_wall_time_ms"] == 10.0 + assert e2e_entry["engine_pipeline_time_ms"] == 10.0 assert e2e_entry["e2e_total_tokens"] == 5 stage_entry = _get_request_entry(summary["stage_table"], "r") assert stage_entry["stages"] == [] @@ -154,7 +157,7 @@ def test_build_and_log_summary_multiple_requests() -> None: summary = agg.build_and_log_summary() assert len(summary["stage_table"]) == 2 assert {entry["request_id"] for entry in summary["e2e_table"]} == {"r1", "r2"} - assert summary["overall_summary"]["request_latency_total_ms"] > 0.0 + assert summary["overall_summary"]["engine_pipeline_time_ms"] > 0.0 # Check that r1 has two stages and r2 has one r1_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r1") r2_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r2") diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index cea726054c9..bd289ca6b1b 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1485,7 +1485,7 @@ def add_request( reasoning_ended: bool | None = None, *, resumable: bool = False, - ) -> None: + ) -> dict[str, float]: """Process stage-0 input locally, then send to the Orchestrator. Input processing and output @@ -1510,11 +1510,6 @@ def add_request( resumable=resumable, ) build_msg_ms = (time.perf_counter() - build_msg_t0) * 1000.0 - logger.info( - "[AsyncOmniEngine] req=%s _build_add_request_message took %.3f ms", - request_id, - build_msg_ms, - ) if self.request_queue is None: raise RuntimeError("request_queue is not initialized") self.request_queue.sync_q.put_nowait(msg) @@ -1527,6 +1522,10 @@ def add_request( stage0_params = effective_spl[0] if effective_spl else None if stage0_params is not None: self._enqueue_cfg_companions(request_id, original_prompt, stage0_params, effective_spl) + return { + "input_preprocess_time_ms": build_msg_ms, + "build_add_request_message_time_ms": build_msg_ms, + } async def add_request_async( self, @@ -1544,9 +1543,9 @@ async def add_request_async( reasoning_ended: bool | None = None, *, resumable: bool = False, - ) -> None: + ) -> dict[str, float]: """Async add_request API.""" - self.add_request( + return self.add_request( request_id=request_id, prompt=prompt, prompt_text=prompt_text, diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 9fdcd6216ee..2249efc57f4 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -765,6 +765,7 @@ def finalize_initialized_stages( "final_output": stage_client.final_output, "final_output_type": stage_client.final_output_type, "stage_type": stage_client.stage_type, + "model_stage": getattr(stage_client, "model_stage", None), } for stage_client in initialized_stage_clients ] diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index ea0e56b6594..2641e8437b3 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -284,7 +284,8 @@ async def generate( wall_start_ts = time.time() request_prep_start_ts = wall_start_ts req_start_ts: dict[str, float] = {} - req_submit_prep_ms: dict[str, float] = {} + input_preprocess_time_ms: dict[str, float] = {} + build_add_request_message_time_ms: dict[str, float] = {} # Determine the final stage for E2E stats final_stage_id_for_e2e = self._compute_final_stage_id(output_modalities) @@ -294,6 +295,7 @@ async def generate( self.log_stats, wall_start_ts, final_stage_id_for_e2e, + stage_metadata=getattr(self.engine, "stage_metadata", None), ) req_state = ClientRequestState(request_id) req_state.metrics = metrics @@ -315,15 +317,22 @@ async def generate( sampling_params_list=req_sp_list, final_stage_id=final_stage_id_for_e2e, ) + add_request_timings = {} else: - await self.engine.add_request_async( + add_request_timings = await self.engine.add_request_async( request_id=request_id, prompt=prompt, sampling_params_list=req_sp_list, final_stage_id=final_stage_id_for_e2e, - ) + ) or {} submit_ts = time.time() - req_submit_prep_ms[request_id] = (submit_ts - request_prep_start_ts) * 1000.0 + input_preprocess_time_ms[request_id] = (submit_ts - request_prep_start_ts) * 1000.0 + build_add_request_message_time_ms[request_id] = float( + add_request_timings.get( + "build_add_request_message_time_ms", + input_preprocess_time_ms[request_id], + ) + ) req_state.metrics.stage_first_ts[0] = submit_ts req_start_ts[request_id] = submit_ts @@ -336,7 +345,8 @@ async def generate( metrics, final_stage_id_for_e2e, req_start_ts, - req_submit_prep_ms, + input_preprocess_time_ms, + build_add_request_message_time_ms, wall_start_ts, ): yield output @@ -483,7 +493,8 @@ async def _process_orchestrator_results( metrics: OrchestratorMetrics, final_stage_id_for_e2e: int, req_start_ts: dict[str, float], - req_submit_prep_ms: dict[str, float], + input_preprocess_time_ms: dict[str, float], + build_add_request_message_time_ms: dict[str, float], wall_start_ts: float, ) -> AsyncGenerator[OmniRequestOutput, None]: """Read results from the Orchestrator (via the request's asyncio.Queue) @@ -525,7 +536,8 @@ async def _process_orchestrator_results( stage_id, metrics, req_start_ts, - req_submit_prep_ms, + input_preprocess_time_ms, + build_add_request_message_time_ms, wall_start_ts, final_stage_id_for_e2e, ) diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py index e31a9da6726..7432b9cabae 100644 --- a/vllm_omni/entrypoints/omni.py +++ b/vllm_omni/entrypoints/omni.py @@ -114,7 +114,8 @@ def _run_generation( request_ids = [f"{i}_{uuid.uuid4()}" for i in range(len(request_prompts))] req_start_ts: dict[str, float] = {} - req_submit_prep_ms: dict[str, float] = {} + input_preprocess_time_ms: dict[str, float] = {} + build_add_request_message_time_ms: dict[str, float] = {} wall_start_ts = time.time() req_final_stage_ids: dict[str, int] = {} @@ -129,6 +130,7 @@ def _run_generation( self.log_stats, wall_start_ts, final_stage_id, + stage_metadata=getattr(self.engine, "stage_metadata", None), ) req_state = ClientRequestState(req_id) req_state.metrics = metrics @@ -141,14 +143,17 @@ def _run_generation( p_id = pd_pair[0] req_sp_list[p_id] = self._prepare_prefill_sampling_params(req_id, req_sp_list[p_id]) - self.engine.add_request( + add_request_timings = self.engine.add_request( request_id=req_id, prompt=prompt, sampling_params_list=req_sp_list, final_stage_id=final_stage_id, - ) + ) or {} submit_ts = time.time() - req_submit_prep_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0 + input_preprocess_time_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0 + build_add_request_message_time_ms[req_id] = float( + add_request_timings.get("build_add_request_message_time_ms", input_preprocess_time_ms[req_id]) + ) req_state.metrics.stage_first_ts[0] = submit_ts req_start_ts[req_id] = submit_ts @@ -178,7 +183,8 @@ def _run_generation( stage_id=stage_id, metrics=req_state.metrics, req_start_ts=req_start_ts, - req_submit_prep_ms=req_submit_prep_ms, + input_preprocess_time_ms=input_preprocess_time_ms, + build_add_request_message_time_ms=build_add_request_message_time_ms, wall_start_ts=wall_start_ts, final_stage_id_for_e2e=req_final_stage_ids[req_id], ) diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index b27dc3ece9e..2d25860ef1e 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -7,6 +7,7 @@ import types import weakref from collections.abc import Sequence +from pprint import pformat from typing import TYPE_CHECKING, Any, Literal import huggingface_hub @@ -249,6 +250,9 @@ def _log_summary_and_cleanup(self, request_id: str) -> None: try: if req_state is None or req_state.metrics is None: return + summary = req_state.metrics.build_and_log_summary() + if summary: + logger.debug("[Summary] %s", pformat(summary, sort_dicts=False)) except Exception: logger.exception( "[%s] Failed to build/log summary for req=%s", @@ -368,7 +372,8 @@ def _process_single_result( stage_id: int, metrics: OrchestratorMetrics, req_start_ts: dict[str, float], - req_submit_prep_ms: dict[str, float], + input_preprocess_time_ms: dict[str, float], + build_add_request_message_time_ms: dict[str, float], wall_start_ts: float, final_stage_id_for_e2e: int, ) -> OmniRequestOutput | None: @@ -399,7 +404,8 @@ def _process_single_result( stage_id, req_id, req_start_ts.get(req_id, wall_start_ts), - req_submit_prep_ms=req_submit_prep_ms.get(req_id, 0.0), + input_preprocess_time_ms=input_preprocess_time_ms.get(req_id, 0.0), + build_add_request_message_time_ms=build_add_request_message_time_ms.get(req_id, 0.0), ) except Exception: logger.exception("[%s] Finalize request handling error", self.__class__.__name__) diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index 7be2bbf21c4..4b22670ad07 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -36,6 +36,8 @@ class StageRequestStats: rx_in_flight_time_ms: float stage_stats: StageStats stage_id: int | None = None + stage_name: str | None = None + stage_type: str | None = None final_output_type: str | None = None request_id: str | None = None postprocess_time_ms: float = 0.0 @@ -74,15 +76,17 @@ def total_time_ms(self) -> float: @dataclass class RequestE2EStats: request_id: str - request_latency_ms: float - request_submit_prep_ms: float + request_wall_time_ms: float + input_preprocess_time_ms: float + build_add_request_message_time_ms: float + engine_pipeline_time_ms: float e2e_total_tokens: int transfers_total_time_ms: float transfers_total_bytes: int @property def e2e_tpt(self) -> float: - return (self.request_latency_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0 + return (self.engine_pipeline_time_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0 # === Field Configuration === @@ -120,10 +124,12 @@ def __init__( log_stats: bool, wall_start_ts: float, final_stage_id_for_e2e: dict[str, int] | int, + stage_metadata: list[dict[str, Any]] | None = None, ) -> None: self.num_stages = int(num_stages) self.log_stats = bool(log_stats) self.final_stage_id_for_e2e = final_stage_id_for_e2e + self.stage_metadata = list(stage_metadata or []) self.init_run_state(wall_start_ts) self.stage_events: dict[str, list[StageRequestStats]] = {} self.transfer_events: dict[ @@ -134,8 +140,9 @@ def __init__( def init_run_state(self, wall_start_ts: float) -> None: # Per-run aggregates and timing state self.stage_total_tokens = [0 for _ in range(self.num_stages)] - self.request_latency_total_ms = 0.0 - self.request_submit_prep_total_ms = 0.0 + self.engine_pipeline_total_ms = 0.0 + self.input_preprocess_total_ms = 0.0 + self.build_add_request_message_total_ms = 0.0 self.e2e_total_tokens = 0 self.e2e_count = 0 self.e2e_done = set() @@ -150,6 +157,62 @@ def init_run_state(self, wall_start_ts: float) -> None: lambda: defaultdict(float) ) # {request_id: {diffusion_metrics_key: accumulated_metrics_data}} + def _get_stage_metadata(self, stage_id: int | None) -> dict[str, Any]: + if stage_id is None or stage_id < 0 or stage_id >= len(self.stage_metadata): + return {} + meta = self.stage_metadata[stage_id] + return dict(meta) if isinstance(meta, dict) else {} + + @staticmethod + def _get_stage_type(stage_meta: dict[str, Any]) -> str: + return str(stage_meta.get("stage_type") or "unknown") + + @classmethod + def _get_stage_name(cls, stage_id: int, stage_meta: dict[str, Any]) -> str: + stage_type = cls._get_stage_type(stage_meta) + model_stage = stage_meta.get("model_stage") + if model_stage: + return str(model_stage) + if stage_type == "diffusion": + return "diffusion" + return f"stage_{stage_id}" + + def _stage_label(self, stage_id: int | None) -> str: + if stage_id is None: + return "unknown" + stage_meta = self._get_stage_metadata(stage_id) + return f"{stage_id}:{self._get_stage_name(stage_id, stage_meta)}" + + def _format_seconds(self, ms: float) -> str: + return f"{ms / 1000.0:.2f}s" + + def _log_omni_timing(self, evt: RequestE2EStats) -> None: + rid = evt.request_id + stages = [] + for stage_evt in sorted( + self.stage_events.get(rid, []), + key=lambda e: e.stage_id if e.stage_id is not None else -1, + ): + stages.append( + f"{self._stage_label(stage_evt.stage_id)}={self._format_seconds(stage_evt.stage_gen_time_ms)}" + ) + transfers = [] + for transfer_evt in sorted( + [e for e in self.transfer_events.values() if e.request_id == rid], + key=lambda e: (e.from_stage, e.to_stage), + ): + transfers.append(f"{transfer_evt.from_stage}->{transfer_evt.to_stage}={transfer_evt.total_time_ms:.3f}ms") + + logger.info( + "[OmniTiming] req=%s total=%s preprocess=%s engine=%s stages=[%s] transfers=[%s]", + rid, + self._format_seconds(evt.request_wall_time_ms), + self._format_seconds(evt.input_preprocess_time_ms), + self._format_seconds(evt.engine_pipeline_time_ms), + ",".join(stages), + ",".join(transfers), + ) + def _get_or_create_transfer_event( self, from_stage: int, @@ -328,6 +391,9 @@ def _as_stage_request_stats( "Convert dict to StageRequestStats if needed." stats = metrics stats.stage_id = stage_id + stage_meta = self._get_stage_metadata(stage_id) + stats.stage_type = self._get_stage_type(stage_meta) + stats.stage_name = self._get_stage_name(stage_id, stage_meta) stats.request_id = req_id stats.final_output_type = final_output_type stats.diffusion_metrics = ( @@ -425,7 +491,8 @@ def on_finalize_request( stage_id: int, req_id: Any, req_start_ts: float, - request_submit_prep_ms: float = 0.0, + input_preprocess_time_ms: float = 0.0, + build_add_request_message_time_ms: float = 0.0, ) -> None: rid_key = str(req_id) if rid_key in self.e2e_done: @@ -436,7 +503,10 @@ def on_finalize_request( prev_last = self.stage_last_ts[stage_id] self.stage_last_ts[stage_id] = _t1 if prev_last is None else max(prev_last, _t1) self.last_finish_ts = max(self.last_finish_ts, _t1) - e2e_ms = (_t1 - _t0) * 1000.0 + engine_pipeline_ms = (_t1 - _t0) * 1000.0 + input_preprocess_ms = float(input_preprocess_time_ms) + build_msg_ms = float(build_add_request_message_time_ms) + request_wall_ms = engine_pipeline_ms + input_preprocess_ms # Sum tokens from all stages for this request # Include input tokens from stage 0 + output tokens from all stages @@ -447,15 +517,18 @@ def on_finalize_request( total_tokens += int(evt.num_tokens_in) total_tokens += int(evt.num_tokens_out) - self.request_latency_total_ms += e2e_ms - self.request_submit_prep_total_ms += float(request_submit_prep_ms) + self.engine_pipeline_total_ms += engine_pipeline_ms + self.input_preprocess_total_ms += input_preprocess_ms + self.build_add_request_message_total_ms += build_msg_ms self.e2e_total_tokens += total_tokens self.e2e_count += 1 self.e2e_done.add(rid_key) per_req_record = RequestE2EStats( request_id=rid_key, - request_latency_ms=e2e_ms, - request_submit_prep_ms=float(request_submit_prep_ms), + request_wall_time_ms=request_wall_ms, + input_preprocess_time_ms=input_preprocess_ms, + build_add_request_message_time_ms=build_msg_ms, + engine_pipeline_time_ms=engine_pipeline_ms, e2e_total_tokens=total_tokens, transfers_total_time_ms=float( sum(evt.total_time_ms for evt in self.transfer_events.values() if evt.request_id == rid_key) @@ -466,6 +539,9 @@ def on_finalize_request( ) self.e2e_events.append(per_req_record) + if self.num_stages > 1: + self._log_omni_timing(per_req_record) + def build_and_log_summary(self) -> dict[str, Any]: if not self.log_stats: return {} @@ -487,13 +563,17 @@ def build_and_log_summary(self) -> dict[str, Any]: overall_summary = { "e2e_requests": int(self.e2e_count), - "run_wall_time_ms": float(wall_time_ms), - "request_latency_total_ms": float(self.request_latency_total_ms), - "request_submit_prep_total_ms": float(self.request_submit_prep_total_ms), + "request_wall_time_ms": float(wall_time_ms), + "input_preprocess_time_ms": float(self.input_preprocess_total_ms), + "build_add_request_message_time_ms": float(self.build_add_request_message_total_ms), + "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms), "e2e_total_tokens": int(self.e2e_total_tokens), - "avg_run_wall_time_per_request_ms": float(e2e_avg_req), - "avg_request_submit_prep_ms": float( - self.request_submit_prep_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 + "avg_request_wall_time_ms": float(e2e_avg_req), + "avg_input_preprocess_time_ms": float( + self.input_preprocess_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 + ), + "avg_engine_pipeline_time_ms": float( + self.engine_pipeline_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 ), "e2e_avg_tokens_per_s": float(e2e_avg_tok), } @@ -561,7 +641,11 @@ def build_and_log_summary(self) -> dict[str, Any]: # then remove diffusion_metrics from the table stage_rows = [] for evt in stage_evts: - row = {"stage_id": evt.stage_id, **_build_row(evt, local_stage_fields)} + row = { + "stage": self._stage_label(evt.stage_id), + "stage_id": evt.stage_id, + **_build_row(evt, local_stage_fields), + } if evt.diffusion_metrics: row.update(evt.diffusion_metrics) row.pop("diffusion_metrics", None) # Remove the dict itself @@ -574,7 +658,7 @@ def build_and_log_summary(self) -> dict[str, Any]: all_value_fields = set() for row in stage_rows: for k in row.keys(): - if k != "stage_id": + if k not in ("stage", "stage_id"): all_value_fields.add(k) value_fields_list = [] for field in sorted(all_value_fields): @@ -593,7 +677,7 @@ def build_and_log_summary(self) -> dict[str, Any]: _format_table( f"StageRequestStats [request_id={rid}]", stage_rows, - column_key="stage_id", + column_key="stage", value_fields=value_fields_list, ), ) From f366015ccb43852f1f08d59bf6dd7710e9c49c25 Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Thu, 23 Apr 2026 20:13:38 +0800 Subject: [PATCH 03/11] Refine Omni timing metrics Signed-off-by: bjf-frz --- docs/contributing/metrics.md | 3 +- tests/entrypoints/test_async_omni_abort.py | 2 - tests/entrypoints/test_pd_disaggregation.py | 12 +---- tests/metrics/test_stats.py | 3 +- vllm_omni/engine/async_omni_engine.py | 12 ++--- vllm_omni/engine/cfg_companion_tracker.py | 3 +- vllm_omni/engine/orchestrator.py | 37 ++++++++-------- vllm_omni/entrypoints/async_omni.py | 15 +------ vllm_omni/entrypoints/omni.py | 9 +--- vllm_omni/entrypoints/omni_base.py | 2 - vllm_omni/metrics/stats.py | 49 ++++++++------------- 11 files changed, 51 insertions(+), 96 deletions(-) diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md index fa148366e66..f1fc37cbddf 100644 --- a/docs/contributing/metrics.md +++ b/docs/contributing/metrics.md @@ -110,7 +110,6 @@ For **online inference** (serving mode), the summary is always per-request. `e2e | `e2e_requests` | Number of completed requests. | | `request_wall_time_ms` | Wall-clock time span from request preparation start to final completion, in ms. | | `input_preprocess_time_ms` | Time spent preparing and submitting requests before the engine pipeline starts. | -| `build_add_request_message_time_ms` | Time spent inside `AsyncOmniEngine._build_add_request_message()`. | | `engine_pipeline_time_ms` | Time from engine request submission to final completion. | | `e2e_total_tokens` | Total tokens counted across all completed requests (stage0 input + all stage outputs). | | `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / e2e_requests`. | @@ -127,7 +126,6 @@ For **online inference** (serving mode), the summary is always per-request. `e2e |---------------------------|-----------------------------------------------------------------------| | `request_wall_time_ms` | End-to-end latency in ms, including input preprocessing and engine pipeline time. | | `input_preprocess_time_ms` | Time spent preparing and submitting the request before `engine_pipeline_time_ms` starts. | -| `build_add_request_message_time_ms` | Time spent inside `_build_add_request_message()` for this request. | | `engine_pipeline_time_ms` | Time from engine request submission to final completion. | | `e2e_total_tokens` | Total tokens for the request (stage0 input + all stage outputs). | | `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request. | @@ -148,6 +146,7 @@ For **online inference** (serving mode), the summary is always per-request. `e2e | `image_num` | Number of images generated (for diffusion/image stages). | | `resolution` | Image resolution (for diffusion/image stages). | | `postprocess_time_ms` | Diffusion/image: post-processing time in ms. | +| `ar2diffusion_time_ms` | Time spent converting AR-stage output into the diffusion-stage prompt before diffusion request submission. | --- diff --git a/tests/entrypoints/test_async_omni_abort.py b/tests/entrypoints/test_async_omni_abort.py index f7144086689..baf488cada8 100644 --- a/tests/entrypoints/test_async_omni_abort.py +++ b/tests/entrypoints/test_async_omni_abort.py @@ -26,7 +26,6 @@ async def fake_process_results( final_stage_id_for_e2e, req_start_ts, input_preprocess_time_ms, - build_add_request_message_time_ms, wall_start_ts, ): del ( @@ -34,7 +33,6 @@ async def fake_process_results( final_stage_id_for_e2e, req_start_ts, input_preprocess_time_ms, - build_add_request_message_time_ms, wall_start_ts, ) if request_id.startswith("cancel-"): diff --git a/tests/entrypoints/test_pd_disaggregation.py b/tests/entrypoints/test_pd_disaggregation.py index 4b220fbcb8c..b03f425084e 100644 --- a/tests/entrypoints/test_pd_disaggregation.py +++ b/tests/entrypoints/test_pd_disaggregation.py @@ -258,20 +258,12 @@ def __init__(self, num_stages, enable_stats, wall_start_ts, final_stage_id_for_e self.e2e_count = 0 self.engine_pipeline_total_ms = 0.0 self.input_preprocess_total_ms = 0.0 - self.build_add_request_message_total_ms = 0.0 def on_stage_metrics(self, stage_id, req_id, metrics, final_output_type=None): pass - def on_finalize_request( - self, - stage_id, - req_id, - start_ts, - input_preprocess_time_ms=0.0, - build_add_request_message_time_ms=0.0, - ): - del start_ts, input_preprocess_time_ms, build_add_request_message_time_ms + def on_finalize_request(self, stage_id, req_id, start_ts, input_preprocess_time_ms=0.0): + del start_ts, input_preprocess_time_ms self.e2e_done.add(req_id) def on_forward(self, from_stage, to_stage, req_id, size_bytes, tx_ms, use_shm): diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py index aa81b09421f..5c60f02ca18 100644 --- a/tests/metrics/test_stats.py +++ b/tests/metrics/test_stats.py @@ -39,6 +39,7 @@ def test_orchestrator_aggregator_builds_summary() -> None: stage_stats=StageStats(), ), ) + agg.record_ar2diffusion_time(0, "r1", 12.5) agg.on_stage_metrics( 1, "r1", @@ -65,6 +66,7 @@ def test_orchestrator_aggregator_builds_summary() -> None: stage_entry = _get_request_entry(summary["stage_table"], "r1") stage_ids = [row["stage_id"] for row in stage_entry["stages"]] assert stage_ids == [0, 1] + assert stage_entry["stages"][0]["ar2diffusion_time_ms"] == 12.5 transfer_entry = _get_request_entry(summary["trans_table"], "r1") assert transfer_entry["transfers"][0]["edge"] == "0->1" @@ -81,7 +83,6 @@ def test_build_and_log_summary_e2e_only() -> None: request_id="r", request_wall_time_ms=10.0, input_preprocess_time_ms=0.0, - build_add_request_message_time_ms=0.0, engine_pipeline_time_ms=10.0, e2e_total_tokens=5, transfers_total_time_ms=0.0, diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index bd289ca6b1b..8299a577a41 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1485,7 +1485,7 @@ def add_request( reasoning_ended: bool | None = None, *, resumable: bool = False, - ) -> dict[str, float]: + ) -> None: """Process stage-0 input locally, then send to the Orchestrator. Input processing and output @@ -1493,7 +1493,6 @@ def add_request( a queue + coroutine-switch round-trip. The Orchestrator receives a ready-to-submit OmniEngineCoreRequest. """ - build_msg_t0 = time.perf_counter() msg = self._build_add_request_message( request_id=request_id, prompt=prompt, @@ -1509,7 +1508,6 @@ def add_request( reasoning_ended=reasoning_ended, resumable=resumable, ) - build_msg_ms = (time.perf_counter() - build_msg_t0) * 1000.0 if self.request_queue is None: raise RuntimeError("request_queue is not initialized") self.request_queue.sync_q.put_nowait(msg) @@ -1522,10 +1520,6 @@ def add_request( stage0_params = effective_spl[0] if effective_spl else None if stage0_params is not None: self._enqueue_cfg_companions(request_id, original_prompt, stage0_params, effective_spl) - return { - "input_preprocess_time_ms": build_msg_ms, - "build_add_request_message_time_ms": build_msg_ms, - } async def add_request_async( self, @@ -1543,9 +1537,9 @@ async def add_request_async( reasoning_ended: bool | None = None, *, resumable: bool = False, - ) -> dict[str, float]: + ) -> None: """Async add_request API.""" - return self.add_request( + self.add_request( request_id=request_id, prompt=prompt, prompt_text=prompt_text, diff --git a/vllm_omni/engine/cfg_companion_tracker.py b/vllm_omni/engine/cfg_companion_tracker.py index b9dfae833e2..74cdf818775 100644 --- a/vllm_omni/engine/cfg_companion_tracker.py +++ b/vllm_omni/engine/cfg_companion_tracker.py @@ -80,7 +80,7 @@ def on_companion_completed(self, companion_id: str) -> str | None: return parent_id return None - def defer_parent(self, parent_id: str, engine_outputs: Any, stage_id: int) -> None: + def defer_parent(self, parent_id: str, engine_outputs: Any, stage_id: int, stage_metrics: Any | None = None) -> None: """Hold parent result while waiting for companions to finish.""" # TODO: Add timeout/error recovery when the orchestrator grows a # companion-failure path. Today deferred parents are released only when @@ -88,6 +88,7 @@ def defer_parent(self, parent_id: str, engine_outputs: Any, stage_id: int) -> No self._pending_parents[parent_id] = { "engine_outputs": engine_outputs, "stage_id": stage_id, + "stage_metrics": stage_metrics, } logger.debug("Parent %s deferred, waiting for CFG companions", parent_id) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index a7edf43394a..24830b9b840 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -425,17 +425,6 @@ async def _route_output( "stage_submit_ts": submit_ts, } ) - elif stage_metrics is not None: - await self.output_async_queue.put( - { - "type": "stage_metrics", - "request_id": req_id, - "stage_id": stage_id, - "metrics": stage_metrics, - "stage_submit_ts": submit_ts, - } - ) - # PD disaggregation: extract KV transfer params from prefill stage output if self._pd_pair is not None and finished and stage_id == self._pd_pair[0]: kv_params = getattr(output, "kv_transfer_params", None) @@ -464,13 +453,14 @@ async def _route_output( and self._cfg_tracker.has_companions(req_id) and not self._cfg_tracker.all_companions_done(req_id) ): - self._cfg_tracker.defer_parent(req_id, output, stage_id) + self._cfg_tracker.defer_parent(req_id, output, stage_id, stage_metrics) else: await self._forward_to_next_stage( req_id, stage_id, output, req_state, + stage_metrics=stage_metrics, is_streaming_session=req_state.streaming.enabled, is_final_update=False, ) @@ -481,10 +471,22 @@ async def _route_output( stage_id, output, req_state, + stage_metrics=stage_metrics, is_streaming_session=True, is_final_update=True, ) + if not stage_client.final_output and stage_metrics is not None: + await self.output_async_queue.put( + { + "type": "stage_metrics", + "request_id": req_id, + "stage_id": stage_id, + "metrics": stage_metrics, + "stage_submit_ts": submit_ts, + } + ) + if finished and stage_id == req_state.final_stage_id: # PD: clean up any lingering KV params for this request self._pd_kv_params.pop(req_id, None) @@ -509,6 +511,7 @@ async def _handle_cfg_companion_ready(self, req_id: str) -> None: deferred["stage_id"], deferred["engine_outputs"], parent_state, + stage_metrics=deferred.get("stage_metrics"), ) async def _handle_kv_ready_raw_outputs(self, stage_id: int, raw_outputs: EngineCoreOutputs) -> None: @@ -657,6 +660,7 @@ async def _forward_to_next_stage( stage_id: int, output: Any, req_state: OrchestratorRequestState, + stage_metrics: Any | None = None, *, is_streaming_session: bool = False, is_final_update: bool = False, @@ -682,13 +686,8 @@ async def _forward_to_next_stage( False, ) _dt_ar2d = (_time.perf_counter() - _t_ar2d) * 1000 - logger.info( - "[Orchestrator] ar2diffusion req=%s wall_time=%.3fms stage=%d->%d", - req_id, - _dt_ar2d, - stage_id, - next_stage_id, - ) + if stage_metrics is not None: + stage_metrics.ar2diffusion_time_ms = _dt_ar2d if isinstance(diffusion_prompt, list): if not diffusion_prompt: error_output = OmniRequestOutput.from_error( diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 2641e8437b3..bac15dbb835 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -285,7 +285,6 @@ async def generate( request_prep_start_ts = wall_start_ts req_start_ts: dict[str, float] = {} input_preprocess_time_ms: dict[str, float] = {} - build_add_request_message_time_ms: dict[str, float] = {} # Determine the final stage for E2E stats final_stage_id_for_e2e = self._compute_final_stage_id(output_modalities) @@ -317,22 +316,15 @@ async def generate( sampling_params_list=req_sp_list, final_stage_id=final_stage_id_for_e2e, ) - add_request_timings = {} else: - add_request_timings = await self.engine.add_request_async( + await self.engine.add_request_async( request_id=request_id, prompt=prompt, sampling_params_list=req_sp_list, final_stage_id=final_stage_id_for_e2e, - ) or {} + ) submit_ts = time.time() input_preprocess_time_ms[request_id] = (submit_ts - request_prep_start_ts) * 1000.0 - build_add_request_message_time_ms[request_id] = float( - add_request_timings.get( - "build_add_request_message_time_ms", - input_preprocess_time_ms[request_id], - ) - ) req_state.metrics.stage_first_ts[0] = submit_ts req_start_ts[request_id] = submit_ts @@ -346,7 +338,6 @@ async def generate( final_stage_id_for_e2e, req_start_ts, input_preprocess_time_ms, - build_add_request_message_time_ms, wall_start_ts, ): yield output @@ -494,7 +485,6 @@ async def _process_orchestrator_results( final_stage_id_for_e2e: int, req_start_ts: dict[str, float], input_preprocess_time_ms: dict[str, float], - build_add_request_message_time_ms: dict[str, float], wall_start_ts: float, ) -> AsyncGenerator[OmniRequestOutput, None]: """Read results from the Orchestrator (via the request's asyncio.Queue) @@ -537,7 +527,6 @@ async def _process_orchestrator_results( metrics, req_start_ts, input_preprocess_time_ms, - build_add_request_message_time_ms, wall_start_ts, final_stage_id_for_e2e, ) diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py index 7432b9cabae..fcff91df56d 100644 --- a/vllm_omni/entrypoints/omni.py +++ b/vllm_omni/entrypoints/omni.py @@ -115,7 +115,6 @@ def _run_generation( request_ids = [f"{i}_{uuid.uuid4()}" for i in range(len(request_prompts))] req_start_ts: dict[str, float] = {} input_preprocess_time_ms: dict[str, float] = {} - build_add_request_message_time_ms: dict[str, float] = {} wall_start_ts = time.time() req_final_stage_ids: dict[str, int] = {} @@ -143,17 +142,14 @@ def _run_generation( p_id = pd_pair[0] req_sp_list[p_id] = self._prepare_prefill_sampling_params(req_id, req_sp_list[p_id]) - add_request_timings = self.engine.add_request( + self.engine.add_request( request_id=req_id, prompt=prompt, sampling_params_list=req_sp_list, final_stage_id=final_stage_id, - ) or {} + ) submit_ts = time.time() input_preprocess_time_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0 - build_add_request_message_time_ms[req_id] = float( - add_request_timings.get("build_add_request_message_time_ms", input_preprocess_time_ms[req_id]) - ) req_state.metrics.stage_first_ts[0] = submit_ts req_start_ts[req_id] = submit_ts @@ -184,7 +180,6 @@ def _run_generation( metrics=req_state.metrics, req_start_ts=req_start_ts, input_preprocess_time_ms=input_preprocess_time_ms, - build_add_request_message_time_ms=build_add_request_message_time_ms, wall_start_ts=wall_start_ts, final_stage_id_for_e2e=req_final_stage_ids[req_id], ) diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 2d25860ef1e..1cf359f7c08 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -373,7 +373,6 @@ def _process_single_result( metrics: OrchestratorMetrics, req_start_ts: dict[str, float], input_preprocess_time_ms: dict[str, float], - build_add_request_message_time_ms: dict[str, float], wall_start_ts: float, final_stage_id_for_e2e: int, ) -> OmniRequestOutput | None: @@ -405,7 +404,6 @@ def _process_single_result( req_id, req_start_ts.get(req_id, wall_start_ts), input_preprocess_time_ms=input_preprocess_time_ms.get(req_id, 0.0), - build_add_request_message_time_ms=build_add_request_message_time_ms.get(req_id, 0.0), ) except Exception: logger.exception("[%s] Finalize request handling error", self.__class__.__name__) diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index 4b22670ad07..618c292c7e7 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -43,6 +43,7 @@ class StageRequestStats: postprocess_time_ms: float = 0.0 diffusion_metrics: dict[str, int] = None audio_generated_frames: int = 0 + ar2diffusion_time_ms: float = 0.0 @property def rx_mbps(self) -> float: @@ -78,7 +79,6 @@ class RequestE2EStats: request_id: str request_wall_time_ms: float input_preprocess_time_ms: float - build_add_request_message_time_ms: float engine_pipeline_time_ms: float e2e_total_tokens: int transfers_total_time_ms: float @@ -142,7 +142,6 @@ def init_run_state(self, wall_start_ts: float) -> None: self.stage_total_tokens = [0 for _ in range(self.num_stages)] self.engine_pipeline_total_ms = 0.0 self.input_preprocess_total_ms = 0.0 - self.build_add_request_message_total_ms = 0.0 self.e2e_total_tokens = 0 self.e2e_count = 0 self.e2e_done = set() @@ -189,6 +188,7 @@ def _format_seconds(self, ms: float) -> str: def _log_omni_timing(self, evt: RequestE2EStats) -> None: rid = evt.request_id stages = [] + ar2diffusion_ms = 0.0 for stage_evt in sorted( self.stage_events.get(rid, []), key=lambda e: e.stage_id if e.stage_id is not None else -1, @@ -196,6 +196,7 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None: stages.append( f"{self._stage_label(stage_evt.stage_id)}={self._format_seconds(stage_evt.stage_gen_time_ms)}" ) + ar2diffusion_ms += float(stage_evt.ar2diffusion_time_ms or 0.0) transfers = [] for transfer_evt in sorted( [e for e in self.transfer_events.values() if e.request_id == rid], @@ -203,12 +204,14 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None: ): transfers.append(f"{transfer_evt.from_stage}->{transfer_evt.to_stage}={transfer_evt.total_time_ms:.3f}ms") + ar2diffusion = f" ar2diffusion={self._format_seconds(ar2diffusion_ms)}" if ar2diffusion_ms > 0.0 else "" logger.info( - "[OmniTiming] req=%s total=%s preprocess=%s engine=%s stages=[%s] transfers=[%s]", + "[OmniTiming] req=%s total=%s preprocess=%s engine=%s%s stages=[%s] transfers=[%s]", rid, self._format_seconds(evt.request_wall_time_ms), self._format_seconds(evt.input_preprocess_time_ms), self._format_seconds(evt.engine_pipeline_time_ms), + ar2diffusion, ",".join(stages), ",".join(transfers), ) @@ -418,19 +421,27 @@ def on_stage_metrics( self.record_transfer_rx(stats) - def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_time_ms: float) -> None: - if req_id in self.stage_events: - for stats in self.stage_events[req_id]: + def _update_stage_event_field(self, stage_id: int, req_id: Any, field_name: str, value: float) -> None: + rid_key = str(req_id) + if rid_key in self.stage_events: + for stats in self.stage_events[rid_key]: if stats.stage_id == stage_id: - stats.postprocess_time_ms = float(postproc_time_ms) + setattr(stats, field_name, float(value)) break else: logger.warning( - "Failed to record postprocess time for request %s at stage %s: no stage event found", + "Failed to record %s for request %s at stage %s: no stage event found", + field_name, req_id, stage_id, ) + def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_time_ms: float) -> None: + self._update_stage_event_field(stage_id, req_id, "postprocess_time_ms", postproc_time_ms) + + def record_ar2diffusion_time(self, stage_id: int, req_id: Any, ar2diffusion_time_ms: float) -> None: + self._update_stage_event_field(stage_id, req_id, "ar2diffusion_time_ms", ar2diffusion_time_ms) + @contextmanager def stage_postprocess_timer(self, stage_id: int, req_id: Any): """Context manager for measuring and recording stage postprocessing time. @@ -492,7 +503,6 @@ def on_finalize_request( req_id: Any, req_start_ts: float, input_preprocess_time_ms: float = 0.0, - build_add_request_message_time_ms: float = 0.0, ) -> None: rid_key = str(req_id) if rid_key in self.e2e_done: @@ -505,7 +515,6 @@ def on_finalize_request( self.last_finish_ts = max(self.last_finish_ts, _t1) engine_pipeline_ms = (_t1 - _t0) * 1000.0 input_preprocess_ms = float(input_preprocess_time_ms) - build_msg_ms = float(build_add_request_message_time_ms) request_wall_ms = engine_pipeline_ms + input_preprocess_ms # Sum tokens from all stages for this request @@ -519,7 +528,6 @@ def on_finalize_request( self.engine_pipeline_total_ms += engine_pipeline_ms self.input_preprocess_total_ms += input_preprocess_ms - self.build_add_request_message_total_ms += build_msg_ms self.e2e_total_tokens += total_tokens self.e2e_count += 1 self.e2e_done.add(rid_key) @@ -527,7 +535,6 @@ def on_finalize_request( request_id=rid_key, request_wall_time_ms=request_wall_ms, input_preprocess_time_ms=input_preprocess_ms, - build_add_request_message_time_ms=build_msg_ms, engine_pipeline_time_ms=engine_pipeline_ms, e2e_total_tokens=total_tokens, transfers_total_time_ms=float( @@ -565,7 +572,6 @@ def build_and_log_summary(self) -> dict[str, Any]: "e2e_requests": int(self.e2e_count), "request_wall_time_ms": float(wall_time_ms), "input_preprocess_time_ms": float(self.input_preprocess_total_ms), - "build_add_request_message_time_ms": float(self.build_add_request_message_total_ms), "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms), "e2e_total_tokens": int(self.e2e_total_tokens), "avg_request_wall_time_ms": float(e2e_avg_req), @@ -607,23 +613,6 @@ def build_and_log_summary(self) -> dict[str, Any]: e2e_data = _build_row(e2e_evt, E2E_FIELDS) result_e2e_table.append({"request_id": rid, **e2e_data}) - # filter out all-zero fields for logging - nonzero_e2e_fields = set() - for k, v in e2e_data.items(): - if v not in (0, 0.000, None, ""): - nonzero_e2e_fields.add(k) - value_fields_e2e = sorted(nonzero_e2e_fields) - - if value_fields_e2e: - logger.info( - "\n%s", - _format_table( - f"RequestE2EStats [request_id={rid}]", - e2e_data, - value_fields=value_fields_e2e, - ), - ) - # === Stage table (columns = stage_id) === stage_evts = sorted( self.stage_events.get(rid, []), From 49337b8c668117ed37805027d5dfbf7ece478a8c Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Thu, 23 Apr 2026 20:50:48 +0800 Subject: [PATCH 04/11] Clarify Omni timing breakdown Signed-off-by: bjf-frz --- docs/contributing/metrics.md | 23 ++++- tests/metrics/test_stats.py | 12 ++- vllm_omni/engine/orchestrator.py | 12 ++- vllm_omni/metrics/stats.py | 164 ++++++++++++++++++++++++++++++- 4 files changed, 203 insertions(+), 8 deletions(-) diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md index f1fc37cbddf..76978da9aa7 100644 --- a/docs/contributing/metrics.md +++ b/docs/contributing/metrics.md @@ -99,6 +99,8 @@ For **offline inference** (batch mode), the summary includes both system-level m For **online inference** (serving mode), the summary is always per-request. `e2e_requests` is always 1, and only request-level metrics are reported for each completion. +When `e2e_requests` is 1, average fields are omitted from the printed `Overall Summary` because they are identical to the total/request-level values. They remain meaningful for offline batches with multiple completed requests. + --- ## Parameter Details @@ -111,13 +113,33 @@ For **online inference** (serving mode), the summary is always per-request. `e2e | `request_wall_time_ms` | Wall-clock time span from request preparation start to final completion, in ms. | | `input_preprocess_time_ms` | Time spent preparing and submitting requests before the engine pipeline starts. | | `engine_pipeline_time_ms` | Time from engine request submission to final completion. | +| `stage_gen_total_time_ms` | Sum of all stage `stage_gen_time_ms` values. | +| `stage_handoff_total_time_ms` | Sum of inter-stage handoff time, measured after an upstream stage finishes and before the downstream stage is submitted. | +| `ar2diffusion_total_time_ms` | Subset of `stage_handoff_total_time_ms` spent converting AR outputs into diffusion inputs. | +| `final_output_overhead_time_ms` | Final engine-side overhead not covered by stage generation or inter-stage handoff. | +| `stage_{i}_to_{j}_handoff_time_ms` | Handoff time for a specific stage edge, e.g. `stage_0_to_1_handoff_time_ms`. | +| `stage_{i}_to_{j}_ar2diffusion_time_ms` | AR-to-diffusion conversion time included in that edge's handoff time. | | `e2e_total_tokens` | Total tokens counted across all completed requests (stage0 input + all stage outputs). | | `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / e2e_requests`. | | `avg_input_preprocess_time_ms` | Average pre-submit request preparation time per completed request. | | `avg_engine_pipeline_time_ms` | Average engine pipeline time per completed request. | +| `avg_stage_gen_total_time_ms` | Average summed stage generation time per completed request. | +| `avg_stage_handoff_total_time_ms` | Average summed inter-stage handoff time per completed request. | +| `avg_ar2diffusion_time_ms` | Average AR-to-diffusion conversion time per completed request. | +| `avg_final_output_overhead_time_ms` | Average final engine-side overhead per completed request. | | `e2e_avg_tokens_per_s` | Average token throughput over wall time: `e2e_total_tokens * 1000 / request_wall_time_ms`. | | `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. | +### Timing Breakdown + +The printed summary also includes a `Timing Breakdown` section that shows the containment relationship: + +`request_wall_time_ms = input_preprocess_time_ms + engine_pipeline_time_ms` + +`engine_pipeline_time_ms = stage_gen_total_time_ms + stage_handoff_total_time_ms + final_output_overhead_time_ms` + +`ar2diffusion_total_time_ms` is included in `stage_handoff_total_time_ms`. For a concrete AR-to-diffusion edge, `stage_0_to_1_ar2diffusion_time_ms` is included in `stage_0_to_1_handoff_time_ms`. + --- ### E2E Table (per request) @@ -146,7 +168,6 @@ For **online inference** (serving mode), the summary is always per-request. `e2e | `image_num` | Number of images generated (for diffusion/image stages). | | `resolution` | Image resolution (for diffusion/image stages). | | `postprocess_time_ms` | Diffusion/image: post-processing time in ms. | -| `ar2diffusion_time_ms` | Time spent converting AR-stage output into the diffusion-stage prompt before diffusion request submission. | --- diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py index 5c60f02ca18..1f01d6f40c9 100644 --- a/tests/metrics/test_stats.py +++ b/tests/metrics/test_stats.py @@ -37,9 +37,11 @@ def test_orchestrator_aggregator_builds_summary() -> None: rx_decode_time_ms=0.0, rx_in_flight_time_ms=0.0, stage_stats=StageStats(), + handoff_to_stage_id=1, + stage_handoff_time_ms=20.0, + ar2diffusion_time_ms=12.5, ), ) - agg.record_ar2diffusion_time(0, "r1", 12.5) agg.on_stage_metrics( 1, "r1", @@ -62,11 +64,17 @@ def test_orchestrator_aggregator_builds_summary() -> None: assert overall["e2e_requests"] == 1 assert overall["request_wall_time_ms"] == 70.0 assert overall["input_preprocess_time_ms"] == 0.0 + assert overall["stage_gen_total_time_ms"] == 50.0 + assert overall["stage_handoff_total_time_ms"] == 20.0 + assert overall["stage_0_to_1_handoff_time_ms"] == 20.0 + assert overall["ar2diffusion_total_time_ms"] == 12.5 + assert overall["stage_0_to_1_ar2diffusion_time_ms"] == 12.5 stage_entry = _get_request_entry(summary["stage_table"], "r1") stage_ids = [row["stage_id"] for row in stage_entry["stages"]] assert stage_ids == [0, 1] - assert stage_entry["stages"][0]["ar2diffusion_time_ms"] == 12.5 + assert "stage_handoff_time_ms" not in stage_entry["stages"][0] + assert "ar2diffusion_time_ms" not in stage_entry["stages"][0] transfer_entry = _get_request_entry(summary["trans_table"], "r1") assert transfer_entry["transfers"][0]["edge"] == "0->1" diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 24830b9b840..f61f6346a12 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -628,6 +628,7 @@ def _build_stage_metrics( total_token=self._agg_total_tokens[stage_id], total_gen_time_ms=self._agg_total_gen_time_ms[stage_id], ), + stage_end_ts=now, ) def _build_kv_sender_info(self, sender_stage_ids: list[int]) -> dict[int, dict[str, Any]] | None: @@ -676,7 +677,9 @@ async def _forward_to_next_stage( next_stage_resumable = is_streaming_session and not is_final_update if next_client.stage_type == "diffusion": + handoff_start_ts = getattr(stage_metrics, "stage_end_ts", None) or _time.time() self.stage_clients[stage_id].set_engine_outputs([output]) + ar2diffusion_ms = 0.0 if next_client.custom_process_input_func is not None: _t_ar2d = _time.perf_counter() diffusion_prompt = next_client.custom_process_input_func( @@ -686,6 +689,7 @@ async def _forward_to_next_stage( False, ) _dt_ar2d = (_time.perf_counter() - _t_ar2d) * 1000 + ar2diffusion_ms = _dt_ar2d if stage_metrics is not None: stage_metrics.ar2diffusion_time_ms = _dt_ar2d if isinstance(diffusion_prompt, list): @@ -737,7 +741,13 @@ async def _forward_to_next_stage( params, kv_sender_info=kv_sender_info, ) - req_state.stage_submit_ts[next_stage_id] = _time.time() + next_submit_ts = _time.time() + req_state.stage_submit_ts[next_stage_id] = next_submit_ts + if stage_metrics is not None: + stage_metrics.handoff_to_stage_id = next_stage_id + stage_metrics.stage_handoff_time_ms = max(0.0, (next_submit_ts - handoff_start_ts) * 1000.0) + if stage_metrics.ar2diffusion_time_ms == 0.0: + stage_metrics.ar2diffusion_time_ms = ar2diffusion_ms return # PD disaggregation: prefill → decode routing uses original prompt + KV transfer params diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index 618c292c7e7..7ab31cc3b65 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -43,6 +43,9 @@ class StageRequestStats: postprocess_time_ms: float = 0.0 diffusion_metrics: dict[str, int] = None audio_generated_frames: int = 0 + stage_end_ts: float | None = None + handoff_to_stage_id: int | None = None + stage_handoff_time_ms: float = 0.0 ar2diffusion_time_ms: float = 0.0 @property @@ -106,12 +109,34 @@ def e2e_tpt(self) -> float: "rx_decode_time_ms", "rx_in_flight_time_ms", "final_output_type", + "stage_end_ts", + "handoff_to_stage_id", + "stage_handoff_time_ms", + "ar2diffusion_time_ms", } TRANSFER_EXCLUDE = {"from_stage", "to_stage", "request_id", "used_shm"} E2E_EXCLUDE = {"request_id"} # Decide the order of overall summary fields, or None for auto -OVERALL_FIELDS: list[str] | None = None +OVERALL_FIELDS: list[str] | None = [ + "e2e_requests", + "request_wall_time_ms", + "input_preprocess_time_ms", + "engine_pipeline_time_ms", + "stage_gen_total_time_ms", + "stage_handoff_total_time_ms", + "ar2diffusion_total_time_ms", + "final_output_overhead_time_ms", + "e2e_total_tokens", + "avg_request_wall_time_ms", + "avg_input_preprocess_time_ms", + "avg_engine_pipeline_time_ms", + "avg_stage_gen_total_time_ms", + "avg_stage_handoff_total_time_ms", + "avg_ar2diffusion_time_ms", + "avg_final_output_overhead_time_ms", + "e2e_avg_tokens_per_s", +] STAGE_FIELDS = _build_field_defs(StageRequestStats, STAGE_EXCLUDE, FIELD_TRANSFORMS) TRANSFER_FIELDS = _build_field_defs(TransferEdgeStats, TRANSFER_EXCLUDE, FIELD_TRANSFORMS) E2E_FIELDS = _build_field_defs(RequestE2EStats, E2E_EXCLUDE, FIELD_TRANSFORMS) @@ -185,9 +210,13 @@ def _stage_label(self, stage_id: int | None) -> str: def _format_seconds(self, ms: float) -> str: return f"{ms / 1000.0:.2f}s" + def _format_ms(self, ms: float) -> str: + return f"{ms:,.3f} ms" + def _log_omni_timing(self, evt: RequestE2EStats) -> None: rid = evt.request_id stages = [] + handoffs = [] ar2diffusion_ms = 0.0 for stage_evt in sorted( self.stage_events.get(rid, []), @@ -197,6 +226,14 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None: f"{self._stage_label(stage_evt.stage_id)}={self._format_seconds(stage_evt.stage_gen_time_ms)}" ) ar2diffusion_ms += float(stage_evt.ar2diffusion_time_ms or 0.0) + if stage_evt.handoff_to_stage_id is not None and stage_evt.stage_handoff_time_ms > 0.0: + handoff = ( + f"{stage_evt.stage_id}->{stage_evt.handoff_to_stage_id}=" + f"{stage_evt.stage_handoff_time_ms:.3f}ms" + ) + if stage_evt.ar2diffusion_time_ms > 0.0: + handoff += f"(ar2diffusion={stage_evt.ar2diffusion_time_ms:.3f}ms)" + handoffs.append(handoff) transfers = [] for transfer_evt in sorted( [e for e in self.transfer_events.values() if e.request_id == rid], @@ -206,16 +243,60 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None: ar2diffusion = f" ar2diffusion={self._format_seconds(ar2diffusion_ms)}" if ar2diffusion_ms > 0.0 else "" logger.info( - "[OmniTiming] req=%s total=%s preprocess=%s engine=%s%s stages=[%s] transfers=[%s]", + "[OmniTiming] req=%s total=%s preprocess=%s engine=%s%s stages=[%s] handoffs=[%s] transfers=[%s]", rid, self._format_seconds(evt.request_wall_time_ms), self._format_seconds(evt.input_preprocess_time_ms), self._format_seconds(evt.engine_pipeline_time_ms), ar2diffusion, ",".join(stages), + ",".join(handoffs), ",".join(transfers), ) + def _build_timing_breakdown_lines(self, summary: dict[str, Any]) -> list[str]: + lines = [ + "[Timing Breakdown]", + ( + "request_wall_time_ms = input_preprocess_time_ms + engine_pipeline_time_ms " + f"= {self._format_ms(summary.get('input_preprocess_time_ms', 0.0))} + " + f"{self._format_ms(summary.get('engine_pipeline_time_ms', 0.0))} " + f"= {self._format_ms(summary.get('request_wall_time_ms', 0.0))}" + ), + ( + "engine_pipeline_time_ms = stage_gen_total_time_ms + " + "stage_handoff_total_time_ms + final_output_overhead_time_ms " + f"= {self._format_ms(summary.get('stage_gen_total_time_ms', 0.0))} + " + f"{self._format_ms(summary.get('stage_handoff_total_time_ms', 0.0))} + " + f"{self._format_ms(summary.get('final_output_overhead_time_ms', 0.0))} " + f"= {self._format_ms(summary.get('engine_pipeline_time_ms', 0.0))}" + ), + ] + + ar2diffusion_total_ms = float(summary.get("ar2diffusion_total_time_ms", 0.0) or 0.0) + if ar2diffusion_total_ms > 0.0: + lines.append( + "ar2diffusion_total_time_ms is included in stage_handoff_total_time_ms: " + f"{self._format_ms(ar2diffusion_total_ms)} <= " + f"{self._format_ms(summary.get('stage_handoff_total_time_ms', 0.0))}" + ) + + edge_handoff_fields = sorted( + k + for k in summary + if k.startswith("stage_") and "_to_" in k and k.endswith("_handoff_time_ms") + ) + for handoff_field in edge_handoff_fields: + edge_name = handoff_field.removesuffix("_handoff_time_ms") + ar2d_field = f"{edge_name}_ar2diffusion_time_ms" + if ar2d_field in summary and float(summary.get(ar2d_field, 0.0) or 0.0) > 0.0: + lines.append( + f"{ar2d_field} is included in {handoff_field}: " + f"{self._format_ms(summary[ar2d_field])} <= {self._format_ms(summary[handoff_field])}" + ) + + return lines + def _get_or_create_transfer_event( self, from_stage: int, @@ -426,7 +507,7 @@ def _update_stage_event_field(self, stage_id: int, req_id: Any, field_name: str, if rid_key in self.stage_events: for stats in self.stage_events[rid_key]: if stats.stage_id == stage_id: - setattr(stats, field_name, float(value)) + setattr(stats, field_name, value if field_name == "handoff_to_stage_id" else float(value)) break else: logger.warning( @@ -442,6 +523,16 @@ def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_tim def record_ar2diffusion_time(self, stage_id: int, req_id: Any, ar2diffusion_time_ms: float) -> None: self._update_stage_event_field(stage_id, req_id, "ar2diffusion_time_ms", ar2diffusion_time_ms) + def record_stage_handoff_time( + self, + from_stage: int, + to_stage: int, + req_id: Any, + handoff_time_ms: float, + ) -> None: + self._update_stage_event_field(from_stage, req_id, "handoff_to_stage_id", to_stage) + self._update_stage_event_field(from_stage, req_id, "stage_handoff_time_ms", handoff_time_ms) + @contextmanager def stage_postprocess_timer(self, stage_id: int, req_id: Any): """Context manager for measuring and recording stage postprocessing time. @@ -567,12 +658,36 @@ def build_and_log_summary(self) -> dict[str, Any]: else 0.0 for i in range(self.num_stages) ] + stage_gen_total_ms = 0.0 + stage_handoff_total_ms = 0.0 + ar2diffusion_total_ms = 0.0 + handoff_edge_totals: defaultdict[str, float] = defaultdict(float) + handoff_edge_ar2diffusion: defaultdict[str, float] = defaultdict(float) + for stage_evts in self.stage_events.values(): + for evt in stage_evts: + stage_gen_total_ms += float(evt.stage_gen_time_ms or 0.0) + handoff_ms = float(evt.stage_handoff_time_ms or 0.0) + ar2d_ms = float(evt.ar2diffusion_time_ms or 0.0) + stage_handoff_total_ms += handoff_ms + ar2diffusion_total_ms += ar2d_ms + if evt.stage_id is not None and evt.handoff_to_stage_id is not None: + edge = f"stage_{evt.stage_id}_to_{evt.handoff_to_stage_id}" + handoff_edge_totals[edge] += handoff_ms + handoff_edge_ar2diffusion[edge] += ar2d_ms + final_output_overhead_ms = max( + 0.0, + float(self.engine_pipeline_total_ms) - stage_gen_total_ms - stage_handoff_total_ms, + ) overall_summary = { "e2e_requests": int(self.e2e_count), "request_wall_time_ms": float(wall_time_ms), "input_preprocess_time_ms": float(self.input_preprocess_total_ms), "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms), + "stage_gen_total_time_ms": float(stage_gen_total_ms), + "stage_handoff_total_time_ms": float(stage_handoff_total_ms), + "ar2diffusion_total_time_ms": float(ar2diffusion_total_ms), + "final_output_overhead_time_ms": float(final_output_overhead_ms), "e2e_total_tokens": int(self.e2e_total_tokens), "avg_request_wall_time_ms": float(e2e_avg_req), "avg_input_preprocess_time_ms": float( @@ -581,8 +696,23 @@ def build_and_log_summary(self) -> dict[str, Any]: "avg_engine_pipeline_time_ms": float( self.engine_pipeline_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 ), + "avg_stage_gen_total_time_ms": float(stage_gen_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0), + "avg_stage_handoff_total_time_ms": float( + stage_handoff_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 + ), + "avg_ar2diffusion_time_ms": float( + ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 + ), + "avg_final_output_overhead_time_ms": float( + final_output_overhead_ms / self.e2e_count if self.e2e_count > 0 else 0.0 + ), "e2e_avg_tokens_per_s": float(e2e_avg_tok), } + for edge, handoff_time in sorted(handoff_edge_totals.items()): + overall_summary[f"{edge}_handoff_time_ms"] = float(handoff_time) + ar2d_time = handoff_edge_ar2diffusion.get(edge, 0.0) + if ar2d_time > 0.0: + overall_summary[f"{edge}_ar2diffusion_time_ms"] = float(ar2d_time) # Add stage_wall_time_ms as separate fields for each stage for idx, wall_time in enumerate(stage_wall_time_ms): overall_summary[f"e2e_stage_{idx}_wall_time_ms"] = wall_time @@ -590,7 +720,32 @@ def build_and_log_summary(self) -> dict[str, Any]: # Print overall summary # filter out all-zero fields for logging overall_fields = [] - for k in OVERALL_FIELDS or list(overall_summary.keys()): + avg_fields = { + "avg_request_wall_time_ms", + "avg_input_preprocess_time_ms", + "avg_engine_pipeline_time_ms", + "avg_stage_gen_total_time_ms", + "avg_stage_handoff_total_time_ms", + "avg_ar2diffusion_time_ms", + "avg_final_output_overhead_time_ms", + } + dynamic_overall_fields = [ + k + for k in overall_summary + if k.startswith("stage_") + and "_to_" in k + and (k.endswith("_handoff_time_ms") or k.endswith("_ar2diffusion_time_ms")) + ] + stage_wall_fields = [k for k in overall_summary if k.startswith("e2e_stage_") and k.endswith("_wall_time_ms")] + ordered_overall_fields = list(OVERALL_FIELDS or []) + insert_at = ordered_overall_fields.index("e2e_total_tokens") + ordered_overall_fields[insert_at:insert_at] = sorted( + dynamic_overall_fields, + key=lambda name: (name.replace("_ar2diffusion_time_ms", "_zz_ar2diffusion_time_ms")), + ) + sorted(stage_wall_fields) + for k in ordered_overall_fields or list(overall_summary.keys()): + if self.e2e_count <= 1 and k in avg_fields: + continue v = overall_summary.get(k, None) if v not in (0, 0.0, 0.000, None, ""): overall_fields.append(k) @@ -599,6 +754,7 @@ def build_and_log_summary(self) -> dict[str, Any]: "\n%s", _format_table("Overall Summary", overall_summary, overall_fields), ) + logger.info("\n%s", "\n".join(self._build_timing_breakdown_lines(overall_summary))) all_request_ids = sorted(set(self.stage_events.keys()) | {e.request_id for e in self.e2e_events}) From 8323309af63df33467bb4642b9f261dabde8ec5e Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Thu, 23 Apr 2026 21:01:25 +0800 Subject: [PATCH 05/11] Apply pre-commit formatting Signed-off-by: bjf-frz --- vllm_omni/engine/cfg_companion_tracker.py | 8 +++++++- vllm_omni/metrics/stats.py | 11 +++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm_omni/engine/cfg_companion_tracker.py b/vllm_omni/engine/cfg_companion_tracker.py index 74cdf818775..919a842af1d 100644 --- a/vllm_omni/engine/cfg_companion_tracker.py +++ b/vllm_omni/engine/cfg_companion_tracker.py @@ -80,7 +80,13 @@ def on_companion_completed(self, companion_id: str) -> str | None: return parent_id return None - def defer_parent(self, parent_id: str, engine_outputs: Any, stage_id: int, stage_metrics: Any | None = None) -> None: + def defer_parent( + self, + parent_id: str, + engine_outputs: Any, + stage_id: int, + stage_metrics: Any | None = None, + ) -> None: """Hold parent result while waiting for companions to finish.""" # TODO: Add timeout/error recovery when the orchestrator grows a # companion-failure path. Today deferred parents are released only when diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index 7ab31cc3b65..3e4f0f6bd7e 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -228,8 +228,7 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None: ar2diffusion_ms += float(stage_evt.ar2diffusion_time_ms or 0.0) if stage_evt.handoff_to_stage_id is not None and stage_evt.stage_handoff_time_ms > 0.0: handoff = ( - f"{stage_evt.stage_id}->{stage_evt.handoff_to_stage_id}=" - f"{stage_evt.stage_handoff_time_ms:.3f}ms" + f"{stage_evt.stage_id}->{stage_evt.handoff_to_stage_id}={stage_evt.stage_handoff_time_ms:.3f}ms" ) if stage_evt.ar2diffusion_time_ms > 0.0: handoff += f"(ar2diffusion={stage_evt.ar2diffusion_time_ms:.3f}ms)" @@ -282,9 +281,7 @@ def _build_timing_breakdown_lines(self, summary: dict[str, Any]) -> list[str]: ) edge_handoff_fields = sorted( - k - for k in summary - if k.startswith("stage_") and "_to_" in k and k.endswith("_handoff_time_ms") + k for k in summary if k.startswith("stage_") and "_to_" in k and k.endswith("_handoff_time_ms") ) for handoff_field in edge_handoff_fields: edge_name = handoff_field.removesuffix("_handoff_time_ms") @@ -700,9 +697,7 @@ def build_and_log_summary(self) -> dict[str, Any]: "avg_stage_handoff_total_time_ms": float( stage_handoff_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 ), - "avg_ar2diffusion_time_ms": float( - ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 - ), + "avg_ar2diffusion_time_ms": float(ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0), "avg_final_output_overhead_time_ms": float( final_output_overhead_ms / self.e2e_count if self.e2e_count > 0 else 0.0 ), From 35283eccfebba6901c151ce5fd23306837d0ae0b Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Fri, 24 Apr 2026 16:13:09 +0800 Subject: [PATCH 06/11] Address metrics review comments Signed-off-by: bjf-frz --- docs/contributing/metrics.md | 148 +++++------ tests/metrics/test_stats.py | 16 +- vllm_omni/engine/orchestrator.py | 19 +- vllm_omni/entrypoints/omni_base.py | 30 ++- vllm_omni/metrics/stats.py | 395 +++++++++++++++++++---------- 5 files changed, 386 insertions(+), 222 deletions(-) diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md index 76978da9aa7..ca7f596dda2 100644 --- a/docs/contributing/metrics.md +++ b/docs/contributing/metrics.md @@ -3,7 +3,7 @@ You can use these metrics in production to monitor the health and performance of the vLLM-omni system. Typical scenarios include: -- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `request_wall_time_ms` / `engine_pipeline_time_ms`), and resource utilization to verify that the system meets expected standards. +- **Performance Monitoring**: Track throughput (e.g., `avg_tokens_per_s`), latency (e.g., `request_wall_time_ms` / `engine_pipeline_time_ms`), and resource utilization to verify that the system meets expected standards. - **Debugging and Troubleshooting**: Use detailed per-request metrics to diagnose issues, such as high transfer times or unexpected token counts. @@ -32,74 +32,68 @@ For multi-stage pipelines, vLLM-Omni also emits a concise per-request timing lin ``` -#### Overall Summary +#### Omni Metrics Summary -| Field | Value | -|-----------------------------|--------------| -| e2e_requests | 1 | -| request_wall_time_ms | 41,299.190 | -| input_preprocess_time_ms | 57.000 | -| engine_pipeline_time_ms | 41,299.133 | -| e2e_total_tokens | 5,202 | -| avg_request_wall_time_ms | 41,299.190 | -| e2e_avg_tokens_per_s | 125.959 | -| e2e_stage_0_wall_time_ms | 10,192.289 | -| e2e_stage_1_wall_time_ms | 30,541.409 | -| e2e_stage_2_wall_time_ms | 207.496 | - -#### RequestE2EStats - -| Field | Value | -|-------------------------|------------| -| request_wall_time_ms | 41,356.133 | -| input_preprocess_time_ms | 57.000 | -| engine_pipeline_time_ms | 41,299.133 | -| e2e_total_tokens | 5,202 | -| transfers_total_time_ms | 245.895 | -| transfers_total_kbytes | 138,089.939| - -#### StageRequestStats - -| Field | 0 | 1 | 2 | -|------------------------|--------|--------|--------| -| audio_generated_frames | 0 | 0 | 525,525| -| batch_id | 38 | 274 | 0 | -| batch_size | 1 | 1 | 1 | -| num_tokens_in | 4,860 | 4,826 | 4,384 | -| num_tokens_out | 67 | 275 | 0 | -| postprocess_time_ms | 256.158| 0.491 | 0.000 | -| stage_gen_time_ms | 9,910.007|30,379.198|160.745| - -#### TransferEdgeStats - -| Field | 0->1 | 1->2 | -|---------------------|-------------|------------| -| size_kbytes | 109,277.349 | 28,812.591 | -| tx_time_ms | 78.701 | 18.790 | -| rx_decode_time_ms | 111.865 | 31.706 | -| in_flight_time_ms | 2.015 | 2.819 | - - -These logs include: - -- **Overall summary**: total requests, wall time, average tokens/sec, etc. - -- **E2E table**: per-request latency and token counts. - -- **Stage table**: per-stage batch and timing details. +```text +============ Omni Metrics Summary ============ +Successful requests: 1 +Total E2E time (ms): 41,356.133 +Input preprocess time (ms): 57.000 +Engine pipeline time (ms): 41,299.133 +Sum check (ms): 41,356.133 + +------------ Overall Time Breakdown ------------ +Input preprocess time (ms): 57.000 +Stage 0 generation time (ms): 9,910.007 +Stage 0 output processor time (ms): 7.000 +Stage 0 -> Stage 1 handoff time (ms): 245.895 +Stage 1 generation time (ms): 30,379.198 +Stage 1 output processor time (ms): 10.000 +Final output time (ms): 747.033 +Component sum (ms): 41,356.133 +E2E - component sum (ms): 0.000 + +------------ Stage 0 Breakdown ------------ +Stage generation time (ms): 9,910.007 +Output processor time (ms): 7.000 +Stage sum check (ms): 9,917.007 + +Stage id: 0 +Stage name: ar +Stage type: llm +Final output type: text +Batch id: 38 +Batch size: 1 + +Input tokens: 4,860 +Output tokens: 67 +Output token throughput (tok/s): 6.761 +Postprocess time (ms): 256.158 + +------------ Stage 0 -> Stage 1 Handoff ------------ +Handoff total time (ms): 245.895 +AR to diffusion time (ms): 53.314 +Other handoff processing time (ms): 192.581 + +------------ Final Output Breakdown ------------ +Final output wrapping time (ms): 160.745 +Final orchestration overhead time (ms): 586.288 +Final output total time (ms): 747.033 +Final output sum check (ms): 747.033 +``` -- **Transfer table**: data transfer and timing for each edge. +These logs include a high-level end-to-end summary, a non-overlapping stage/time breakdown, and per-stage details from `StageRequestStats`. The final output layer includes output wrapping plus the remaining orchestration overhead so the top-level component sum can be checked directly against E2E time. You can use these logs to monitor system health, debug performance, and analyze request-level metrics as described above. ## Metrics Scope: Offline vs Online Inference -For **offline inference** (batch mode), the summary includes both system-level metrics (aggregated across all requests) and per-request metrics. In this case, `e2e_requests` can be greater than 1, reflecting multiple completed requests in a batch. +For **offline inference** (batch mode), the summary includes both system-level metrics (aggregated across all requests) and per-request metrics. In this case, `num_of_requests` can be greater than 1, reflecting multiple completed requests in a batch. -For **online inference** (serving mode), the summary is always per-request. `e2e_requests` is always 1, and only request-level metrics are reported for each completion. +For **online inference** (serving mode), the summary is always per-request. `num_of_requests` is always 1, and only request-level metrics are reported for each completion. -When `e2e_requests` is 1, average fields are omitted from the printed `Overall Summary` because they are identical to the total/request-level values. They remain meaningful for offline batches with multiple completed requests. +When `num_of_requests` is 1, average fields are omitted from the returned overall summary because they are identical to the total/request-level values. They remain meaningful for offline batches with multiple completed requests. --- @@ -109,34 +103,40 @@ When `e2e_requests` is 1, average fields are omitted from the printed `Overall S | Field | Meaning | |---------------------------|----------------------------------------------------------------------------------------------| -| `e2e_requests` | Number of completed requests. | +| `num_of_requests` | Number of completed requests. | | `request_wall_time_ms` | Wall-clock time span from request preparation start to final completion, in ms. | | `input_preprocess_time_ms` | Time spent preparing and submitting requests before the engine pipeline starts. | | `engine_pipeline_time_ms` | Time from engine request submission to final completion. | | `stage_gen_total_time_ms` | Sum of all stage `stage_gen_time_ms` values. | +| `output_processor_total_time_ms` | Sum of time spent in per-stage output processors after raw engine outputs are returned. | | `stage_handoff_total_time_ms` | Sum of inter-stage handoff time, measured after an upstream stage finishes and before the downstream stage is submitted. | | `ar2diffusion_total_time_ms` | Subset of `stage_handoff_total_time_ms` spent converting AR outputs into diffusion inputs. | -| `final_output_overhead_time_ms` | Final engine-side overhead not covered by stage generation or inter-stage handoff. | +| `final_output_total_time_ms` | Measured time spent wrapping final stage outputs into `OmniRequestOutput` objects before yielding them. | +| `breakdown_delta_time_ms` | Difference between E2E wall time and the printed component sum; useful for detecting missing timers or overlap. | | `stage_{i}_to_{j}_handoff_time_ms` | Handoff time for a specific stage edge, e.g. `stage_0_to_1_handoff_time_ms`. | | `stage_{i}_to_{j}_ar2diffusion_time_ms` | AR-to-diffusion conversion time included in that edge's handoff time. | -| `e2e_total_tokens` | Total tokens counted across all completed requests (stage0 input + all stage outputs). | -| `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / e2e_requests`. | +| `total_tokens` | Total tokens counted across all completed requests (stage0 input + all stage outputs). | +| `avg_request_wall_time_ms` | Average wall time per request: `request_wall_time_ms / num_of_requests`. | | `avg_input_preprocess_time_ms` | Average pre-submit request preparation time per completed request. | | `avg_engine_pipeline_time_ms` | Average engine pipeline time per completed request. | | `avg_stage_gen_total_time_ms` | Average summed stage generation time per completed request. | +| `avg_output_processor_time_ms` | Average output processor time per completed request. | | `avg_stage_handoff_total_time_ms` | Average summed inter-stage handoff time per completed request. | | `avg_ar2diffusion_time_ms` | Average AR-to-diffusion conversion time per completed request. | -| `avg_final_output_overhead_time_ms` | Average final engine-side overhead per completed request. | -| `e2e_avg_tokens_per_s` | Average token throughput over wall time: `e2e_total_tokens * 1000 / request_wall_time_ms`. | -| `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. | +| `avg_final_output_time_ms` | Average final output wrapping time per completed request. | +| `avg_breakdown_delta_time_ms` | Average difference between E2E wall time and the printed component sum per completed request. | +| `avg_tokens_per_s` | Average token throughput over wall time: `total_tokens * 1000 / request_wall_time_ms`. | +| `stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `stage_0_wall_time_ms`, `stage_1_wall_time_ms`, etc. | ### Timing Breakdown -The printed summary also includes a `Timing Breakdown` section that shows the containment relationship: +The printed summary includes sum checks that show the containment relationship: `request_wall_time_ms = input_preprocess_time_ms + engine_pipeline_time_ms` -`engine_pipeline_time_ms = stage_gen_total_time_ms + stage_handoff_total_time_ms + final_output_overhead_time_ms` +`request_wall_time_ms ~= input_preprocess_time_ms + sum(stage_gen_time_ms) + sum(output_processor_time_ms) + sum(stage_handoff_time_ms) + final_output_total_time_ms` + +Any remaining difference is reported as `breakdown_delta_time_ms`. `ar2diffusion_total_time_ms` is included in `stage_handoff_total_time_ms`. For a concrete AR-to-diffusion edge, `stage_0_to_1_ar2diffusion_time_ms` is included in `stage_0_to_1_handoff_time_ms`. @@ -149,9 +149,10 @@ The printed summary also includes a `Timing Breakdown` section that shows the co | `request_wall_time_ms` | End-to-end latency in ms, including input preprocessing and engine pipeline time. | | `input_preprocess_time_ms` | Time spent preparing and submitting the request before `engine_pipeline_time_ms` starts. | | `engine_pipeline_time_ms` | Time from engine request submission to final completion. | -| `e2e_total_tokens` | Total tokens for the request (stage0 input + all stage outputs). | +| `total_tokens` | Total tokens for the request (stage0 input + all stage outputs). | | `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request. | | `transfers_total_kbytes` | Sum of transfer kbytes for this request. | +| `final_output_time_ms` | Time spent wrapping the final engine output into an `OmniRequestOutput` before yielding it. | --- @@ -164,7 +165,8 @@ The printed summary also includes a `Timing Breakdown` section that shows the co | `batch_size` | Batch size. | | `num_tokens_in` | Input tokens to the stage. | | `num_tokens_out` | Output tokens from the stage. | -| `stage_gen_time_ms` | Stage compute time in ms, excluding postprocessing time (reported separately as `postprocess_time_ms`). | +| `stage_gen_time_ms` | Stage compute time in ms, excluding output processor time and postprocessing time. | +| `output_processor_time_ms` | Time spent in the stage output processor after raw engine outputs are returned. | | `image_num` | Number of images generated (for diffusion/image stages). | | `resolution` | Image resolution (for diffusion/image stages). | | `postprocess_time_ms` | Diffusion/image: post-processing time in ms. | @@ -185,20 +187,20 @@ The printed summary also includes a `Timing Breakdown` section that shows the co **Formulas:** -- `e2e_total_tokens = Stage0's num_tokens_in + sum(all stages' num_tokens_out)` +- `total_tokens = Stage0's num_tokens_in + sum(all stages' num_tokens_out)` - `transfers_total_time_ms = sum(tx_time_ms + rx_decode_time_ms + in_flight_time_ms)` for every edge **Using the example above:** -**e2e_total_tokens** +**total_tokens** - Stage0's `num_tokens_in`: **4,860** - Stage0's `num_tokens_out`: **67** - Stage1's `num_tokens_out`: **275** - Stage2's `num_tokens_out`: **0** -so `e2e_total_tokens = 4,860 + 67 + 275 + 0 = 5,202`, which matches the table value `e2e_total_tokens`. +so `total_tokens = 4,860 + 67 + 275 + 0 = 5,202`, which matches the table value `total_tokens`. **transfers_total_time_ms** diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py index 1f01d6f40c9..3bb722b732d 100644 --- a/tests/metrics/test_stats.py +++ b/tests/metrics/test_stats.py @@ -15,7 +15,7 @@ def _get_request_entry(table: list[dict], request_id: str) -> dict: raise AssertionError(f"request_id={request_id} not found") -def test_orchestrator_aggregator_builds_summary() -> None: +def test_orchestrator_aggregator_builds_summary(monkeypatch: pytest.MonkeyPatch) -> None: agg = OrchestratorAggregator(num_stages=2, log_stats=True, wall_start_ts=0.0, final_stage_id_for_e2e=1) agg.stage_first_ts[0] = 0.0 agg.stage_last_ts[0] = 0.03 @@ -40,6 +40,7 @@ def test_orchestrator_aggregator_builds_summary() -> None: handoff_to_stage_id=1, stage_handoff_time_ms=20.0, ar2diffusion_time_ms=12.5, + output_processor_time_ms=7.0, ), ) agg.on_stage_metrics( @@ -57,15 +58,19 @@ def test_orchestrator_aggregator_builds_summary() -> None: stage_stats=StageStats(), ), ) + agg.record_final_output_time("r1", 3.0) + monkeypatch.setattr("vllm_omni.metrics.stats.time.time", lambda: 0.07) agg.on_finalize_request(1, "r1", req_start_ts=0.0) summary = agg.build_and_log_summary() overall = summary["overall_summary"] - assert overall["e2e_requests"] == 1 + assert overall["num_of_requests"] == 1 assert overall["request_wall_time_ms"] == 70.0 assert overall["input_preprocess_time_ms"] == 0.0 assert overall["stage_gen_total_time_ms"] == 50.0 + assert overall["output_processor_total_time_ms"] == 7.0 assert overall["stage_handoff_total_time_ms"] == 20.0 + assert overall["final_output_total_time_ms"] == 3.0 assert overall["stage_0_to_1_handoff_time_ms"] == 20.0 assert overall["ar2diffusion_total_time_ms"] == 12.5 assert overall["stage_0_to_1_ar2diffusion_time_ms"] == 12.5 @@ -81,7 +86,8 @@ def test_orchestrator_aggregator_builds_summary() -> None: assert transfer_entry["transfers"][0]["size_kbytes"] == 1.0 e2e_entry = _get_request_entry(summary["e2e_table"], "r1") - assert e2e_entry["e2e_total_tokens"] == 10 + assert e2e_entry["total_tokens"] == 10 + assert e2e_entry["final_output_time_ms"] == 3.0 def test_build_and_log_summary_e2e_only() -> None: @@ -92,7 +98,7 @@ def test_build_and_log_summary_e2e_only() -> None: request_wall_time_ms=10.0, input_preprocess_time_ms=0.0, engine_pipeline_time_ms=10.0, - e2e_total_tokens=5, + total_tokens=5, transfers_total_time_ms=0.0, transfers_total_bytes=0, ) @@ -102,7 +108,7 @@ def test_build_and_log_summary_e2e_only() -> None: e2e_entry = _get_request_entry(summary["e2e_table"], "r") assert e2e_entry["request_wall_time_ms"] == 10.0 assert e2e_entry["engine_pipeline_time_ms"] == 10.0 - assert e2e_entry["e2e_total_tokens"] == 5 + assert e2e_entry["total_tokens"] == 5 stage_entry = _get_request_entry(summary["stage_table"], "r") assert stage_entry["stages"] == [] diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index f61f6346a12..76a3170c214 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -109,6 +109,7 @@ class OrchestratorRequestState: # Metrics: timestamp when request was submitted to each stage stage_submit_ts: dict[int, float] = field(default_factory=dict) + output_processor_time_ms: dict[int, float] = field(default_factory=dict) mm_processor_kwargs: dict | None = None mm_features: list | None = None @@ -597,7 +598,9 @@ def _build_stage_metrics( """ now = _time.time() submit_ts = req_state.stage_submit_ts.get(stage_id, now) - stage_gen_time_ms = (now - submit_ts) * 1000.0 + output_processor_time_ms = float(req_state.output_processor_time_ms.get(stage_id, 0.0)) + stage_wall_time_ms = (now - submit_ts) * 1000.0 + stage_gen_time_ms = max(0.0, stage_wall_time_ms - output_processor_time_ms) num_tokens_out = count_tokens_from_outputs(request_outputs) num_tokens_in = 0 @@ -619,6 +622,7 @@ def _build_stage_metrics( num_tokens_in=num_tokens_in, num_tokens_out=num_tokens_out, stage_gen_time_ms=stage_gen_time_ms, + output_processor_time_ms=output_processor_time_ms, batch_id=batch_id, batch_size=1, rx_decode_time_ms=0.0, @@ -862,11 +866,24 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut """ processor = self.output_processors[stage_id] + output_processor_start = _time.perf_counter() processed = processor.process_outputs( raw_outputs.outputs, raw_outputs.timestamp, None, ) + output_processor_time_ms = (_time.perf_counter() - output_processor_start) * 1000.0 + processed_request_ids = {output.request_id for output in processed.request_outputs} + if processed_request_ids: + # process_outputs is batch-scoped, so attribute the batch cost evenly. + per_request_time_ms = output_processor_time_ms / len(processed_request_ids) + for request_id in processed_request_ids: + req_state = self.request_states.get(request_id) + if req_state is not None: + req_state.output_processor_time_ms[stage_id] = ( + req_state.output_processor_time_ms.get(stage_id, 0.0) + per_request_time_ms + ) + for eco in raw_outputs.outputs: if not hasattr(eco, "request_id"): continue diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 1cf359f7c08..11cba2ee23c 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -396,20 +396,9 @@ def _process_single_result( if not stage_meta["final_output"]: return None - try: - rid_key = str(req_id) - if stage_id == final_stage_id_for_e2e and rid_key not in metrics.e2e_done and finished: - metrics.on_finalize_request( - stage_id, - req_id, - req_start_ts.get(req_id, wall_start_ts), - input_preprocess_time_ms=input_preprocess_time_ms.get(req_id, 0.0), - ) - except Exception: - logger.exception("[%s] Finalize request handling error", self.__class__.__name__) - + final_output_start = time.perf_counter() images = getattr(engine_outputs, "images", []) if stage_meta["final_output_type"] == "image" else [] - return OmniRequestOutput( + output_to_yield = OmniRequestOutput( request_id=req_id or "", stage_id=stage_id, final_output_type=stage_meta["final_output_type"], @@ -423,6 +412,21 @@ def _process_single_result( stage_durations=stage_durations, peak_memory_mb=peak_memory_mb, ) + metrics.record_final_output_time(req_id, (time.perf_counter() - final_output_start) * 1000.0) + + try: + rid_key = str(req_id) + if stage_id == final_stage_id_for_e2e and rid_key not in metrics.e2e_done and finished: + metrics.on_finalize_request( + stage_id, + req_id, + req_start_ts.get(req_id, wall_start_ts), + input_preprocess_time_ms=input_preprocess_time_ms.get(req_id, 0.0), + ) + except Exception: + logger.exception("[%s] Finalize request handling error", self.__class__.__name__) + + return output_to_yield def shutdown(self) -> None: logger.info("[%s] Shutting down", self.__class__.__name__) diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index 3e4f0f6bd7e..cb73949ff3a 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -9,7 +9,7 @@ from vllm.logger import init_logger -from vllm_omni.metrics.utils import _build_field_defs, _build_row, _format_table +from vllm_omni.metrics.utils import _build_field_defs, _build_row logger = init_logger(__name__) @@ -41,6 +41,7 @@ class StageRequestStats: final_output_type: str | None = None request_id: str | None = None postprocess_time_ms: float = 0.0 + output_processor_time_ms: float = 0.0 diffusion_metrics: dict[str, int] = None audio_generated_frames: int = 0 stage_end_ts: float | None = None @@ -83,13 +84,14 @@ class RequestE2EStats: request_wall_time_ms: float input_preprocess_time_ms: float engine_pipeline_time_ms: float - e2e_total_tokens: int + total_tokens: int transfers_total_time_ms: float transfers_total_bytes: int + final_output_time_ms: float = 0.0 @property def e2e_tpt(self) -> float: - return (self.engine_pipeline_time_ms / self.e2e_total_tokens) if self.e2e_total_tokens > 0 else 0.0 + return (self.engine_pipeline_time_ms / self.total_tokens) if self.total_tokens > 0 else 0.0 # === Field Configuration === @@ -119,23 +121,27 @@ def e2e_tpt(self) -> float: # Decide the order of overall summary fields, or None for auto OVERALL_FIELDS: list[str] | None = [ - "e2e_requests", + "num_of_requests", "request_wall_time_ms", "input_preprocess_time_ms", "engine_pipeline_time_ms", "stage_gen_total_time_ms", + "output_processor_total_time_ms", "stage_handoff_total_time_ms", "ar2diffusion_total_time_ms", - "final_output_overhead_time_ms", - "e2e_total_tokens", + "final_output_total_time_ms", + "breakdown_delta_time_ms", + "total_tokens", "avg_request_wall_time_ms", "avg_input_preprocess_time_ms", "avg_engine_pipeline_time_ms", "avg_stage_gen_total_time_ms", + "avg_output_processor_time_ms", "avg_stage_handoff_total_time_ms", "avg_ar2diffusion_time_ms", - "avg_final_output_overhead_time_ms", - "e2e_avg_tokens_per_s", + "avg_final_output_time_ms", + "avg_breakdown_delta_time_ms", + "avg_tokens_per_s", ] STAGE_FIELDS = _build_field_defs(StageRequestStats, STAGE_EXCLUDE, FIELD_TRANSFORMS) TRANSFER_FIELDS = _build_field_defs(TransferEdgeStats, TRANSFER_EXCLUDE, FIELD_TRANSFORMS) @@ -167,7 +173,8 @@ def init_run_state(self, wall_start_ts: float) -> None: self.stage_total_tokens = [0 for _ in range(self.num_stages)] self.engine_pipeline_total_ms = 0.0 self.input_preprocess_total_ms = 0.0 - self.e2e_total_tokens = 0 + self.final_output_total_ms = 0.0 + self.total_tokens = 0 self.e2e_count = 0 self.e2e_done = set() self.wall_start_ts = float(wall_start_ts) @@ -180,6 +187,7 @@ def init_run_state(self, wall_start_ts: float) -> None: self.diffusion_metrics: defaultdict[str, defaultdict[str, float]] = defaultdict( lambda: defaultdict(float) ) # {request_id: {diffusion_metrics_key: accumulated_metrics_data}} + self.final_output_time_ms: defaultdict[str, float] = defaultdict(float) def _get_stage_metadata(self, stage_id: int | None) -> dict[str, Any]: if stage_id is None or stage_id < 0 or stage_id >= len(self.stage_metadata): @@ -253,43 +261,212 @@ def _log_omni_timing(self, evt: RequestE2EStats) -> None: ",".join(transfers), ) - def _build_timing_breakdown_lines(self, summary: dict[str, Any]) -> list[str]: - lines = [ - "[Timing Breakdown]", - ( - "request_wall_time_ms = input_preprocess_time_ms + engine_pipeline_time_ms " - f"= {self._format_ms(summary.get('input_preprocess_time_ms', 0.0))} + " - f"{self._format_ms(summary.get('engine_pipeline_time_ms', 0.0))} " - f"= {self._format_ms(summary.get('request_wall_time_ms', 0.0))}" - ), - ( - "engine_pipeline_time_ms = stage_gen_total_time_ms + " - "stage_handoff_total_time_ms + final_output_overhead_time_ms " - f"= {self._format_ms(summary.get('stage_gen_total_time_ms', 0.0))} + " - f"{self._format_ms(summary.get('stage_handoff_total_time_ms', 0.0))} + " - f"{self._format_ms(summary.get('final_output_overhead_time_ms', 0.0))} " - f"= {self._format_ms(summary.get('engine_pipeline_time_ms', 0.0))}" - ), - ] + @staticmethod + def _summary_line(label: str, value: Any) -> str: + if isinstance(value, bool): + text = str(value).lower() + elif isinstance(value, int): + text = f"{value:d}" + elif isinstance(value, float): + text = f"{value:,.3f}" + else: + text = str(value) + return f"{label:<42}{text:>16}" - ar2diffusion_total_ms = float(summary.get("ar2diffusion_total_time_ms", 0.0) or 0.0) - if ar2diffusion_total_ms > 0.0: - lines.append( - "ar2diffusion_total_time_ms is included in stage_handoff_total_time_ms: " - f"{self._format_ms(ar2diffusion_total_ms)} <= " - f"{self._format_ms(summary.get('stage_handoff_total_time_ms', 0.0))}" + @staticmethod + def _summary_header(title: str, char: str = "=") -> str: + return f"{char * 12} {title} {char * 12}" + + def _append_stage_info_lines(self, lines: list[str], evt: StageRequestStats) -> None: + lines.extend( + [ + self._summary_line("Stage id:", evt.stage_id if evt.stage_id is not None else "unknown"), + self._summary_line("Stage name:", evt.stage_name or ""), + self._summary_line("Stage type:", evt.stage_type or ""), + self._summary_line("Final output type:", evt.final_output_type or ""), + self._summary_line("Batch id:", evt.batch_id), + self._summary_line("Batch size:", evt.batch_size), + ] + ) + if evt.num_tokens_in > 0 or evt.num_tokens_out > 0: + lines.extend( + [ + "", + self._summary_line("Input tokens:", evt.num_tokens_in), + self._summary_line("Output tokens:", evt.num_tokens_out), + self._summary_line("Output token throughput (tok/s):", evt.tokens_per_s), + ] + ) + if evt.postprocess_time_ms > 0.0: + lines.extend( + [ + "", + self._summary_line("Postprocess time (ms):", evt.postprocess_time_ms), + ] ) + if evt.diffusion_metrics: + lines.append("") + for key, value in sorted(evt.diffusion_metrics.items()): + lines.append(self._summary_line(f"{key}:", value)) + + def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> list[str]: + lines = [self._summary_header("Omni Metrics Summary")] + lines.extend( + [ + self._summary_line("Successful requests:", int(overall_summary.get("num_of_requests", 0))), + self._summary_line("Total E2E time (ms):", float(overall_summary.get("request_wall_time_ms", 0.0))), + self._summary_line( + "Input preprocess time (ms):", + float(overall_summary.get("input_preprocess_time_ms", 0.0)), + ), + self._summary_line( + "Engine pipeline time (ms):", + float(overall_summary.get("engine_pipeline_time_ms", 0.0)), + ), + self._summary_line( + "Sum check (ms):", + float(overall_summary.get("input_preprocess_time_ms", 0.0)) + + float(overall_summary.get("engine_pipeline_time_ms", 0.0)), + ), + "", + self._summary_header("Overall Time Breakdown", "-"), + self._summary_line( + "Input preprocess time (ms):", + float(overall_summary.get("input_preprocess_time_ms", 0.0)), + ), + ] + ) - edge_handoff_fields = sorted( - k for k in summary if k.startswith("stage_") and "_to_" in k and k.endswith("_handoff_time_ms") + component_sum = float(overall_summary.get("input_preprocess_time_ms", 0.0)) + stage_ids = sorted( + { + evt.stage_id + for stage_evts in self.stage_events.values() + for evt in stage_evts + if evt.stage_id is not None + } ) - for handoff_field in edge_handoff_fields: - edge_name = handoff_field.removesuffix("_handoff_time_ms") - ar2d_field = f"{edge_name}_ar2diffusion_time_ms" - if ar2d_field in summary and float(summary.get(ar2d_field, 0.0) or 0.0) > 0.0: - lines.append( - f"{ar2d_field} is included in {handoff_field}: " - f"{self._format_ms(summary[ar2d_field])} <= {self._format_ms(summary[handoff_field])}" + for stage_id in stage_ids: + stage_gen_ms = sum( + float(evt.stage_gen_time_ms or 0.0) + for stage_evts in self.stage_events.values() + for evt in stage_evts + if evt.stage_id == stage_id + ) + output_processor_ms = sum( + float(evt.output_processor_time_ms or 0.0) + for stage_evts in self.stage_events.values() + for evt in stage_evts + if evt.stage_id == stage_id + ) + lines.append(self._summary_line(f"Stage {stage_id} generation time (ms):", stage_gen_ms)) + lines.append(self._summary_line(f"Stage {stage_id} output processor time (ms):", output_processor_ms)) + component_sum += stage_gen_ms + output_processor_ms + + handoff_ms = sum( + float(evt.stage_handoff_time_ms or 0.0) + for stage_evts in self.stage_events.values() + for evt in stage_evts + if evt.stage_id == stage_id + ) + if handoff_ms > 0.0: + to_stage_ids = sorted( + { + evt.handoff_to_stage_id + for stage_evts in self.stage_events.values() + for evt in stage_evts + if evt.stage_id == stage_id and evt.handoff_to_stage_id is not None + } + ) + to_stage = to_stage_ids[0] if len(to_stage_ids) == 1 else "next" + lines.append(self._summary_line(f"Stage {stage_id} -> Stage {to_stage} handoff time (ms):", handoff_ms)) + component_sum += handoff_ms + + final_output_ms = float(overall_summary.get("final_output_total_time_ms", 0.0)) + final_orchestration_ms = float(overall_summary.get("breakdown_delta_time_ms", 0.0)) + final_output_bucket_ms = final_output_ms + final_orchestration_ms + lines.extend( + [ + self._summary_line("Final output time (ms):", final_output_bucket_ms), + self._summary_line("Component sum (ms):", component_sum + final_output_bucket_ms), + self._summary_line( + "E2E - component sum (ms):", + float(overall_summary.get("request_wall_time_ms", 0.0)) - component_sum - final_output_bucket_ms, + ), + ] + ) + + for rid in sorted(set(self.stage_events.keys()) | {evt.request_id for evt in self.e2e_events}): + stage_evts = sorted( + self.stage_events.get(rid, []), + key=lambda evt: evt.stage_id if evt.stage_id is not None else -1, + ) + for evt in stage_evts: + stage_id = evt.stage_id if evt.stage_id is not None else "unknown" + lines.extend( + [ + "", + self._summary_header(f"Stage {stage_id} Breakdown", "-"), + self._summary_line("Stage generation time (ms):", float(evt.stage_gen_time_ms or 0.0)), + self._summary_line( + "Output processor time (ms):", + float(evt.output_processor_time_ms or 0.0), + ), + self._summary_line( + "Stage sum check (ms):", + float(evt.stage_gen_time_ms or 0.0) + float(evt.output_processor_time_ms or 0.0), + ), + "", + ] + ) + self._append_stage_info_lines(lines, evt) + + if evt.handoff_to_stage_id is not None and evt.stage_handoff_time_ms > 0.0: + ar2diffusion_ms = float(evt.ar2diffusion_time_ms or 0.0) + lines.extend( + [ + "", + self._summary_header( + f"Stage {stage_id} -> Stage {evt.handoff_to_stage_id} Handoff", + "-", + ), + self._summary_line("Handoff total time (ms):", float(evt.stage_handoff_time_ms)), + self._summary_line("AR to diffusion time (ms):", ar2diffusion_ms), + self._summary_line( + "Other handoff processing time (ms):", + float(evt.stage_handoff_time_ms) - ar2diffusion_ms, + ), + ] + ) + + final_output_ms = float(self.final_output_time_ms.get(rid, 0.0)) + e2e_evt = next((evt for evt in self.e2e_events if evt.request_id == rid), None) + if e2e_evt is None: + final_orchestration_ms = 0.0 + else: + stage_component_ms = sum( + float(evt.stage_gen_time_ms or 0.0) + + float(evt.output_processor_time_ms or 0.0) + + float(evt.stage_handoff_time_ms or 0.0) + for evt in stage_evts + ) + final_orchestration_ms = ( + float(e2e_evt.request_wall_time_ms) + - float(e2e_evt.input_preprocess_time_ms) + - stage_component_ms + - final_output_ms + ) + final_output_bucket_ms = final_output_ms + final_orchestration_ms + if final_output_bucket_ms != 0.0: + lines.extend( + [ + "", + self._summary_header("Final Output Breakdown", "-"), + self._summary_line("Final output wrapping time (ms):", final_output_ms), + self._summary_line("Final orchestration overhead time (ms):", final_orchestration_ms), + self._summary_line("Final output total time (ms):", final_output_bucket_ms), + self._summary_line("Final output sum check (ms):", final_output_bucket_ms), + ] ) return lines @@ -520,6 +697,9 @@ def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_tim def record_ar2diffusion_time(self, stage_id: int, req_id: Any, ar2diffusion_time_ms: float) -> None: self._update_stage_event_field(stage_id, req_id, "ar2diffusion_time_ms", ar2diffusion_time_ms) + def record_final_output_time(self, req_id: Any, final_output_time_ms: float) -> None: + self.final_output_time_ms[str(req_id)] += float(final_output_time_ms) + def record_stage_handoff_time( self, from_stage: int, @@ -604,6 +784,7 @@ def on_finalize_request( engine_pipeline_ms = (_t1 - _t0) * 1000.0 input_preprocess_ms = float(input_preprocess_time_ms) request_wall_ms = engine_pipeline_ms + input_preprocess_ms + final_output_ms = float(self.final_output_time_ms.get(rid_key, 0.0)) # Sum tokens from all stages for this request # Include input tokens from stage 0 + output tokens from all stages @@ -616,7 +797,8 @@ def on_finalize_request( self.engine_pipeline_total_ms += engine_pipeline_ms self.input_preprocess_total_ms += input_preprocess_ms - self.e2e_total_tokens += total_tokens + self.final_output_total_ms += final_output_ms + self.total_tokens += total_tokens self.e2e_count += 1 self.e2e_done.add(rid_key) per_req_record = RequestE2EStats( @@ -624,13 +806,14 @@ def on_finalize_request( request_wall_time_ms=request_wall_ms, input_preprocess_time_ms=input_preprocess_ms, engine_pipeline_time_ms=engine_pipeline_ms, - e2e_total_tokens=total_tokens, + total_tokens=total_tokens, transfers_total_time_ms=float( sum(evt.total_time_ms for evt in self.transfer_events.values() if evt.request_id == rid_key) ), transfers_total_bytes=int( sum(evt.size_bytes for evt in self.transfer_events.values() if evt.request_id == rid_key) ), + final_output_time_ms=final_output_ms, ) self.e2e_events.append(per_req_record) @@ -642,7 +825,7 @@ def build_and_log_summary(self) -> dict[str, Any]: return {} wall_time_ms = max(0.0, (self.last_finish_ts - self.wall_start_ts) * 1000.0) e2e_avg_req = (wall_time_ms / self.e2e_count) if self.e2e_count > 0 else 0.0 - e2e_avg_tok = (self.e2e_total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0 + avg_tok = (self.total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0 if isinstance(self.final_stage_id_for_e2e, int): final_stage_id_map: dict[str, int] = {"*": int(self.final_stage_id_for_e2e)} @@ -656,6 +839,7 @@ def build_and_log_summary(self) -> dict[str, Any]: for i in range(self.num_stages) ] stage_gen_total_ms = 0.0 + output_processor_total_ms = 0.0 stage_handoff_total_ms = 0.0 ar2diffusion_total_ms = 0.0 handoff_edge_totals: defaultdict[str, float] = defaultdict(float) @@ -663,6 +847,7 @@ def build_and_log_summary(self) -> dict[str, Any]: for stage_evts in self.stage_events.values(): for evt in stage_evts: stage_gen_total_ms += float(evt.stage_gen_time_ms or 0.0) + output_processor_total_ms += float(evt.output_processor_time_ms or 0.0) handoff_ms = float(evt.stage_handoff_time_ms or 0.0) ar2d_ms = float(evt.ar2diffusion_time_ms or 0.0) stage_handoff_total_ms += handoff_ms @@ -671,21 +856,27 @@ def build_and_log_summary(self) -> dict[str, Any]: edge = f"stage_{evt.stage_id}_to_{evt.handoff_to_stage_id}" handoff_edge_totals[edge] += handoff_ms handoff_edge_ar2diffusion[edge] += ar2d_ms - final_output_overhead_ms = max( - 0.0, - float(self.engine_pipeline_total_ms) - stage_gen_total_ms - stage_handoff_total_ms, + breakdown_delta_ms = ( + float(wall_time_ms) + - float(self.input_preprocess_total_ms) + - stage_gen_total_ms + - output_processor_total_ms + - stage_handoff_total_ms + - float(self.final_output_total_ms) ) overall_summary = { - "e2e_requests": int(self.e2e_count), + "num_of_requests": int(self.e2e_count), "request_wall_time_ms": float(wall_time_ms), "input_preprocess_time_ms": float(self.input_preprocess_total_ms), "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms), "stage_gen_total_time_ms": float(stage_gen_total_ms), + "output_processor_total_time_ms": float(output_processor_total_ms), "stage_handoff_total_time_ms": float(stage_handoff_total_ms), "ar2diffusion_total_time_ms": float(ar2diffusion_total_ms), - "final_output_overhead_time_ms": float(final_output_overhead_ms), - "e2e_total_tokens": int(self.e2e_total_tokens), + "final_output_total_time_ms": float(self.final_output_total_ms), + "breakdown_delta_time_ms": float(breakdown_delta_ms), + "total_tokens": int(self.total_tokens), "avg_request_wall_time_ms": float(e2e_avg_req), "avg_input_preprocess_time_ms": float( self.input_preprocess_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 @@ -694,14 +885,18 @@ def build_and_log_summary(self) -> dict[str, Any]: self.engine_pipeline_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 ), "avg_stage_gen_total_time_ms": float(stage_gen_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0), + "avg_output_processor_time_ms": float( + output_processor_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 + ), "avg_stage_handoff_total_time_ms": float( stage_handoff_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 ), "avg_ar2diffusion_time_ms": float(ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0), - "avg_final_output_overhead_time_ms": float( - final_output_overhead_ms / self.e2e_count if self.e2e_count > 0 else 0.0 + "avg_final_output_time_ms": float( + self.final_output_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 ), - "e2e_avg_tokens_per_s": float(e2e_avg_tok), + "avg_breakdown_delta_time_ms": float(breakdown_delta_ms / self.e2e_count if self.e2e_count > 0 else 0.0), + "avg_tokens_per_s": float(avg_tok), } for edge, handoff_time in sorted(handoff_edge_totals.items()): overall_summary[f"{edge}_handoff_time_ms"] = float(handoff_time) @@ -710,19 +905,18 @@ def build_and_log_summary(self) -> dict[str, Any]: overall_summary[f"{edge}_ar2diffusion_time_ms"] = float(ar2d_time) # Add stage_wall_time_ms as separate fields for each stage for idx, wall_time in enumerate(stage_wall_time_ms): - overall_summary[f"e2e_stage_{idx}_wall_time_ms"] = wall_time + overall_summary[f"stage_{idx}_wall_time_ms"] = wall_time - # Print overall summary - # filter out all-zero fields for logging - overall_fields = [] avg_fields = { "avg_request_wall_time_ms", "avg_input_preprocess_time_ms", "avg_engine_pipeline_time_ms", "avg_stage_gen_total_time_ms", + "avg_output_processor_time_ms", "avg_stage_handoff_total_time_ms", "avg_ar2diffusion_time_ms", - "avg_final_output_overhead_time_ms", + "avg_final_output_time_ms", + "avg_breakdown_delta_time_ms", } dynamic_overall_fields = [ k @@ -731,25 +925,21 @@ def build_and_log_summary(self) -> dict[str, Any]: and "_to_" in k and (k.endswith("_handoff_time_ms") or k.endswith("_ar2diffusion_time_ms")) ] - stage_wall_fields = [k for k in overall_summary if k.startswith("e2e_stage_") and k.endswith("_wall_time_ms")] + stage_wall_fields = [ + k for k in overall_summary if k.startswith("stage_") and k.endswith("_wall_time_ms") and "_to_" not in k + ] ordered_overall_fields = list(OVERALL_FIELDS or []) - insert_at = ordered_overall_fields.index("e2e_total_tokens") + insert_at = ordered_overall_fields.index("total_tokens") ordered_overall_fields[insert_at:insert_at] = sorted( dynamic_overall_fields, key=lambda name: (name.replace("_ar2diffusion_time_ms", "_zz_ar2diffusion_time_ms")), ) + sorted(stage_wall_fields) - for k in ordered_overall_fields or list(overall_summary.keys()): - if self.e2e_count <= 1 and k in avg_fields: - continue - v = overall_summary.get(k, None) - if v not in (0, 0.0, 0.000, None, ""): - overall_fields.append(k) - if overall_fields: - logger.info( - "\n%s", - _format_table("Overall Summary", overall_summary, overall_fields), - ) - logger.info("\n%s", "\n".join(self._build_timing_breakdown_lines(overall_summary))) + overall_fields = [ + k + for k in ordered_overall_fields or list(overall_summary.keys()) + if not (self.e2e_count <= 1 and k in avg_fields) + and overall_summary.get(k, None) not in (0, 0.0, 0.000, None, "") + ] all_request_ids = sorted(set(self.stage_events.keys()) | {e.request_id for e in self.e2e_events}) @@ -793,35 +983,6 @@ def build_and_log_summary(self) -> dict[str, Any]: result_stage_table.append({"request_id": rid, "stages": stage_rows}) - if stage_rows: - # filter out all-zero fields for logging - all_value_fields = set() - for row in stage_rows: - for k in row.keys(): - if k not in ("stage", "stage_id"): - all_value_fields.add(k) - value_fields_list = [] - for field in sorted(all_value_fields): - all_zero = True - for row in stage_rows: - v = row.get(field, None) - if v not in (0, 0.0, 0.000, None, ""): - all_zero = False - break - if not all_zero: - value_fields_list.append(field) - - if value_fields_list: - logger.info( - "\n%s", - _format_table( - f"StageRequestStats [request_id={rid}]", - stage_rows, - column_key="stage", - value_fields=value_fields_list, - ), - ) - # === Transfer table (columns = edge) === transfer_evts = sorted( [e for e in self.transfer_events.values() if e.request_id == rid], @@ -833,34 +994,8 @@ def build_and_log_summary(self) -> dict[str, Any]: ] result_trans_table.append({"request_id": rid, "transfers": transfer_rows}) - if transfer_rows: - # filter out all-zero fields for logging - all_value_fields = set() - for row in transfer_rows: - for k in row.keys(): - if k != "edge": - all_value_fields.add(k) - value_fields_list = [] - for field in sorted(all_value_fields): - all_zero = True - for row in transfer_rows: - v = row.get(field, None) - if v not in (0, 0.0, 0.000, None, ""): - all_zero = False - break - if not all_zero: - value_fields_list.append(field) - - if value_fields_list: - logger.info( - "\n%s", - _format_table( - f"TransferEdgeStats [request_id={rid}]", - transfer_rows, - column_key="edge", - value_fields=value_fields_list, - ), - ) + if overall_fields: + logger.info("\n%s", "\n".join(self._build_omni_metrics_summary_lines(overall_summary))) return { "final_stage_id": final_stage_id_map, From 9d93f453a754e34e11ff03f26a326c1cde605d9b Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Sat, 25 Apr 2026 10:35:13 +0800 Subject: [PATCH 07/11] Batch offline metrics summaries Signed-off-by: bjf-frz --- tests/entrypoints/test_omni_entrypoints.py | 71 +++++++++ tests/metrics/test_stats.py | 95 +++++++++++ vllm_omni/entrypoints/omni.py | 36 +++-- vllm_omni/entrypoints/omni_base.py | 4 +- vllm_omni/metrics/stats.py | 177 +++++++++++++++------ 5 files changed, 320 insertions(+), 63 deletions(-) diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py index a96b4dd1df6..bac4897da5d 100644 --- a/tests/entrypoints/test_omni_entrypoints.py +++ b/tests/entrypoints/test_omni_entrypoints.py @@ -645,6 +645,77 @@ def test_omni_generate_returns_list_when_not_using_generator(monkeypatch: pytest assert [output.stage_id for output in outputs] == [0, 2, 0, 2] +def test_omni_generate_batches_offline_metrics_summary(monkeypatch: pytest.MonkeyPatch): + class FakeMetrics: + instances: list[FakeMetrics] = [] + + def __init__( + self, + num_stages: int, + log_stats: bool, + wall_start_ts: float, + final_stage_id_for_e2e: dict[str, int], + **_: Any, + ) -> None: + self.num_stages = num_stages + self.log_stats = log_stats + self.wall_start_ts = wall_start_ts + self.final_stage_id_for_e2e = final_stage_id_for_e2e + self.stage_first_ts = [None] * num_stages + self.stage_last_ts = [None] * num_stages + self.e2e_done: set[str] = set() + self.final_output_time_ms: dict[str, float] = {} + self.build_count = 0 + FakeMetrics.instances.append(self) + + def on_stage_metrics(self, *_: Any, **__: Any) -> None: + pass + + def record_final_output_time(self, req_id: str, final_output_time_ms: float) -> None: + self.final_output_time_ms[req_id] = final_output_time_ms + + def on_finalize_request( + self, + stage_id: int, + req_id: str, + req_start_ts: float, + input_preprocess_time_ms: float = 0.0, + ) -> None: + del stage_id, req_start_ts, input_preprocess_time_ms + self.e2e_done.add(req_id) + + def build_and_log_summary(self) -> dict[str, Any]: + self.build_count += 1 + return {"num_requests": len(self.e2e_done)} + + sampling_params = [SamplingParams(max_tokens=8) for _ in range(2)] + engine = FakeAsyncOmniEngine( + stage_metadata=LLM_DIFFUSION_META, + default_sampling_params_list=sampling_params, + on_add_request=_enqueue_omni_llm_diffusion_outputs, + ) + _patch_engine(monkeypatch, engine) + monkeypatch.setattr("vllm_omni.entrypoints.omni.OrchestratorMetrics", FakeMetrics) + + app = Omni("dummy-model", log_stats=True) + try: + outputs = list(app.generate(["p1", "p2"], py_generator=True, use_tqdm=False)) + finally: + if not engine.shutdown_called: + app.shutdown() + + assert len(outputs) == 4 + assert len(FakeMetrics.instances) == 1 + metrics = FakeMetrics.instances[0] + assert metrics.final_stage_id_for_e2e == { + engine.submitted[0]["request_id"]: 1, + engine.submitted[1]["request_id"]: 1, + } + assert metrics.e2e_done == {engine.submitted[0]["request_id"], engine.submitted[1]["request_id"]} + assert metrics.build_count == 1 + assert app.request_states == {} + + def test_omni_generate_diffusion_only_yields_single_image_per_request(monkeypatch: pytest.MonkeyPatch): sampling_params = [SamplingParams(max_tokens=8)] engine = FakeAsyncOmniEngine( diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py index 3bb722b732d..bc998a77e92 100644 --- a/tests/metrics/test_stats.py +++ b/tests/metrics/test_stats.py @@ -178,3 +178,98 @@ def test_build_and_log_summary_multiple_requests() -> None: r2_stage_entry = next(e for e in summary["stage_table"] if e["request_id"] == "r2") assert len(r1_stage_entry["stages"]) == 2 assert len(r2_stage_entry["stages"]) == 1 + + +def test_overall_engine_pipeline_time_uses_batch_wall_time() -> None: + agg = OrchestratorAggregator(num_stages=1, log_stats=True, wall_start_ts=0.0, final_stage_id_for_e2e=0) + agg.stage_first_ts[0] = 1.0 + agg.last_finish_ts = 251.0 + agg.engine_pipeline_total_ms = 1_336_711.0 + agg.e2e_count = 10 + agg.e2e_events = [ + RequestE2EStats( + request_id=f"r{i}", + request_wall_time_ms=250_000.0, + input_preprocess_time_ms=1.0, + engine_pipeline_time_ms=249_999.0, + total_tokens=0, + transfers_total_time_ms=0.0, + transfers_total_bytes=0, + ) + for i in range(10) + ] + + summary = agg.build_and_log_summary() + + assert summary["overall_summary"]["request_wall_time_ms"] == 251_000.0 + assert summary["overall_summary"]["input_preprocess_wall_time_ms"] == 1_000.0 + assert summary["overall_summary"]["engine_pipeline_time_ms"] == 250_000.0 + assert summary["overall_summary"]["avg_engine_pipeline_time_ms"] == 249_999.0 + + +def test_multi_request_summary_includes_averages_and_request_input_preprocess() -> None: + agg = OrchestratorAggregator( + num_stages=1, + log_stats=True, + wall_start_ts=0.0, + final_stage_id_for_e2e={"r1": 0, "r2": 0}, + ) + for req_id in ("r1", "r2"): + agg.on_stage_metrics( + 0, + req_id, + StageRequestStats( + batch_id=1, + batch_size=2, + num_tokens_in=1, + num_tokens_out=2, + stage_gen_time_ms=10.0, + rx_transfer_bytes=0, + rx_decode_time_ms=0.0, + rx_in_flight_time_ms=0.0, + stage_stats=StageStats(), + ), + ) + agg.e2e_events.append( + RequestE2EStats( + request_id=req_id, + request_wall_time_ms=20.0, + input_preprocess_time_ms=2.0, + engine_pipeline_time_ms=18.0, + total_tokens=3, + transfers_total_time_ms=0.0, + transfers_total_bytes=0, + ) + ) + agg.e2e_count = 2 + agg.input_preprocess_total_ms = 4.0 + + lines = agg._build_omni_metrics_summary_lines( + { + "num_of_requests": 2, + "request_wall_time_ms": 20.0, + "input_preprocess_time_ms": 4.0, + "input_preprocess_wall_time_ms": 2.0, + "engine_pipeline_time_ms": 18.0, + "final_output_total_time_ms": 0.0, + "breakdown_delta_time_ms": 12.0, + "avg_request_wall_time_ms": 20.0, + "avg_engine_pipeline_time_ms": 18.0, + "avg_input_preprocess_time_ms": 2.0, + "avg_final_output_time_ms": 0.0, + "avg_breakdown_delta_time_ms": 6.0, + "stage_0_wall_time_ms": 12.0, + } + ) + rendered = "\n".join(lines) + + assert "Sum check (ms):" in rendered + assert "Component total work time (ms):" in rendered + assert "Component sum (ms):" not in rendered + assert "Average E2E time" not in rendered + assert "Average engine pipeline time" not in rendered + assert "Average Time Breakdown" in rendered + assert "Average Stage 0 generation time" in rendered + assert "Request r1 Breakdown" in rendered + assert "Request r2 Breakdown" in rendered + assert "Input preprocess time (ms):" in rendered diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py index fcff91df56d..c5e10044419 100644 --- a/vllm_omni/entrypoints/omni.py +++ b/vllm_omni/entrypoints/omni.py @@ -117,22 +117,23 @@ def _run_generation( input_preprocess_time_ms: dict[str, float] = {} wall_start_ts = time.time() req_final_stage_ids: dict[str, int] = {} + for req_id, prompt in zip(request_ids, request_prompts): + prompt_modalities = prompt.get("modalities", None) if isinstance(prompt, dict) else None + req_final_stage_ids[req_id] = self._compute_final_stage_id(prompt_modalities) + batch_metrics = OrchestratorMetrics( + self.num_stages, + self.log_stats, + wall_start_ts, + req_final_stage_ids, + stage_metadata=getattr(self.engine, "stage_metadata", None), + ) for req_id, prompt in zip(request_ids, request_prompts): request_prep_start_ts = time.time() - prompt_modalities = prompt.get("modalities", None) if isinstance(prompt, dict) else None - final_stage_id = self._compute_final_stage_id(prompt_modalities) - req_final_stage_ids[req_id] = final_stage_id - - metrics = OrchestratorMetrics( - self.num_stages, - self.log_stats, - wall_start_ts, - final_stage_id, - stage_metadata=getattr(self.engine, "stage_metadata", None), - ) + final_stage_id = req_final_stage_ids[req_id] + req_state = ClientRequestState(req_id) - req_state.metrics = metrics + req_state.metrics = batch_metrics self.request_states[req_id] = req_state # PD disaggregation: modify stage-0 (prefill) sampling params per request @@ -150,7 +151,10 @@ def _run_generation( ) submit_ts = time.time() input_preprocess_time_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0 - req_state.metrics.stage_first_ts[0] = submit_ts + if batch_metrics.stage_first_ts[0] is None: + batch_metrics.stage_first_ts[0] = submit_ts + else: + batch_metrics.stage_first_ts[0] = min(batch_metrics.stage_first_ts[0], submit_ts) req_start_ts[req_id] = submit_ts active_reqs = set(request_ids) @@ -190,7 +194,11 @@ def _run_generation( active_reqs.discard(req_id) if pbar is not None: pbar.update(1) - self._log_summary_and_cleanup(req_id) + self.request_states.pop(req_id, None) + + summary = batch_metrics.build_and_log_summary() + if summary: + logger.debug("[Summary] %s", summary) except Exception: if "active_reqs" in locals() and active_reqs: self.abort(list(active_reqs)) diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 11cba2ee23c..d1c5f658749 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -396,6 +396,7 @@ def _process_single_result( if not stage_meta["final_output"]: return None + final_output_start_ts = time.time() final_output_start = time.perf_counter() images = getattr(engine_outputs, "images", []) if stage_meta["final_output_type"] == "image" else [] output_to_yield = OmniRequestOutput( @@ -412,7 +413,8 @@ def _process_single_result( stage_durations=stage_durations, peak_memory_mb=peak_memory_mb, ) - metrics.record_final_output_time(req_id, (time.perf_counter() - final_output_start) * 1000.0) + final_output_time_ms = (time.perf_counter() - final_output_start) * 1000.0 + metrics.record_final_output_time(req_id, final_output_time_ms, final_output_start_ts, time.time()) try: rid_key = str(req_id) diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index cb73949ff3a..df1876e5eb9 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -309,6 +309,25 @@ def _append_stage_info_lines(self, lines: list[str], evt: StageRequestStats) -> for key, value in sorted(evt.diffusion_metrics.items()): lines.append(self._summary_line(f"{key}:", value)) + def _request_stage_component_time_ms(self, request_id: str) -> float: + return sum( + float(evt.stage_gen_time_ms or 0.0) + + float(evt.output_processor_time_ms or 0.0) + + float(evt.stage_handoff_time_ms or 0.0) + for evt in self.stage_events.get(request_id, []) + ) + + def _request_final_orchestration_time_ms(self, evt: RequestE2EStats) -> float: + return ( + float(evt.request_wall_time_ms) + - float(evt.input_preprocess_time_ms) + - self._request_stage_component_time_ms(evt.request_id) + - float(self.final_output_time_ms.get(evt.request_id, 0.0)) + ) + + def _get_overall_summary_time_ms(self, overall_summary: dict[str, Any], key: str) -> float: + return float(overall_summary.get(key, overall_summary.get(key.replace("_wall_", "_"), 0.0))) + def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> list[str]: lines = [self._summary_header("Omni Metrics Summary")] lines.extend( @@ -317,7 +336,7 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> self._summary_line("Total E2E time (ms):", float(overall_summary.get("request_wall_time_ms", 0.0))), self._summary_line( "Input preprocess time (ms):", - float(overall_summary.get("input_preprocess_time_ms", 0.0)), + self._get_overall_summary_time_ms(overall_summary, "input_preprocess_wall_time_ms"), ), self._summary_line( "Engine pipeline time (ms):", @@ -325,19 +344,20 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> ), self._summary_line( "Sum check (ms):", - float(overall_summary.get("input_preprocess_time_ms", 0.0)) + self._get_overall_summary_time_ms(overall_summary, "input_preprocess_wall_time_ms") + float(overall_summary.get("engine_pipeline_time_ms", 0.0)), ), "", self._summary_header("Overall Time Breakdown", "-"), self._summary_line( "Input preprocess time (ms):", - float(overall_summary.get("input_preprocess_time_ms", 0.0)), + self._get_overall_summary_time_ms(overall_summary, "input_preprocess_wall_time_ms"), ), ] ) - component_sum = float(overall_summary.get("input_preprocess_time_ms", 0.0)) + component_sum = self._get_overall_summary_time_ms(overall_summary, "input_preprocess_wall_time_ms") + stage_average_lines: list[str] = [] stage_ids = sorted( { evt.stage_id @@ -353,15 +373,18 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> for evt in stage_evts if evt.stage_id == stage_id ) - output_processor_ms = sum( + output_processor_total_ms = sum( float(evt.output_processor_time_ms or 0.0) for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id ) - lines.append(self._summary_line(f"Stage {stage_id} generation time (ms):", stage_gen_ms)) - lines.append(self._summary_line(f"Stage {stage_id} output processor time (ms):", output_processor_ms)) - component_sum += stage_gen_ms + output_processor_ms + stage_event_count = sum( + 1 for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id + ) + lines.append(self._summary_line(f"Stage {stage_id} total generation time (ms):", stage_gen_ms)) + lines.append(self._summary_line(f"Stage {stage_id} output processor time (ms):", output_processor_total_ms)) + component_sum += stage_gen_ms + output_processor_total_ms handoff_ms = sum( float(evt.stage_handoff_time_ms or 0.0) @@ -381,26 +404,83 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> to_stage = to_stage_ids[0] if len(to_stage_ids) == 1 else "next" lines.append(self._summary_line(f"Stage {stage_id} -> Stage {to_stage} handoff time (ms):", handoff_ms)) component_sum += handoff_ms + if int(overall_summary.get("num_of_requests", 0)) > 1 and stage_event_count > 0: + stage_average_lines.extend( + [ + self._summary_line( + f"Average Stage {stage_id} generation time (ms):", + stage_gen_ms / stage_event_count, + ), + self._summary_line( + f"Average Stage {stage_id} output processor time (ms):", + output_processor_total_ms / stage_event_count, + ), + ] + ) + if handoff_ms > 0.0: + stage_average_lines.append( + self._summary_line( + f"Average Stage {stage_id} handoff time (ms):", + handoff_ms / stage_event_count, + ) + ) - final_output_ms = float(overall_summary.get("final_output_total_time_ms", 0.0)) - final_orchestration_ms = float(overall_summary.get("breakdown_delta_time_ms", 0.0)) - final_output_bucket_ms = final_output_ms + final_orchestration_ms + final_output_bucket_ms = float(overall_summary.get("final_output_total_time_ms", 0.0)) lines.extend( [ self._summary_line("Final output time (ms):", final_output_bucket_ms), - self._summary_line("Component sum (ms):", component_sum + final_output_bucket_ms), - self._summary_line( - "E2E - component sum (ms):", - float(overall_summary.get("request_wall_time_ms", 0.0)) - component_sum - final_output_bucket_ms, - ), ] ) + if int(overall_summary.get("num_of_requests", 0)) == 1: + lines.extend( + [ + self._summary_line("Component sum (ms):", component_sum + final_output_bucket_ms), + self._summary_line( + "E2E - component sum (ms):", + float(overall_summary.get("request_wall_time_ms", 0.0)) + - component_sum + - final_output_bucket_ms, + ), + ] + ) + else: + lines.append( + self._summary_line( + "Component total work time (ms):", + component_sum + final_output_bucket_ms, + ) + ) + lines.extend( + [ + "", + self._summary_header("Average Time Breakdown", "-"), + self._summary_line( + "Average input preprocess time (ms):", + float(overall_summary.get("avg_input_preprocess_time_ms", 0.0)), + ), + *stage_average_lines, + self._summary_line( + "Average final output time (ms):", + float(overall_summary.get("avg_final_output_time_ms", 0.0)), + ), + ] + ) for rid in sorted(set(self.stage_events.keys()) | {evt.request_id for evt in self.e2e_events}): stage_evts = sorted( self.stage_events.get(rid, []), key=lambda evt: evt.stage_id if evt.stage_id is not None else -1, ) + e2e_evt = next((evt for evt in self.e2e_events if evt.request_id == rid), None) + if e2e_evt is not None: + lines.extend( + [ + "", + self._summary_header(f"Request {rid} Breakdown", "-"), + self._summary_line("Input preprocess time (ms):", e2e_evt.input_preprocess_time_ms), + self._summary_line("Input preprocess sum check (ms):", e2e_evt.input_preprocess_time_ms), + ] + ) for evt in stage_evts: stage_id = evt.stage_id if evt.stage_id is not None else "unknown" lines.extend( @@ -440,34 +520,19 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> ) final_output_ms = float(self.final_output_time_ms.get(rid, 0.0)) - e2e_evt = next((evt for evt in self.e2e_events if evt.request_id == rid), None) - if e2e_evt is None: - final_orchestration_ms = 0.0 - else: - stage_component_ms = sum( - float(evt.stage_gen_time_ms or 0.0) - + float(evt.output_processor_time_ms or 0.0) - + float(evt.stage_handoff_time_ms or 0.0) - for evt in stage_evts - ) - final_orchestration_ms = ( - float(e2e_evt.request_wall_time_ms) - - float(e2e_evt.input_preprocess_time_ms) - - stage_component_ms - - final_output_ms - ) - final_output_bucket_ms = final_output_ms + final_orchestration_ms - if final_output_bucket_ms != 0.0: + final_orchestration_ms = self._request_final_orchestration_time_ms(e2e_evt) if e2e_evt is not None else 0.0 + if final_output_ms != 0.0: lines.extend( [ "", self._summary_header("Final Output Breakdown", "-"), self._summary_line("Final output wrapping time (ms):", final_output_ms), - self._summary_line("Final orchestration overhead time (ms):", final_orchestration_ms), - self._summary_line("Final output total time (ms):", final_output_bucket_ms), - self._summary_line("Final output sum check (ms):", final_output_bucket_ms), + self._summary_line("Final output total time (ms):", final_output_ms), + self._summary_line("Final output sum check (ms):", final_output_ms), ] ) + if final_orchestration_ms != 0.0: + lines.append(self._summary_line("Unattributed request time (ms):", final_orchestration_ms)) return lines @@ -697,7 +762,14 @@ def record_stage_postprocess_time(self, stage_id: int, req_id: Any, postproc_tim def record_ar2diffusion_time(self, stage_id: int, req_id: Any, ar2diffusion_time_ms: float) -> None: self._update_stage_event_field(stage_id, req_id, "ar2diffusion_time_ms", ar2diffusion_time_ms) - def record_final_output_time(self, req_id: Any, final_output_time_ms: float) -> None: + def record_final_output_time( + self, + req_id: Any, + final_output_time_ms: float, + start_ts: float | None = None, + end_ts: float | None = None, + ) -> None: + del start_ts, end_ts self.final_output_time_ms[str(req_id)] += float(final_output_time_ms) def record_stage_handoff_time( @@ -824,8 +896,23 @@ def build_and_log_summary(self) -> dict[str, Any]: if not self.log_stats: return {} wall_time_ms = max(0.0, (self.last_finish_ts - self.wall_start_ts) * 1000.0) - e2e_avg_req = (wall_time_ms / self.e2e_count) if self.e2e_count > 0 else 0.0 + e2e_avg_req = ( + sum(float(evt.request_wall_time_ms) for evt in self.e2e_events) / self.e2e_count + if self.e2e_count > 0 + else 0.0 + ) avg_tok = (self.total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0 + first_engine_ts = min((ts for ts in self.stage_first_ts if ts is not None), default=None) + engine_pipeline_wall_ms = ( + max(0.0, (self.last_finish_ts - first_engine_ts) * 1000.0) + if first_engine_ts is not None + else float(self.engine_pipeline_total_ms) + ) + input_preprocess_wall_ms = ( + max(0.0, (first_engine_ts - self.wall_start_ts) * 1000.0) + if first_engine_ts is not None + else float(self.input_preprocess_total_ms) + ) if isinstance(self.final_stage_id_for_e2e, int): final_stage_id_map: dict[str, int] = {"*": int(self.final_stage_id_for_e2e)} @@ -856,20 +943,14 @@ def build_and_log_summary(self) -> dict[str, Any]: edge = f"stage_{evt.stage_id}_to_{evt.handoff_to_stage_id}" handoff_edge_totals[edge] += handoff_ms handoff_edge_ar2diffusion[edge] += ar2d_ms - breakdown_delta_ms = ( - float(wall_time_ms) - - float(self.input_preprocess_total_ms) - - stage_gen_total_ms - - output_processor_total_ms - - stage_handoff_total_ms - - float(self.final_output_total_ms) - ) + breakdown_delta_ms = sum(self._request_final_orchestration_time_ms(evt) for evt in self.e2e_events) overall_summary = { "num_of_requests": int(self.e2e_count), "request_wall_time_ms": float(wall_time_ms), "input_preprocess_time_ms": float(self.input_preprocess_total_ms), - "engine_pipeline_time_ms": float(self.engine_pipeline_total_ms), + "input_preprocess_wall_time_ms": float(input_preprocess_wall_ms), + "engine_pipeline_time_ms": float(engine_pipeline_wall_ms), "stage_gen_total_time_ms": float(stage_gen_total_ms), "output_processor_total_time_ms": float(output_processor_total_ms), "stage_handoff_total_time_ms": float(stage_handoff_total_ms), From ab42f7151141cd8b079270d06330be809dc1fb67 Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Sat, 25 Apr 2026 14:54:53 +0800 Subject: [PATCH 08/11] Split stage latency and execution metrics Signed-off-by: bjf-frz --- tests/metrics/test_stats.py | 7 +- vllm_omni/engine/async_omni_engine.py | 1 + vllm_omni/engine/orchestrator.py | 9 ++ vllm_omni/entrypoints/omni.py | 1 + vllm_omni/metrics/stats.py | 141 ++++++++++++++++++++++---- 5 files changed, 135 insertions(+), 24 deletions(-) diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py index bc998a77e92..357b6319319 100644 --- a/tests/metrics/test_stats.py +++ b/tests/metrics/test_stats.py @@ -264,12 +264,15 @@ def test_multi_request_summary_includes_averages_and_request_input_preprocess() rendered = "\n".join(lines) assert "Sum check (ms):" in rendered - assert "Component total work time (ms):" in rendered + assert "Component total work time (ms):" not in rendered assert "Component sum (ms):" not in rendered + assert "Stage sum check (ms):" not in rendered assert "Average E2E time" not in rendered assert "Average engine pipeline time" not in rendered assert "Average Time Breakdown" in rendered - assert "Average Stage 0 generation time" in rendered + assert "Average Stage 0 latency time" in rendered + assert "Average Stage 0 queue wait time" in rendered + assert "Average Stage 0 execution time" in rendered assert "Request r1 Breakdown" in rendered assert "Request r2 Breakdown" in rendered assert "Input preprocess time (ms):" in rendered diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 8299a577a41..e8fbfc6ee4e 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1510,6 +1510,7 @@ def add_request( ) if self.request_queue is None: raise RuntimeError("request_queue is not initialized") + msg["request_dispatch_start_ts"] = time.time() self.request_queue.sync_q.put_nowait(msg) # CFG companion expansion: create and enqueue companion requests diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 76a3170c214..5b8260f20ab 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -632,7 +632,10 @@ def _build_stage_metrics( total_token=self._agg_total_tokens[stage_id], total_gen_time_ms=self._agg_total_gen_time_ms[stage_id], ), + stage_submit_ts=submit_ts, stage_end_ts=now, + request_dispatch_wait_time_ms=float(getattr(req_state, "request_dispatch_wait_time_ms", 0.0)), + stage_latency_time_ms=stage_wall_time_ms, ) def _build_kv_sender_info(self, sender_stage_ids: list[int]) -> dict[int, dict[str, Any]] | None: @@ -937,6 +940,12 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: ) req_state.streaming.enabled = is_streaming req_state.stage_submit_ts[stage_id] = _time.time() + request_dispatch_start_ts = msg.get("request_dispatch_start_ts") + if request_dispatch_start_ts is not None: + req_state.request_dispatch_wait_time_ms = max( + 0.0, + (req_state.stage_submit_ts[stage_id] - request_dispatch_start_ts) * 1000.0, + ) self.request_states[request_id] = req_state # Stage-0 prompt is already a fully-formed OmniEngineCoreRequest diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py index c5e10044419..ebf1301c0a0 100644 --- a/vllm_omni/entrypoints/omni.py +++ b/vllm_omni/entrypoints/omni.py @@ -148,6 +148,7 @@ def _run_generation( prompt=prompt, sampling_params_list=req_sp_list, final_stage_id=final_stage_id, + arrival_time=request_prep_start_ts, ) submit_ts = time.time() input_preprocess_time_ms[req_id] = (submit_ts - request_prep_start_ts) * 1000.0 diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index df1876e5eb9..ee204dcb28a 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -44,7 +44,12 @@ class StageRequestStats: output_processor_time_ms: float = 0.0 diffusion_metrics: dict[str, int] = None audio_generated_frames: int = 0 + stage_submit_ts: float | None = None stage_end_ts: float | None = None + request_dispatch_wait_time_ms: float = 0.0 + stage_latency_time_ms: float = 0.0 + stage_queue_wait_time_ms: float = 0.0 + stage_execution_time_ms: float = 0.0 handoff_to_stage_id: int | None = None stage_handoff_time_ms: float = 0.0 ar2diffusion_time_ms: float = 0.0 @@ -111,7 +116,12 @@ def e2e_tpt(self) -> float: "rx_decode_time_ms", "rx_in_flight_time_ms", "final_output_type", + "stage_submit_ts", "stage_end_ts", + "request_dispatch_wait_time_ms", + "stage_latency_time_ms", + "stage_queue_wait_time_ms", + "stage_execution_time_ms", "handoff_to_stage_id", "stage_handoff_time_ms", "ar2diffusion_time_ms", @@ -126,6 +136,9 @@ def e2e_tpt(self) -> float: "input_preprocess_time_ms", "engine_pipeline_time_ms", "stage_gen_total_time_ms", + "stage_latency_total_time_ms", + "stage_queue_wait_total_time_ms", + "stage_execution_total_time_ms", "output_processor_total_time_ms", "stage_handoff_total_time_ms", "ar2diffusion_total_time_ms", @@ -271,7 +284,8 @@ def _summary_line(label: str, value: Any) -> str: text = f"{value:,.3f}" else: text = str(value) - return f"{label:<42}{text:>16}" + label_width = max(42, len(label) + 1) + return f"{label:<{label_width}}{text:>16}" @staticmethod def _summary_header(title: str, char: str = "=") -> str: @@ -311,7 +325,8 @@ def _append_stage_info_lines(self, lines: list[str], evt: StageRequestStats) -> def _request_stage_component_time_ms(self, request_id: str) -> float: return sum( - float(evt.stage_gen_time_ms or 0.0) + float(evt.stage_queue_wait_time_ms or 0.0) + + float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0) + float(evt.output_processor_time_ms or 0.0) + float(evt.stage_handoff_time_ms or 0.0) for evt in self.stage_events.get(request_id, []) @@ -325,9 +340,46 @@ def _request_final_orchestration_time_ms(self, evt: RequestE2EStats) -> float: - float(self.final_output_time_ms.get(evt.request_id, 0.0)) ) + def _request_dispatch_wait_time_ms(self, request_id: str) -> float: + return sum( + float(evt.request_dispatch_wait_time_ms or 0.0) + for evt in self.stage_events.get(request_id, []) + if evt.stage_id == 0 + ) + + def _request_remaining_orchestration_time_ms(self, evt: RequestE2EStats) -> float: + return self._request_final_orchestration_time_ms(evt) - self._request_dispatch_wait_time_ms(evt.request_id) + def _get_overall_summary_time_ms(self, overall_summary: dict[str, Any], key: str) -> float: return float(overall_summary.get(key, overall_summary.get(key.replace("_wall_", "_"), 0.0))) + def _estimate_stage_wait_and_execution_times(self) -> None: + for stage_id in range(self.num_stages): + stage_evts = sorted( + [ + evt + for evts in self.stage_events.values() + for evt in evts + if evt.stage_id == stage_id and evt.stage_submit_ts is not None and evt.stage_end_ts is not None + ], + key=lambda evt: float(evt.stage_end_ts or 0.0), + ) + previous_finish_ts: float | None = None + for evt in stage_evts: + submit_ts = float(evt.stage_submit_ts or 0.0) + end_ts = float(evt.stage_end_ts or submit_ts) + latency_ms = max(0.0, (end_ts - submit_ts) * 1000.0) + service_start_ts = submit_ts + if previous_finish_ts is not None: + service_start_ts = max(submit_ts, previous_finish_ts) + queue_wait_ms = max(0.0, (service_start_ts - submit_ts) * 1000.0) + service_time_ms = max(0.0, (end_ts - service_start_ts) * 1000.0) + execution_ms = max(0.0, service_time_ms - float(evt.output_processor_time_ms or 0.0)) + evt.stage_latency_time_ms = latency_ms + evt.stage_queue_wait_time_ms = queue_wait_ms + evt.stage_execution_time_ms = execution_ms + previous_finish_ts = end_ts if previous_finish_ts is None else max(previous_finish_ts, end_ts) + def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> list[str]: lines = [self._summary_header("Omni Metrics Summary")] lines.extend( @@ -367,8 +419,20 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> } ) for stage_id in stage_ids: - stage_gen_ms = sum( - float(evt.stage_gen_time_ms or 0.0) + stage_latency_ms = sum( + float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0) + for stage_evts in self.stage_events.values() + for evt in stage_evts + if evt.stage_id == stage_id + ) + stage_queue_wait_ms = sum( + float(evt.stage_queue_wait_time_ms or 0.0) + for stage_evts in self.stage_events.values() + for evt in stage_evts + if evt.stage_id == stage_id + ) + stage_execution_ms = sum( + float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0) for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id @@ -382,9 +446,11 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> stage_event_count = sum( 1 for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id ) - lines.append(self._summary_line(f"Stage {stage_id} total generation time (ms):", stage_gen_ms)) + lines.append(self._summary_line(f"Stage {stage_id} total latency time (ms):", stage_latency_ms)) + lines.append(self._summary_line(f"Stage {stage_id} queue wait time (ms):", stage_queue_wait_ms)) + lines.append(self._summary_line(f"Stage {stage_id} execution time (ms):", stage_execution_ms)) lines.append(self._summary_line(f"Stage {stage_id} output processor time (ms):", output_processor_total_ms)) - component_sum += stage_gen_ms + output_processor_total_ms + component_sum += stage_execution_ms + output_processor_total_ms handoff_ms = sum( float(evt.stage_handoff_time_ms or 0.0) @@ -408,8 +474,16 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> stage_average_lines.extend( [ self._summary_line( - f"Average Stage {stage_id} generation time (ms):", - stage_gen_ms / stage_event_count, + f"Average Stage {stage_id} latency time (ms):", + stage_latency_ms / stage_event_count, + ), + self._summary_line( + f"Average Stage {stage_id} queue wait time (ms):", + stage_queue_wait_ms / stage_event_count, + ), + self._summary_line( + f"Average Stage {stage_id} execution time (ms):", + stage_execution_ms / stage_event_count, ), self._summary_line( f"Average Stage {stage_id} output processor time (ms):", @@ -444,12 +518,6 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> ] ) else: - lines.append( - self._summary_line( - "Component total work time (ms):", - component_sum + final_output_bucket_ms, - ) - ) lines.extend( [ "", @@ -473,12 +541,14 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> ) e2e_evt = next((evt for evt in self.e2e_events if evt.request_id == rid), None) if e2e_evt is not None: + request_dispatch_wait_ms = self._request_dispatch_wait_time_ms(rid) lines.extend( [ "", self._summary_header(f"Request {rid} Breakdown", "-"), self._summary_line("Input preprocess time (ms):", e2e_evt.input_preprocess_time_ms), self._summary_line("Input preprocess sum check (ms):", e2e_evt.input_preprocess_time_ms), + self._summary_line("Request dispatch wait time (ms):", request_dispatch_wait_ms), ] ) for evt in stage_evts: @@ -487,14 +557,18 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> [ "", self._summary_header(f"Stage {stage_id} Breakdown", "-"), - self._summary_line("Stage generation time (ms):", float(evt.stage_gen_time_ms or 0.0)), self._summary_line( - "Output processor time (ms):", - float(evt.output_processor_time_ms or 0.0), + "Stage latency time (ms):", + float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0), ), + self._summary_line("Queue wait time (ms):", float(evt.stage_queue_wait_time_ms or 0.0)), self._summary_line( - "Stage sum check (ms):", - float(evt.stage_gen_time_ms or 0.0) + float(evt.output_processor_time_ms or 0.0), + "Execution time (ms):", + float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0), + ), + self._summary_line( + "Output processor time (ms):", + float(evt.output_processor_time_ms or 0.0), ), "", ] @@ -520,7 +594,9 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> ) final_output_ms = float(self.final_output_time_ms.get(rid, 0.0)) - final_orchestration_ms = self._request_final_orchestration_time_ms(e2e_evt) if e2e_evt is not None else 0.0 + remaining_orchestration_ms = ( + self._request_remaining_orchestration_time_ms(e2e_evt) if e2e_evt is not None else 0.0 + ) if final_output_ms != 0.0: lines.extend( [ @@ -531,8 +607,13 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> self._summary_line("Final output sum check (ms):", final_output_ms), ] ) - if final_orchestration_ms != 0.0: - lines.append(self._summary_line("Unattributed request time (ms):", final_orchestration_ms)) + if remaining_orchestration_ms != 0.0: + lines.append( + self._summary_line( + "Remaining orchestration overhead time (ms):", + remaining_orchestration_ms, + ) + ) return lines @@ -719,6 +800,12 @@ def _as_stage_request_stats( stats.stage_name = self._get_stage_name(stage_id, stage_meta) stats.request_id = req_id stats.final_output_type = final_output_type + if stats.stage_latency_time_ms == 0.0: + stats.stage_latency_time_ms = ( + float(stats.stage_end_ts - stats.stage_submit_ts) * 1000.0 + if stats.stage_submit_ts is not None and stats.stage_end_ts is not None + else float(stats.stage_gen_time_ms or 0.0) + float(stats.output_processor_time_ms or 0.0) + ) stats.diffusion_metrics = ( {k: int(v) for k, v in self.diffusion_metrics.pop(req_id, {}).items()} if req_id in self.diffusion_metrics @@ -895,6 +982,7 @@ def on_finalize_request( def build_and_log_summary(self) -> dict[str, Any]: if not self.log_stats: return {} + self._estimate_stage_wait_and_execution_times() wall_time_ms = max(0.0, (self.last_finish_ts - self.wall_start_ts) * 1000.0) e2e_avg_req = ( sum(float(evt.request_wall_time_ms) for evt in self.e2e_events) / self.e2e_count @@ -926,6 +1014,9 @@ def build_and_log_summary(self) -> dict[str, Any]: for i in range(self.num_stages) ] stage_gen_total_ms = 0.0 + stage_latency_total_ms = 0.0 + stage_queue_wait_total_ms = 0.0 + stage_execution_total_ms = 0.0 output_processor_total_ms = 0.0 stage_handoff_total_ms = 0.0 ar2diffusion_total_ms = 0.0 @@ -934,6 +1025,9 @@ def build_and_log_summary(self) -> dict[str, Any]: for stage_evts in self.stage_events.values(): for evt in stage_evts: stage_gen_total_ms += float(evt.stage_gen_time_ms or 0.0) + stage_latency_total_ms += float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0) + stage_queue_wait_total_ms += float(evt.stage_queue_wait_time_ms or 0.0) + stage_execution_total_ms += float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0) output_processor_total_ms += float(evt.output_processor_time_ms or 0.0) handoff_ms = float(evt.stage_handoff_time_ms or 0.0) ar2d_ms = float(evt.ar2diffusion_time_ms or 0.0) @@ -952,6 +1046,9 @@ def build_and_log_summary(self) -> dict[str, Any]: "input_preprocess_wall_time_ms": float(input_preprocess_wall_ms), "engine_pipeline_time_ms": float(engine_pipeline_wall_ms), "stage_gen_total_time_ms": float(stage_gen_total_ms), + "stage_latency_total_time_ms": float(stage_latency_total_ms), + "stage_queue_wait_total_time_ms": float(stage_queue_wait_total_ms), + "stage_execution_total_time_ms": float(stage_execution_total_ms), "output_processor_total_time_ms": float(output_processor_total_ms), "stage_handoff_total_time_ms": float(stage_handoff_total_ms), "ar2diffusion_total_time_ms": float(ar2diffusion_total_ms), From 44c261fa9fd85635a2d52f348e96c0093442cac2 Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Mon, 27 Apr 2026 15:10:55 +0800 Subject: [PATCH 09/11] Fix offline metrics UT regressions Signed-off-by: bjf-frz --- vllm_omni/entrypoints/omni_base.py | 5 +++-- vllm_omni/metrics/stats.py | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 687d3951c65..dc1e365f85a 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -388,8 +388,9 @@ def _process_single_result( peak_memory_mb = getattr(result["engine_outputs"], "peak_memory_mb", 0.0) # Merge AR stage timing from OrchestratorAggregator.stage_events + stage_events = getattr(metrics, "stage_events", {}) if self._enable_ar_profiler: - ar_events = metrics.stage_events.get(str(req_id), []) + ar_events = stage_events.get(str(req_id), []) for evt in ar_events: if evt.stage_id != stage_id: stage_durations[f"ar_stage_{evt.stage_id}"] = evt.stage_gen_time_ms / 1000.0 @@ -402,7 +403,7 @@ def _process_single_result( stage_durations[key] = value # Merge per-stage gen times into stage_durations - for evt in metrics.stage_events.get(str(req_id), []): + for evt in stage_events.get(str(req_id), []): key = f"stage_{evt.stage_id}_gen_ms" if key not in stage_durations: stage_durations[key] = evt.stage_gen_time_ms diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index f7347f6e0e9..4c26649a96d 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -991,6 +991,11 @@ def build_and_log_summary(self) -> dict[str, Any]: if self.e2e_count > 0 else 0.0 ) + avg_engine_pipeline_ms = ( + sum(float(evt.engine_pipeline_time_ms) for evt in self.e2e_events) / self.e2e_count + if self.e2e_count > 0 + else 0.0 + ) avg_tok = (self.total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0 first_engine_ts = min((ts for ts in self.stage_first_ts if ts is not None), default=None) engine_pipeline_wall_ms = ( @@ -1061,9 +1066,7 @@ def build_and_log_summary(self) -> dict[str, Any]: "avg_input_preprocess_time_ms": float( self.input_preprocess_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 ), - "avg_engine_pipeline_time_ms": float( - self.engine_pipeline_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 - ), + "avg_engine_pipeline_time_ms": float(avg_engine_pipeline_ms), "avg_stage_gen_total_time_ms": float(stage_gen_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0), "avg_output_processor_time_ms": float( output_processor_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 From d9395784e2e947908ded815b2f81360aedfad092 Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Mon, 27 Apr 2026 15:46:10 +0800 Subject: [PATCH 10/11] Address metrics review feedback Signed-off-by: bjf-frz --- docs/contributing/metrics.md | 6 ++ tests/entrypoints/test_omni_entrypoints.py | 9 ++- vllm_omni/metrics/stats.py | 86 ++++++++++++---------- 3 files changed, 60 insertions(+), 41 deletions(-) diff --git a/docs/contributing/metrics.md b/docs/contributing/metrics.md index ca7f596dda2..c854a088603 100644 --- a/docs/contributing/metrics.md +++ b/docs/contributing/metrics.md @@ -140,6 +140,12 @@ Any remaining difference is reported as `breakdown_delta_time_ms`. `ar2diffusion_total_time_ms` is included in `stage_handoff_total_time_ms`. For a concrete AR-to-diffusion edge, `stage_0_to_1_ar2diffusion_time_ms` is included in `stage_0_to_1_handoff_time_ms`. +For offline batches, average component fields such as `avg_stage_gen_total_time_ms`, +`avg_output_processor_time_ms`, `avg_stage_handoff_total_time_ms`, and +`avg_ar2diffusion_time_ms` are computed by dividing the batch aggregate by +`num_of_requests`. They are not individually timed per request and then averaged. +Per-request timers remain available in the E2E table where they are measured directly. + --- ### E2E Table (per request) diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py index bac4897da5d..052cb4fe07a 100644 --- a/tests/entrypoints/test_omni_entrypoints.py +++ b/tests/entrypoints/test_omni_entrypoints.py @@ -671,7 +671,14 @@ def __init__( def on_stage_metrics(self, *_: Any, **__: Any) -> None: pass - def record_final_output_time(self, req_id: str, final_output_time_ms: float) -> None: + def record_final_output_time( + self, + req_id: str, + final_output_time_ms: float, + start_ts: float | None = None, + end_ts: float | None = None, + ) -> None: + del start_ts, end_ts self.final_output_time_ms[req_id] = final_output_time_ms def on_finalize_request( diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index 4c26649a96d..90f575b54be 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -47,9 +47,9 @@ class StageRequestStats: stage_submit_ts: float | None = None stage_end_ts: float | None = None request_dispatch_wait_time_ms: float = 0.0 - stage_latency_time_ms: float = 0.0 - stage_queue_wait_time_ms: float = 0.0 - stage_execution_time_ms: float = 0.0 + stage_latency_time_ms: float | None = None + stage_queue_wait_time_ms: float | None = None + stage_execution_time_ms: float | None = None handoff_to_stage_id: int | None = None stage_handoff_time_ms: float = 0.0 ar2diffusion_time_ms: float = 0.0 @@ -327,8 +327,8 @@ def _append_stage_info_lines(self, lines: list[str], evt: StageRequestStats) -> def _request_stage_component_time_ms(self, request_id: str) -> float: return sum( - float(evt.stage_queue_wait_time_ms or 0.0) - + float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0) + self._optional_ms(evt.stage_queue_wait_time_ms) + + self._stage_execution_ms(evt) + float(evt.output_processor_time_ms or 0.0) + float(evt.stage_handoff_time_ms or 0.0) for evt in self.stage_events.get(request_id, []) @@ -355,7 +355,29 @@ def _request_remaining_orchestration_time_ms(self, evt: RequestE2EStats) -> floa def _get_overall_summary_time_ms(self, overall_summary: dict[str, Any], key: str) -> float: return float(overall_summary.get(key, overall_summary.get(key.replace("_wall_", "_"), 0.0))) + @staticmethod + def _optional_ms(value: float | None, default: float = 0.0) -> float: + return float(default if value is None else value) + + @classmethod + def _stage_latency_ms(cls, evt: StageRequestStats) -> float: + return cls._optional_ms(evt.stage_latency_time_ms, float(evt.stage_gen_time_ms or 0.0)) + + @classmethod + def _stage_execution_ms(cls, evt: StageRequestStats) -> float: + return cls._optional_ms(evt.stage_execution_time_ms, float(evt.stage_gen_time_ms or 0.0)) + + @staticmethod + def _average(total: float, count: int) -> float: + return float(total / count) if count > 0 else 0.0 + + def _average_e2e_field(self, field_name: str) -> float: + if self.e2e_count <= 0: + return 0.0 + return float(sum(float(getattr(evt, field_name)) for evt in self.e2e_events) / self.e2e_count) + def _estimate_stage_wait_and_execution_times(self) -> None: + """Populate derived latency, queue wait, and execution times on stage events.""" for stage_id in range(self.num_stages): stage_evts = sorted( [ @@ -422,19 +444,19 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> ) for stage_id in stage_ids: stage_latency_ms = sum( - float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0) + self._stage_latency_ms(evt) for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id ) stage_queue_wait_ms = sum( - float(evt.stage_queue_wait_time_ms or 0.0) + self._optional_ms(evt.stage_queue_wait_time_ms) for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id ) stage_execution_ms = sum( - float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0) + self._stage_execution_ms(evt) for stage_evts in self.stage_events.values() for evt in stage_evts if evt.stage_id == stage_id @@ -561,12 +583,12 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> self._summary_header(f"Stage {stage_id} Breakdown", "-"), self._summary_line( "Stage latency time (ms):", - float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0), + self._stage_latency_ms(evt), ), - self._summary_line("Queue wait time (ms):", float(evt.stage_queue_wait_time_ms or 0.0)), + self._summary_line("Queue wait time (ms):", self._optional_ms(evt.stage_queue_wait_time_ms)), self._summary_line( "Execution time (ms):", - float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0), + self._stage_execution_ms(evt), ), self._summary_line( "Output processor time (ms):", @@ -802,7 +824,7 @@ def _as_stage_request_stats( stats.stage_name = self._get_stage_name(stage_id, stage_meta) stats.request_id = req_id stats.final_output_type = final_output_type - if stats.stage_latency_time_ms == 0.0: + if stats.stage_latency_time_ms is None: stats.stage_latency_time_ms = ( float(stats.stage_end_ts - stats.stage_submit_ts) * 1000.0 if stats.stage_submit_ts is not None and stats.stage_end_ts is not None @@ -986,16 +1008,8 @@ def build_and_log_summary(self) -> dict[str, Any]: return {} self._estimate_stage_wait_and_execution_times() wall_time_ms = max(0.0, (self.last_finish_ts - self.wall_start_ts) * 1000.0) - e2e_avg_req = ( - sum(float(evt.request_wall_time_ms) for evt in self.e2e_events) / self.e2e_count - if self.e2e_count > 0 - else 0.0 - ) - avg_engine_pipeline_ms = ( - sum(float(evt.engine_pipeline_time_ms) for evt in self.e2e_events) / self.e2e_count - if self.e2e_count > 0 - else 0.0 - ) + e2e_avg_req = self._average_e2e_field("request_wall_time_ms") + avg_engine_pipeline_ms = self._average_e2e_field("engine_pipeline_time_ms") avg_tok = (self.total_tokens * 1000.0 / wall_time_ms) if wall_time_ms > 0 else 0.0 first_engine_ts = min((ts for ts in self.stage_first_ts if ts is not None), default=None) engine_pipeline_wall_ms = ( @@ -1032,9 +1046,9 @@ def build_and_log_summary(self) -> dict[str, Any]: for stage_evts in self.stage_events.values(): for evt in stage_evts: stage_gen_total_ms += float(evt.stage_gen_time_ms or 0.0) - stage_latency_total_ms += float(evt.stage_latency_time_ms or evt.stage_gen_time_ms or 0.0) - stage_queue_wait_total_ms += float(evt.stage_queue_wait_time_ms or 0.0) - stage_execution_total_ms += float(evt.stage_execution_time_ms or evt.stage_gen_time_ms or 0.0) + stage_latency_total_ms += self._stage_latency_ms(evt) + stage_queue_wait_total_ms += self._optional_ms(evt.stage_queue_wait_time_ms) + stage_execution_total_ms += self._stage_execution_ms(evt) output_processor_total_ms += float(evt.output_processor_time_ms or 0.0) handoff_ms = float(evt.stage_handoff_time_ms or 0.0) ar2d_ms = float(evt.ar2diffusion_time_ms or 0.0) @@ -1063,22 +1077,14 @@ def build_and_log_summary(self) -> dict[str, Any]: "breakdown_delta_time_ms": float(breakdown_delta_ms), "total_tokens": int(self.total_tokens), "avg_request_wall_time_ms": float(e2e_avg_req), - "avg_input_preprocess_time_ms": float( - self.input_preprocess_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 - ), + "avg_input_preprocess_time_ms": self._average(self.input_preprocess_total_ms, self.e2e_count), "avg_engine_pipeline_time_ms": float(avg_engine_pipeline_ms), - "avg_stage_gen_total_time_ms": float(stage_gen_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0), - "avg_output_processor_time_ms": float( - output_processor_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 - ), - "avg_stage_handoff_total_time_ms": float( - stage_handoff_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 - ), - "avg_ar2diffusion_time_ms": float(ar2diffusion_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0), - "avg_final_output_time_ms": float( - self.final_output_total_ms / self.e2e_count if self.e2e_count > 0 else 0.0 - ), - "avg_breakdown_delta_time_ms": float(breakdown_delta_ms / self.e2e_count if self.e2e_count > 0 else 0.0), + "avg_stage_gen_total_time_ms": self._average(stage_gen_total_ms, self.e2e_count), + "avg_output_processor_time_ms": self._average(output_processor_total_ms, self.e2e_count), + "avg_stage_handoff_total_time_ms": self._average(stage_handoff_total_ms, self.e2e_count), + "avg_ar2diffusion_time_ms": self._average(ar2diffusion_total_ms, self.e2e_count), + "avg_final_output_time_ms": self._average(self.final_output_total_ms, self.e2e_count), + "avg_breakdown_delta_time_ms": self._average(breakdown_delta_ms, self.e2e_count), "avg_tokens_per_s": float(avg_tok), } for edge, handoff_time in sorted(handoff_edge_totals.items()): From d054cbc92d940143ea04a0eff1c924253a30a44c Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Tue, 12 May 2026 02:47:55 +0000 Subject: [PATCH 11/11] Limit logged request breakdowns Signed-off-by: bjf-frz --- tests/entrypoints/test_serve.py | 17 +++++++++++ tests/metrics/test_stats.py | 46 +++++++++++++++++++++++++++++ vllm_omni/entrypoints/async_omni.py | 1 + vllm_omni/entrypoints/cli/serve.py | 7 +++++ vllm_omni/entrypoints/omni.py | 1 + vllm_omni/entrypoints/omni_base.py | 1 + vllm_omni/metrics/stats.py | 27 ++++++++++++++++- 7 files changed, 99 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py index 1267a189a32..e02b79d9b58 100644 --- a/tests/entrypoints/test_serve.py +++ b/tests/entrypoints/test_serve.py @@ -31,6 +31,23 @@ def test_serve_parser_accepts_no_async_chunk() -> None: assert args.async_chunk is False +def test_serve_parser_accepts_log_stats_request_breakdown_limit() -> None: + try: + from vllm.utils.argparse_utils import FlexibleArgumentParser + except Exception as exc: + pytest.skip(f"Cannot build parser in this environment: {exc}") + + root = FlexibleArgumentParser() + subparsers = root.add_subparsers(dest="subcommand") + cmd = OmniServeCommand() + cmd.subparser_init(subparsers) + + argv = ["serve", "fake-model", "--omni", "--log-stats-request-breakdown-limit", "5"] + args = root.parse_args(argv) + + assert args.log_stats_request_breakdown_limit == 5 + + def _make_headless_args() -> argparse.Namespace: return argparse.Namespace( model="fake-model", diff --git a/tests/metrics/test_stats.py b/tests/metrics/test_stats.py index 357b6319319..b273184cef8 100644 --- a/tests/metrics/test_stats.py +++ b/tests/metrics/test_stats.py @@ -113,6 +113,52 @@ def test_build_and_log_summary_e2e_only() -> None: assert stage_entry["stages"] == [] +def test_request_breakdown_log_limit_does_not_truncate_summary_tables() -> None: + agg = OrchestratorAggregator( + num_stages=1, + log_stats=True, + wall_start_ts=0.0, + final_stage_id_for_e2e=0, + request_breakdown_limit=2, + ) + for idx in range(4): + agg.e2e_events.append( + RequestE2EStats( + request_id=f"r{idx}", + request_wall_time_ms=10.0, + input_preprocess_time_ms=1.0, + engine_pipeline_time_ms=9.0, + total_tokens=1, + transfers_total_time_ms=0.0, + transfers_total_bytes=0, + ) + ) + agg.e2e_count = 4 + + lines = agg._build_omni_metrics_summary_lines( + { + "num_of_requests": 4, + "request_wall_time_ms": 40.0, + "input_preprocess_wall_time_ms": 4.0, + "engine_pipeline_time_ms": 36.0, + "final_output_total_time_ms": 0.0, + "avg_input_preprocess_time_ms": 1.0, + "avg_final_output_time_ms": 0.0, + } + ) + rendered = "\n".join(lines) + + assert "Request r0 Breakdown" in rendered + assert "Request r1 Breakdown" in rendered + assert "Request r2 Breakdown" not in rendered + assert "Request r3 Breakdown" not in rendered + assert "Request breakdowns shown:" in rendered + assert "Request breakdowns omitted:" in rendered + + summary = agg.build_and_log_summary() + assert {entry["request_id"] for entry in summary["e2e_table"]} == {"r0", "r1", "r2", "r3"} + + def test_build_and_log_summary_multiple_requests() -> None: agg = OrchestratorAggregator( num_stages=2, log_stats=True, wall_start_ts=0.0, final_stage_id_for_e2e={"r1": 1, "r2": 0} diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index a57c2672e94..747a77f3717 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -302,6 +302,7 @@ async def generate( wall_start_ts, final_stage_id_for_e2e, stage_metadata=getattr(self.engine, "stage_metadata", None), + request_breakdown_limit=self.log_stats_request_breakdown_limit, ) req_state = ClientRequestState(request_id) req_state.metrics = metrics diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py index b4293d59fd7..9a28ff8076e 100644 --- a/vllm_omni/entrypoints/cli/serve.py +++ b/vllm_omni/entrypoints/cli/serve.py @@ -216,6 +216,13 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu action="store_true", help="Enable logging the stats.", ) + omni_config_group.add_argument( + "--log-stats-request-breakdown-limit", + type=int, + default=5, + help="Maximum number of per-request breakdowns to print in the final stats summary. " + "Use a negative value to print all request breakdowns.", + ) omni_config_group.add_argument( "--log-file", type=str, diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py index ebf1301c0a0..8485d852eec 100644 --- a/vllm_omni/entrypoints/omni.py +++ b/vllm_omni/entrypoints/omni.py @@ -126,6 +126,7 @@ def _run_generation( wall_start_ts, req_final_stage_ids, stage_metadata=getattr(self.engine, "stage_metadata", None), + request_breakdown_limit=self.log_stats_request_breakdown_limit, ) for req_id, prompt in zip(request_ids, request_prompts): diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 10b64deab76..0d0b230b188 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -143,6 +143,7 @@ def __init__( stage_init_timeout = kwargs.pop("stage_init_timeout", 300) init_timeout = kwargs.pop("init_timeout", 600) log_stats = kwargs.pop("log_stats", False) + self.log_stats_request_breakdown_limit = kwargs.pop("log_stats_request_breakdown_limit", 5) self._enable_ar_profiler = kwargs.pop("enable_ar_profiler", False) # NOTE: read-only lookup — must NOT pop. Popping here drops the key # before it reaches ``StageConfigFactory._create_from_registry``, so diff --git a/vllm_omni/metrics/stats.py b/vllm_omni/metrics/stats.py index 90f575b54be..f3f7d5c9074 100644 --- a/vllm_omni/metrics/stats.py +++ b/vllm_omni/metrics/stats.py @@ -161,6 +161,7 @@ def e2e_tpt(self) -> float: STAGE_FIELDS = _build_field_defs(StageRequestStats, STAGE_EXCLUDE, FIELD_TRANSFORMS) TRANSFER_FIELDS = _build_field_defs(TransferEdgeStats, TRANSFER_EXCLUDE, FIELD_TRANSFORMS) E2E_FIELDS = _build_field_defs(RequestE2EStats, E2E_EXCLUDE, FIELD_TRANSFORMS) +DEFAULT_REQUEST_BREAKDOWN_LIMIT = 5 class OrchestratorAggregator: @@ -171,11 +172,13 @@ def __init__( wall_start_ts: float, final_stage_id_for_e2e: dict[str, int] | int, stage_metadata: list[dict[str, Any]] | None = None, + request_breakdown_limit: int | None = DEFAULT_REQUEST_BREAKDOWN_LIMIT, ) -> None: self.num_stages = int(num_stages) self.log_stats = bool(log_stats) self.final_stage_id_for_e2e = final_stage_id_for_e2e self.stage_metadata = list(stage_metadata or []) + self.request_breakdown_limit = self._normalize_request_breakdown_limit(request_breakdown_limit) self.init_run_state(wall_start_ts) self.stage_events: dict[str, list[StageRequestStats]] = {} self.transfer_events: dict[ @@ -236,6 +239,13 @@ def _format_seconds(self, ms: float) -> str: def _format_ms(self, ms: float) -> str: return f"{ms:,.3f} ms" + @staticmethod + def _normalize_request_breakdown_limit(limit: int | None) -> int | None: + if limit is None: + return None + limit = int(limit) + return None if limit < 0 else limit + def _log_omni_timing(self, evt: RequestE2EStats) -> None: rid = evt.request_id stages = [] @@ -558,7 +568,22 @@ def _build_omni_metrics_summary_lines(self, overall_summary: dict[str, Any]) -> ] ) - for rid in sorted(set(self.stage_events.keys()) | {evt.request_id for evt in self.e2e_events}): + all_breakdown_request_ids = sorted(set(self.stage_events.keys()) | {evt.request_id for evt in self.e2e_events}) + if self.request_breakdown_limit is None: + breakdown_request_ids = all_breakdown_request_ids + else: + breakdown_request_ids = all_breakdown_request_ids[: self.request_breakdown_limit] + omitted_breakdowns = len(all_breakdown_request_ids) - len(breakdown_request_ids) + if omitted_breakdowns > 0: + lines.extend( + [ + "", + self._summary_line("Request breakdowns shown:", len(breakdown_request_ids)), + self._summary_line("Request breakdowns omitted:", omitted_breakdowns), + ] + ) + + for rid in breakdown_request_ids: stage_evts = sorted( self.stage_events.get(rid, []), key=lambda evt: evt.stage_id if evt.stage_id is not None else -1,