-
-
Notifications
You must be signed in to change notification settings - Fork 15.7k
Bugfix: zero-initialized metrics for failed/aborted/skipped requests throwing off histograms #32726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -213,20 +213,49 @@ class RequestStateStats: | |
| is_corrupted: bool = False | ||
|
|
||
|
|
||
| @dataclass | ||
| class ScheduledTiming: | ||
| """Timing for a request that was scheduled but didn't generate tokens. | ||
|
|
||
| This occurs when a request is aborted during prefill or fails due to | ||
| errors like KV load failures. | ||
| """ | ||
|
|
||
| queued_time: float | ||
|
|
||
|
|
||
| @dataclass | ||
| class CompletedTiming: | ||
| """Timing for a request that generated at least one token.""" | ||
|
|
||
| queued_time: float | ||
| prefill_time: float | ||
| decode_time: float | ||
| inference_time: float | ||
| # None if request generated only a single token | ||
| mean_time_per_output_token: float | None = None | ||
|
|
||
|
|
||
| RequestTiming = ScheduledTiming | CompletedTiming | None | ||
|
|
||
|
|
||
| @dataclass | ||
| class FinishedRequestStats: | ||
| """Stats associated with a finished request.""" | ||
| """Stats associated with a finished request. | ||
|
|
||
| The timing field uses its type to encode how far the | ||
| request progressed before finishing: | ||
| - None: rejected before scheduling (e.g., client abort before scheduling) | ||
| - ScheduledTiming: scheduled but no tokens generated (abort/error during prefill) | ||
| - CompletedTiming: generated at least one token (normal completion) | ||
| """ | ||
|
|
||
| finish_reason: "FinishReason" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I particularly don't like the way this data structure has this type-like field (which you might expect would determine which fields below are valid), but then we have this other field. We could make it all more coherent, but it doesn't seem like a big deal to just say some of the fields may not be valid, and we'll only observe the fields that are valid. Some docs here explaining that some of the fields may not be valid would be nice though. |
||
| e2e_latency: float = 0.0 | ||
| num_prompt_tokens: int = 0 | ||
| num_generation_tokens: int = 0 | ||
| max_tokens_param: int | None = None | ||
| queued_time: float = 0.0 | ||
| prefill_time: float = 0.0 | ||
| inference_time: float = 0.0 | ||
| decode_time: float = 0.0 | ||
| mean_time_per_output_token: float = 0.0 | ||
| timing: RequestTiming = None | ||
| is_corrupted: bool = False | ||
| num_cached_tokens: int = 0 | ||
|
|
||
|
|
@@ -342,6 +371,10 @@ def update_from_output( | |
| prompt_len=prompt_len, | ||
| ) | ||
|
|
||
| # Only record first token latency when a token was actually generated. | ||
| # is_prefilling can be True even when no tokens are produced (e.g., | ||
| # KV-cache load failures, aborts during prefill). | ||
| if is_prefilling and num_new_generation_tokens > 0: | ||
| first_token_latency = self._time_since(req_stats.arrival_time) | ||
| self.time_to_first_tokens_iter.append(first_token_latency) | ||
| req_stats.first_token_latency = first_token_latency | ||
|
|
@@ -368,14 +401,16 @@ def update_from_output( | |
| lora_name, | ||
| ) | ||
|
|
||
| # Process the batch-level "new tokens" engine core event | ||
| if is_prefilling: | ||
| req_stats.first_token_ts = engine_core_timestamp | ||
| else: | ||
| itl = engine_core_timestamp - req_stats.last_token_ts | ||
| self.inter_token_latencies_iter.append(itl) | ||
| # Process the batch-level "new tokens" engine core event. | ||
| # Only update timestamps when tokens were actually generated. | ||
| if num_new_generation_tokens > 0: | ||
| if is_prefilling: | ||
| req_stats.first_token_ts = engine_core_timestamp | ||
| else: | ||
| itl = engine_core_timestamp - req_stats.last_token_ts | ||
| self.inter_token_latencies_iter.append(itl) | ||
|
|
||
| req_stats.last_token_ts = engine_core_timestamp | ||
| req_stats.last_token_ts = engine_core_timestamp | ||
|
|
||
| def update_from_events( | ||
| self, | ||
|
|
@@ -411,39 +446,57 @@ def update_from_finished_request( | |
| ): | ||
| e2e_latency = self._time_since(req_stats.arrival_time) | ||
|
|
||
| # Queued interval is from first QUEUED event to first SCHEDULED | ||
| queued_time = req_stats.scheduled_ts - req_stats.queued_ts | ||
|
|
||
| # Prefill interval is from first SCHEDULED to first NEW_TOKEN | ||
| # Any preemptions during prefill is included in the interval | ||
| prefill_time = req_stats.first_token_ts - req_stats.scheduled_ts | ||
|
|
||
| # Decode interval is from first NEW_TOKEN to last NEW_TOKEN | ||
| # Any preemptions during decode are included | ||
| decode_time = req_stats.last_token_ts - req_stats.first_token_ts | ||
|
|
||
| # Inference interval is from first SCHEDULED to last NEW_TOKEN | ||
| # Any preemptions during prefill or decode are included | ||
| inference_time = req_stats.last_token_ts - req_stats.scheduled_ts | ||
|
|
||
| # Do not count the token generated by the prefill phase | ||
| mean_time_per_output_token = ( | ||
| decode_time / (req_stats.num_generation_tokens - 1) | ||
| if req_stats.num_generation_tokens - 1 > 0 | ||
| else 0 | ||
| ) | ||
| # build timing based on how far the request progressed | ||
| timing: RequestTiming = None | ||
| was_queued = req_stats.queued_ts > 0 | ||
| was_scheduled = req_stats.scheduled_ts > 0 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Honestly, I'm pretty sceptical of all of this new logic to determine "how far the request progressed". The fix, in the end, is nothing more than: [code snippet omitted in extraction]. AFAICT, we don't actually use this "how far the request progressed" information ... |
||
| got_first_token = req_stats.first_token_ts > 0 | ||
| got_last_token = req_stats.last_token_ts > 0 | ||
|
|
||
| if was_queued and was_scheduled: | ||
| # queued: from first QUEUED event to first SCHEDULED | ||
| queued_time = req_stats.scheduled_ts - req_stats.queued_ts | ||
|
|
||
| if got_first_token and got_last_token: | ||
| # request generated tokens - full timing available | ||
|
|
||
| # prefill: from first SCHEDULED to first NEW_TOKEN | ||
| # (any preemptions during prefill are included) | ||
| prefill_time = req_stats.first_token_ts - req_stats.scheduled_ts | ||
|
|
||
| # decode: from first NEW_TOKEN to last NEW_TOKEN | ||
| # (any preemptions during decode are included) | ||
| decode_time = req_stats.last_token_ts - req_stats.first_token_ts | ||
|
|
||
| # inference: from first SCHEDULED to last NEW_TOKEN | ||
| # (any preemptions during prefill or decode are included) | ||
| inference_time = req_stats.last_token_ts - req_stats.scheduled_ts | ||
|
|
||
| # don't count the token generated by the prefill phase | ||
| mean_tpot = ( | ||
| decode_time / (req_stats.num_generation_tokens - 1) | ||
| if req_stats.num_generation_tokens > 1 | ||
| else None | ||
| ) | ||
| timing = CompletedTiming( | ||
| queued_time=queued_time, | ||
| prefill_time=prefill_time, | ||
| decode_time=decode_time, | ||
| inference_time=inference_time, | ||
| mean_time_per_output_token=mean_tpot, | ||
| ) | ||
| else: | ||
| # scheduled but no tokens (abort during prefill, KV error, etc.) | ||
| timing = ScheduledTiming(queued_time=queued_time) | ||
| # else: timing stays None (rejected before scheduling) | ||
|
|
||
| finished_req = FinishedRequestStats( | ||
| finish_reason=finish_reason, | ||
| e2e_latency=e2e_latency, | ||
| num_prompt_tokens=num_prompt_tokens, | ||
| num_generation_tokens=req_stats.num_generation_tokens, | ||
| max_tokens_param=max_tokens_param, | ||
| queued_time=queued_time, | ||
| prefill_time=prefill_time, | ||
| inference_time=inference_time, | ||
| decode_time=decode_time, | ||
| mean_time_per_output_token=mean_time_per_output_token, | ||
| timing=timing, | ||
| is_corrupted=req_stats.is_corrupted, | ||
| num_cached_tokens=num_cached_tokens, | ||
| ) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The `ScheduledTiming` dataclass name, while technically accurate, could be made more explicit to immediately convey its purpose in the context of requests that did not complete token generation. Given that this PR addresses issues with zero-initialized metrics for failed/aborted/skipped requests, a name like `ScheduledButIncompleteTiming` or `AbortedOrFailedScheduledTiming` might more clearly communicate that this timing object represents a request that progressed to scheduling but did not reach full completion. This improves maintainability by making the intent clearer at a glance, reducing potential for misinterpretation of critical metrics.