tests/v1/metrics/test_stats.py (12 changes: 6 additions & 6 deletions)
@@ -116,7 +116,7 @@ def test_prompt_token_stats_all_computed():
     # Case 1: No caching (All tokens computed locally)
     stats.update_from_output(
         num_cached_tokens=0,
-        num_external_computed_tokens=0,
+        num_external_cached_tokens=0,
         prompt_len=1000,
     )

@@ -133,7 +133,7 @@ def test_prompt_token_stats_partial_local_cache():
     # Case 2: Partial local cache
     stats.update_from_output(
         num_cached_tokens=300,
-        num_external_computed_tokens=0,
+        num_external_cached_tokens=0,
         prompt_len=1000,
     )

@@ -149,7 +149,7 @@ def test_prompt_token_stats_partial_external_transfer():
     # Case 3: Partial external transfer
     stats.update_from_output(
         num_cached_tokens=500,
-        num_external_computed_tokens=500,
+        num_external_cached_tokens=500,
         prompt_len=1000,
     )

@@ -165,7 +165,7 @@ def test_prompt_token_stats_mixed_sources():
     # Case 4: Mixed sources
     stats.update_from_output(
         num_cached_tokens=600,
-        num_external_computed_tokens=200,
+        num_external_cached_tokens=200,
         prompt_len=1000,
     )

@@ -185,7 +185,7 @@ def test_prompt_token_stats_full_local_cache_recompute():
     # Case 5: Full local cache (999 cached after reduction, 1 recomputed)
    stats.update_from_output(
         num_cached_tokens=999,
-        num_external_computed_tokens=0,
+        num_external_cached_tokens=0,
         prompt_len=1000,
     )

@@ -201,7 +201,7 @@ def test_prompt_token_stats_full_external_transfer_recompute():
     # Case 6: Full external transfer (999 cached after reduction, 1 recomputed)
     stats.update_from_output(
         num_cached_tokens=999,
-        num_external_computed_tokens=1000,
+        num_external_cached_tokens=1000,
         prompt_len=1000,
     )

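For readers tracing the arithmetic these cases exercise, here is a minimal standalone sketch of the accounting: a hypothetical helper mirroring the PromptTokenStats.update_from_output logic shown in the vllm/v1/metrics/stats.py diff below, not the real class.

# Hypothetical helper mirroring the accounting in
# PromptTokenStats.update_from_output (see the stats.py diff below).
def split_prompt_tokens(
    num_cached_tokens: int,
    num_external_cached_tokens: int,
    prompt_len: int,
) -> dict[str, int]:
    # A full-prefix hit is reduced by one so there is a token left to
    # prefill; that single token counts as "recomputed".
    recomputed = 1 if num_cached_tokens + 1 == prompt_len else 0
    return {
        "computed": prompt_len - num_cached_tokens,
        "external_kv_transfer": num_external_cached_tokens,
        # Local hits are whatever the cache served beyond the external
        # share, clamped at zero (see the FIXME in stats.py).
        "local_cache_hit": max(
            0, num_cached_tokens + recomputed - num_external_cached_tokens
        ),
        "recomputed": recomputed,
    }

# Case 4 above (mixed sources): 600 cached in total, 200 of them external.
assert split_prompt_tokens(600, 200, 1000) == {
    "computed": 400,
    "external_kv_transfer": 200,
    "local_cache_hit": 400,
    "recomputed": 0,
}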
vllm/v1/core/sched/scheduler.py (6 changes: 5 additions & 1 deletion)
@@ -829,6 +829,9 @@ def schedule(self) -> SchedulerOutput:
         # Count the number of prefix cached tokens.
         if request.num_cached_tokens < 0:
             request.num_cached_tokens = num_computed_tokens
+            request.num_external_cached_tokens = (
+                request.num_external_computed_tokens
+            )
         # Encoder-related.
         if encoder_inputs_to_schedule:
             scheduled_encoder_inputs[request_id] = encoder_inputs_to_schedule
@@ -1468,7 +1471,7 @@ def update_from_output(
                 kv_transfer_params=kv_transfer_params,
                 trace_headers=request.trace_headers,
                 num_cached_tokens=request.num_cached_tokens,
-                num_external_computed_tokens=request.num_external_computed_tokens,
+                num_external_cached_tokens=request.num_external_cached_tokens,
Contributor (critical):

The field name passed to EngineCoreOutput has been changed from num_external_computed_tokens to num_external_cached_tokens. However, the corresponding updates to the EngineCoreOutput class definition (likely in vllm/v1/engine.py) and the metrics processing logic in vllm/v1/metrics/stats.py are missing from the provided patches. As seen in the provided file content for vllm/v1/metrics/stats.py (line 355), the code still attempts to access output.num_external_computed_tokens, which will result in an AttributeError at runtime. Additionally, the PromptTokenStats.update_from_output method signature (line 270) and its calls in the tests (which are updated in this PR) will be inconsistent, leading to TypeError during test execution. Please ensure that the refactoring is completed across all affected files.

Author:

fixed
                 routed_experts=routed_experts,
                 num_nans_in_logits=request.num_nans_in_logits,
             )
@@ -2072,6 +2075,7 @@ def _update_waiting_for_remote_kv(self, request: Request) -> None:
         # Count the number of prefix cached tokens.
         if request.num_cached_tokens < 0:
             request.num_cached_tokens = request.num_computed_tokens
+            request.num_external_cached_tokens = request.num_external_computed_tokens
 
         self.finished_recving_kv_req_ids.remove(request.request_id)

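Both scheduler sites follow the same one-time-snapshot pattern: the num_cached_tokens < 0 sentinel guard means the pair of counters is captured on the first scheduling pass and never overwritten afterwards. A minimal sketch of the pattern, using a hypothetical standalone class rather than the real Request:

# One-time snapshot via a -1 sentinel: the first pass that sees the
# sentinel records both counts; later passes leave them untouched.
class _SnapshotSketch:
    def __init__(self) -> None:
        self.num_cached_tokens = -1           # -1 means "not yet snapshotted"
        self.num_external_cached_tokens = -1

    def on_schedule(self, num_computed: int, num_external_computed: int) -> None:
        if self.num_cached_tokens < 0:
            self.num_cached_tokens = num_computed
            self.num_external_cached_tokens = num_external_computed

req = _SnapshotSketch()
req.on_schedule(600, 200)  # first scheduling pass: snapshot taken
req.on_schedule(950, 400)  # later pass (e.g. after preemption): ignored
assert (req.num_cached_tokens, req.num_external_cached_tokens) == (600, 200)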
vllm/v1/engine/__init__.py (4 changes: 2 additions & 2 deletions)
@@ -159,8 +159,8 @@ class EngineCoreOutput(
     trace_headers: Mapping[str, str] | None = None
     # The number of tokens with prefix cache hits (local + external).
     num_cached_tokens: int = 0
-    # The number of tokens computed remotely (original count from connector).
-    num_external_computed_tokens: int = 0
+    # The number of tokens with external prefix cache hits.
+    num_external_cached_tokens: int = 0
     routed_experts: np.ndarray | None = None
     # The number of NaNs in logits.
     # A value greater than 0 indicates that the output is corrupted.
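The two fields are not independent: num_cached_tokens is the combined local-plus-external count and num_external_cached_tokens is the external share, so the metrics side recovers local hits by subtraction. A small illustration with a simplified dataclass, assuming only the field semantics stated in the comments above:

from dataclasses import dataclass

@dataclass
class _OutputSketch:
    num_cached_tokens: int = 0            # local + external prefix-cache hits
    num_external_cached_tokens: int = 0   # external hits only

out = _OutputSketch(num_cached_tokens=600, num_external_cached_tokens=200)
# Local hits fall out by subtraction; the clamp guards the preemption case
# where the external count can exceed the one-time num_cached_tokens snapshot.
local_hits = max(0, out.num_cached_tokens - out.num_external_cached_tokens)  # 400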
vllm/v1/metrics/stats.py (8 changes: 4 additions & 4 deletions)
@@ -270,7 +270,7 @@ class PromptTokenStats:
     def update_from_output(
         self,
         num_cached_tokens: int,
-        num_external_computed_tokens: int,
+        num_external_cached_tokens: int,
         prompt_len: int,
     ) -> None:
         """Update stats from a prefill output."""
@@ -280,7 +280,7 @@ def update_from_output(
         recomputed = 1 if (num_cached_tokens + 1 == prompt_len) else 0
 
         self.computed += prompt_len - num_cached_tokens
-        self.external_kv_transfer += num_external_computed_tokens
+        self.external_kv_transfer += num_external_cached_tokens
         # FIXME(yifan): local_cache_hit can go negative after preemption.
         # num_cached_tokens is a one-time snapshot from first scheduling and
         # is never reset on preemption, while num_external_computed_tokens is
@@ -290,7 +290,7 @@ def update_from_output(
         # as a separate metric rather than reusing num_external_computed_tokens
         # for metric directly.
         self.local_cache_hit += max(
-            0, (num_cached_tokens + recomputed - num_external_computed_tokens)
+            0, (num_cached_tokens + recomputed - num_external_cached_tokens)
         )
         self.cached_tokens += num_cached_tokens
         self.recomputed_tokens += recomputed
@@ -352,7 +352,7 @@ def update_from_output(
         if is_prefilling:
             self.prompt_token_stats.update_from_output(
                 num_cached_tokens=output.num_cached_tokens,
-                num_external_computed_tokens=output.num_external_computed_tokens,
+                num_external_cached_tokens=output.num_external_cached_tokens,
                 prompt_len=prompt_len,
             )

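To see how the max(0, ...) clamp and the recomputed adjustment interact, it helps to trace Case 6 from the tests above by hand, with values taken from the test and arithmetic as in update_from_output:

# Case 6: full external transfer, reduced to 999 cached so one token
# is recomputed for the first decode step.
num_cached_tokens, num_external_cached_tokens, prompt_len = 999, 1000, 1000

recomputed = 1 if num_cached_tokens + 1 == prompt_len else 0          # 1
computed = prompt_len - num_cached_tokens                             # 1
# 999 + 1 - 1000 == 0 here; after a preemption the external count can
# exceed the one-time num_cached_tokens snapshot, which is what the
# clamp (and the FIXME above) is about.
local_cache_hit = max(
    0, num_cached_tokens + recomputed - num_external_cached_tokens
)                                                                     # 0

assert (computed, local_cache_hit, recomputed) == (1, 0, 1)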
vllm/v1/request.py (3 changes: 3 additions & 0 deletions)
@@ -162,6 +162,9 @@ def __init__(
         # The number of tokens that have been computed remotely.
         self.num_external_computed_tokens = 0
 
+        # The number of tokens fetched from an external KV cache.
+        self.num_external_cached_tokens = -1
+
         self.block_hashes: list[BlockHash] = []
         # Store the block hasher without binding self to avoid creating a
         # reference cycle (Request -> partial -> Request) that prevents