diff --git a/tests/v1/metrics/test_stats.py b/tests/v1/metrics/test_stats.py
index d49874adc998..58ab358432d6 100644
--- a/tests/v1/metrics/test_stats.py
+++ b/tests/v1/metrics/test_stats.py
@@ -116,7 +116,7 @@ def test_prompt_token_stats_all_computed():
     # Case 1: No caching (All tokens computed locally)
     stats.update_from_output(
         num_cached_tokens=0,
-        num_external_computed_tokens=0,
+        num_external_cached_tokens=0,
         prompt_len=1000,
     )
 
@@ -133,7 +133,7 @@ def test_prompt_token_stats_partial_local_cache():
     # Case 2: Partial local cache
     stats.update_from_output(
         num_cached_tokens=300,
-        num_external_computed_tokens=0,
+        num_external_cached_tokens=0,
         prompt_len=1000,
     )
 
@@ -149,7 +149,7 @@ def test_prompt_token_stats_partial_external_transfer():
     # Case 3: Partial external transfer
     stats.update_from_output(
         num_cached_tokens=500,
-        num_external_computed_tokens=500,
+        num_external_cached_tokens=500,
         prompt_len=1000,
     )
 
@@ -165,7 +165,7 @@ def test_prompt_token_stats_mixed_sources():
     # Case 4: Mixed sources
     stats.update_from_output(
         num_cached_tokens=600,
-        num_external_computed_tokens=200,
+        num_external_cached_tokens=200,
         prompt_len=1000,
     )
 
@@ -185,7 +185,7 @@ def test_prompt_token_stats_full_local_cache_recompute():
     # Case 5: Full local cache (999 cached after reduction, 1 recomputed)
     stats.update_from_output(
         num_cached_tokens=999,
-        num_external_computed_tokens=0,
+        num_external_cached_tokens=0,
         prompt_len=1000,
     )
 
@@ -201,7 +201,7 @@ def test_prompt_token_stats_full_external_transfer_recompute():
     # Case 6: Full external transfer (999 cached after reduction, 1 recomputed)
     stats.update_from_output(
         num_cached_tokens=999,
-        num_external_computed_tokens=1000,
+        num_external_cached_tokens=1000,
         prompt_len=1000,
     )
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index fe524ccace16..25fc4fa61cca 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -829,6 +829,9 @@ def schedule(self) -> SchedulerOutput:
             # Count the number of prefix cached tokens.
             if request.num_cached_tokens < 0:
                 request.num_cached_tokens = num_computed_tokens
+                request.num_external_cached_tokens = (
+                    request.num_external_computed_tokens
+                )
             # Encoder-related.
             if encoder_inputs_to_schedule:
                 scheduled_encoder_inputs[request_id] = encoder_inputs_to_schedule
@@ -1468,7 +1471,7 @@ def update_from_output(
                 kv_transfer_params=kv_transfer_params,
                 trace_headers=request.trace_headers,
                 num_cached_tokens=request.num_cached_tokens,
-                num_external_computed_tokens=request.num_external_computed_tokens,
+                num_external_cached_tokens=request.num_external_cached_tokens,
                 routed_experts=routed_experts,
                 num_nans_in_logits=request.num_nans_in_logits,
             )
@@ -2072,6 +2075,7 @@ def _update_waiting_for_remote_kv(self, request: Request) -> None:
         # Count the number of prefix cached tokens.
         if request.num_cached_tokens < 0:
             request.num_cached_tokens = request.num_computed_tokens
+            request.num_external_cached_tokens = request.num_external_computed_tokens
 
         self.finished_recving_kv_req_ids.remove(request.request_id)
 
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 114d45fc4ff7..ea96ce6da880 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -159,8 +159,8 @@ class EngineCoreOutput(
     trace_headers: Mapping[str, str] | None = None
     # The number of tokens with prefix cache hits (local + external).
     num_cached_tokens: int = 0
-    # The number of tokens computed remotely (original count from connector).
-    num_external_computed_tokens: int = 0
+    # The number of tokens with external prefix cache hits.
+    num_external_cached_tokens: int = 0
     routed_experts: np.ndarray | None = None
     # The number of NaNs in logits.
     # A value greater than 0 indicates that the output is corrupted.
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 45f002e01edb..db2dfa1194cd 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -270,7 +270,7 @@ class PromptTokenStats:
     def update_from_output(
         self,
         num_cached_tokens: int,
-        num_external_computed_tokens: int,
+        num_external_cached_tokens: int,
         prompt_len: int,
     ) -> None:
         """Update stats from a prefill output."""
@@ -280,7 +280,7 @@ def update_from_output(
         recomputed = 1 if (num_cached_tokens + 1 == prompt_len) else 0
 
         self.computed += prompt_len - num_cached_tokens
-        self.external_kv_transfer += num_external_computed_tokens
+        self.external_kv_transfer += num_external_cached_tokens
         # FIXME(yifan): local_cache_hit can go negative after preemption.
         # num_cached_tokens is a one-time snapshot from first scheduling and
         # is never reset on preemption, while num_external_computed_tokens is
@@ -290,7 +290,7 @@ def update_from_output(
         # as a separate metric rather than reusing num_external_computed_tokens
         # for metric directly.
         self.local_cache_hit += max(
-            0, (num_cached_tokens + recomputed - num_external_computed_tokens)
+            0, (num_cached_tokens + recomputed - num_external_cached_tokens)
         )
         self.cached_tokens += num_cached_tokens
         self.recomputed_tokens += recomputed
@@ -352,7 +352,7 @@ def update_from_output(
         if is_prefilling:
             self.prompt_token_stats.update_from_output(
                 num_cached_tokens=output.num_cached_tokens,
-                num_external_computed_tokens=output.num_external_computed_tokens,
+                num_external_cached_tokens=output.num_external_cached_tokens,
                 prompt_len=prompt_len,
             )
 
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 946e71c15d35..e30575b12884 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -162,6 +162,9 @@ def __init__(
         # The number of tokens that have been computed remotely.
         self.num_external_computed_tokens = 0
 
+        # The number of tokens fetched from an external KV cache.
+        self.num_external_cached_tokens = -1
+
         self.block_hashes: list[BlockHash] = []
         # Store the block hasher without binding self to avoid creating a
         # reference cycle (Request -> partial -> Request) that prevents
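
For reference, a minimal sketch of the prompt-token accounting that `PromptTokenStats.update_from_output` performs after this rename, followed by Case 4 from the tests as a usage check. The `PromptTokenStats` stand-in below is illustrative, carrying only the fields this diff touches, not the full class from `vllm/v1/metrics/stats.py`:

```python
from dataclasses import dataclass


@dataclass
class PromptTokenStats:
    """Illustrative stand-in; only the fields touched by this diff."""

    computed: int = 0
    recomputed_tokens: int = 0
    cached_tokens: int = 0
    local_cache_hit: int = 0
    external_kv_transfer: int = 0

    def update_from_output(
        self,
        num_cached_tokens: int,
        num_external_cached_tokens: int,
        prompt_len: int,
    ) -> None:
        # A fully cached prompt is reduced by one token so the last token
        # is recomputed (the "999 cached after reduction, 1 recomputed"
        # situation in test cases 5 and 6).
        recomputed = 1 if (num_cached_tokens + 1 == prompt_len) else 0

        self.computed += prompt_len - num_cached_tokens
        self.external_kv_transfer += num_external_cached_tokens
        # Local hits are whatever remains of the cached tokens after
        # subtracting the externally transferred ones, clamped at zero
        # (see the FIXME about preemption in stats.py).
        self.local_cache_hit += max(
            0, (num_cached_tokens + recomputed - num_external_cached_tokens)
        )
        self.cached_tokens += num_cached_tokens
        self.recomputed_tokens += recomputed


# Case 4 from the tests ("mixed sources"): 600 cached tokens, of which
# 200 arrived via an external KV transfer, for a 1000-token prompt.
stats = PromptTokenStats()
stats.update_from_output(
    num_cached_tokens=600,
    num_external_cached_tokens=200,
    prompt_len=1000,
)
assert stats.computed == 400             # 1000 - 600
assert stats.external_kv_transfer == 200
assert stats.local_cache_hit == 400      # 600 + 0 - 200
```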