Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions tests/v1/metrics/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,53 @@ def test_prompt_token_stats_full_external_transfer_recompute():
assert stats.local_cache_hit == 0
assert stats.external_kv_transfer == 1000
assert stats.recomputed_tokens == 1


def test_prompt_token_stats_pd_disagg_external_exceeds_cached():
    """P/D disaggregation: external KV transfer larger than the local cache.

    A decode instance can receive more tokens over external KV transfer
    than it holds in its local cache. Historically that drove
    local_cache_hit negative, which crashed the Prometheus counters.
    """
    stats = PromptTokenStats()

    # Decode receives the full 7000-token prompt from prefill while its
    # local cache is empty; local_cache_hit must clamp to 0, not -7000.
    stats.update_from_output(
        num_cached_tokens=0,
        num_external_computed_tokens=7000,
        prompt_len=7000,
    )

    assert stats.computed == 7000  # prompt_len - num_cached_tokens
    assert stats.local_cache_hit == 0  # clamped to 0, never -7000
    assert stats.external_kv_transfer == 7000
    assert stats.total == 7000

    # Prometheus counters require every tracked value to be non-negative.
    for attr in (
        "computed",
        "local_cache_hit",
        "external_kv_transfer",
        "cached_tokens",
        "recomputed_tokens",
    ):
        assert getattr(stats, attr) >= 0


def test_prompt_token_stats_pd_disagg_partial_overlap():
    """P/D disagg where external transfer only partially overlaps the cache."""
    stats = PromptTokenStats()

    # num_cached_tokens=100 alongside num_external=500:
    # old behavior: local_cache_hit = 100 - 500 = -400 (the bug);
    # new behavior clamps it: max(0, 100 - 500) = 0.
    stats.update_from_output(
        num_cached_tokens=100,
        num_external_computed_tokens=500,
        prompt_len=1000,
    )

    assert stats.computed == 900  # 1000 - 100
    assert stats.local_cache_hit == 0  # clamped from -400
    assert stats.external_kv_transfer == 500
    assert stats.local_cache_hit >= 0  # Prometheus counters need >= 0
6 changes: 4 additions & 2 deletions vllm/v1/metrics/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,10 @@ def update_from_output(

self.computed += prompt_len - num_cached_tokens
self.external_kv_transfer += num_external_computed_tokens
self.local_cache_hit += (
num_cached_tokens + recomputed - num_external_computed_tokens
# Clamp to non-negative: in P/D disagg, external tokens can exceed
# local cached tokens, making this calculation negative.
self.local_cache_hit += max(
0, num_cached_tokens + recomputed - num_external_computed_tokens
)
Comment on lines +280 to 282
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This change correctly prevents a crash by ensuring the local_cache_hit increment is non-negative. However, it introduces a critical issue by breaking the accounting invariants for PromptTokenStats documented in the class docstring (lines 246-248).

The invariants are:

  1. computed + local_cache_hit + external_kv_transfer - recomputed_tokens = total
  2. local_cache_hit + external_kv_transfer - recomputed_tokens = cached_tokens

When num_cached_tokens + recomputed - num_external_computed_tokens is negative, clamping its contribution to local_cache_hit at 0 breaks these identities, so the reported metrics become mutually inconsistent and incorrect.

For example, let X = num_cached_tokens + recomputed - num_external_computed_tokens. With the original code, the change (delta) to both sides of the first invariant was prompt_len. With this new change, when X < 0, the delta of the left-hand side becomes prompt_len - X, which does not equal prompt_len (the delta of the right-hand side). This means the metrics no longer add up correctly.

A more robust fix would adjust other metrics like computed and cached_tokens to compensate for clamping local_cache_hit, thus preserving the invariants. This would require a more comprehensive change to the update logic in this method to ensure all metrics remain consistent.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@markmc any suggestion on this? Thank you!

self.cached_tokens += num_cached_tokens
self.recomputed_tokens += recomputed
Expand Down