Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def test_multi_block_correctness():


def test_cold_decode_no_cache_hit_metrics():
-"""Cold decode: external_kv_transfer==P, local_cache_hit==0."""
+"""Cold decode: external_kv_transfer==P, local_cache_hit==0, local_compute==0."""
n0 = _fetch_nixl_bytes(DECODE_HOST, DECODE_PORT)
m0 = _fetch_decode_metrics()
proxy_text, P = _complete(proxy_client, MEDIUM_PROMPT)
Expand All @@ -312,8 +312,8 @@ def test_cold_decode_no_cache_hit_metrics():
assert d["external_kv_transfer"] == P, (
f"expected external_kv_transfer={P}, got {d['external_kv_transfer']}"
)
-assert d["local_compute"] == 1, (
-f"expected local_compute=1, got {d['local_compute']}"
+assert d["local_compute"] == 0, (
+f"expected local_compute=0, got {d['local_compute']}"
)
assert d["local_cache_hit"] == 0, (
f"expected local_cache_hit=0, got {d['local_cache_hit']}"
Expand Down Expand Up @@ -341,15 +341,15 @@ def test_full_decode_gpu_cache_hit_metrics():
print(f"FULL CACHE HIT: {P} tokens, cached={cached}, nixl={expected_nixl}")
print(f" metrics delta: {d}, nixl_bytes_delta={n1 - n0}")
assert len(proxy_text) > 0, "proxy returned empty response"
-assert d["local_cache_hit"] == cached - 1, (
-f"expected local_cache_hit={cached - 1}, got {d['local_cache_hit']}"
+assert d["local_cache_hit"] == cached, (
+f"expected local_cache_hit={cached}, got {d['local_cache_hit']}"
)
assert d["external_kv_transfer"] == expected_nixl, (
f"expected external_kv_transfer={expected_nixl}, "
f"got {d['external_kv_transfer']}"
)
-assert d["local_compute"] == 1, (
-f"expected local_compute=1 (recomputed last token), got {d['local_compute']}"
+assert d["local_compute"] == 0, (
+f"expected local_compute=0, got {d['local_compute']}"
)
assert n1 - n0 > 0, (
f"expected nixl_bytes_transferred to increase (partial NIXL for "
Expand Down Expand Up @@ -383,11 +383,11 @@ def test_partial_decode_gpu_cache_hit_metrics():
f"expected external_kv_transfer={expected_nixl}, "
f"got {d['external_kv_transfer']}"
)
-assert d["local_cache_hit"] == cached - 1, (
-f"expected local_cache_hit={cached - 1}, got {d['local_cache_hit']}"
+assert d["local_cache_hit"] == cached, (
+f"expected local_cache_hit={cached}, got {d['local_cache_hit']}"
)
-assert d["local_compute"] == 1, (
-f"expected local_compute=1 (recomputed last token), got {d['local_compute']}"
+assert d["local_compute"] == 0, (
+f"expected local_compute=0, got {d['local_compute']}"
)
assert n1 - n0 > 0, (
f"expected nixl_bytes_transferred to increase (NIXL for uncached "
Expand Down
Loading