diff --git a/tests/v1/kv_connector/nixl_integration/test_multi_connector_edge_cases.py b/tests/v1/kv_connector/nixl_integration/test_multi_connector_edge_cases.py index 602a7d179162..f109190a4a0a 100644 --- a/tests/v1/kv_connector/nixl_integration/test_multi_connector_edge_cases.py +++ b/tests/v1/kv_connector/nixl_integration/test_multi_connector_edge_cases.py @@ -297,7 +297,7 @@ def test_multi_block_correctness(): def test_cold_decode_no_cache_hit_metrics(): - """Cold decode: external_kv_transfer==P, local_cache_hit==0.""" + """Cold decode: external_kv_transfer==P, local_cache_hit==0, local_compute==0.""" n0 = _fetch_nixl_bytes(DECODE_HOST, DECODE_PORT) m0 = _fetch_decode_metrics() proxy_text, P = _complete(proxy_client, MEDIUM_PROMPT) @@ -312,8 +312,8 @@ def test_cold_decode_no_cache_hit_metrics(): assert d["external_kv_transfer"] == P, ( f"expected external_kv_transfer={P}, got {d['external_kv_transfer']}" ) - assert d["local_compute"] == 1, ( - f"expected local_compute=1, got {d['local_compute']}" + assert d["local_compute"] == 0, ( + f"expected local_compute=0, got {d['local_compute']}" ) assert d["local_cache_hit"] == 0, ( f"expected local_cache_hit=0, got {d['local_cache_hit']}" @@ -341,15 +341,15 @@ def test_full_decode_gpu_cache_hit_metrics(): print(f"FULL CACHE HIT: {P} tokens, cached={cached}, nixl={expected_nixl}") print(f" metrics delta: {d}, nixl_bytes_delta={n1 - n0}") assert len(proxy_text) > 0, "proxy returned empty response" - assert d["local_cache_hit"] == cached - 1, ( - f"expected local_cache_hit={cached - 1}, got {d['local_cache_hit']}" + assert d["local_cache_hit"] == cached, ( + f"expected local_cache_hit={cached}, got {d['local_cache_hit']}" ) assert d["external_kv_transfer"] == expected_nixl, ( f"expected external_kv_transfer={expected_nixl}, " f"got {d['external_kv_transfer']}" ) - assert d["local_compute"] == 1, ( - f"expected local_compute=1 (recomputed last token), got {d['local_compute']}" + assert d["local_compute"] == 0, ( + f"expected local_compute=0, got {d['local_compute']}" ) assert n1 - n0 > 0, ( f"expected nixl_bytes_transferred to increase (partial NIXL for " @@ -383,11 +383,11 @@ def test_partial_decode_gpu_cache_hit_metrics(): f"expected external_kv_transfer={expected_nixl}, " f"got {d['external_kv_transfer']}" ) - assert d["local_cache_hit"] == cached - 1, ( - f"expected local_cache_hit={cached - 1}, got {d['local_cache_hit']}" + assert d["local_cache_hit"] == cached, ( + f"expected local_cache_hit={cached}, got {d['local_cache_hit']}" ) - assert d["local_compute"] == 1, ( - f"expected local_compute=1 (recomputed last token), got {d['local_compute']}" + assert d["local_compute"] == 0, ( + f"expected local_compute=0, got {d['local_compute']}" ) assert n1 - n0 > 0, ( f"expected nixl_bytes_transferred to increase (NIXL for uncached "