From d36420d02eecdf8eefda716a12aa7f5f1a22f453 Mon Sep 17 00:00:00 2001
From: zhewenli
Date: Wed, 22 Oct 2025 21:50:06 -0700
Subject: [PATCH 1/3] update tests

Signed-off-by: zhewenli
---
 tests/v1/kv_offload/test_cpu_gpu.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py
index 81b57f1ca0c8..de5cab9bb974 100644
--- a/tests/v1/kv_offload/test_cpu_gpu.py
+++ b/tests/v1/kv_offload/test_cpu_gpu.py
@@ -8,11 +8,26 @@
 
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
-from vllm.v1.attention.backends.flashinfer import FlashInferBackend
-from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
 
+BACKENDS_TO_TEST = [FlashAttentionBackend]
+
+try:
+    from vllm.v1.attention.backends.flashinfer import FlashInferBackend
+
+    BACKENDS_TO_TEST.append(FlashInferBackend)
+except ImportError:
+    pass
+
+try:
+    from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
+
+    BACKENDS_TO_TEST.append(FlashAttnMLABackend)
+except ImportError:
+    pass
+
+
 NUM_GPU_BLOCKS = [64]
 NUM_CPU_BLOCKS = [256]
 GPU_BLOCK_SIZES = [16]
@@ -56,7 +71,7 @@ def test_transfer(
     current_platform.seed_everything(seed)
 
     # create per-layer GPU KV caches
-    attn_backends_list = [FlashAttentionBackend, FlashInferBackend, FlashAttnMLABackend]
+    attn_backends_list = BACKENDS_TO_TEST
 
     gpu_caches = {}
     attn_backends = {}

From 834e042be686547df19cfa26494ffbcef97f4933 Mon Sep 17 00:00:00 2001
From: zhewenli
Date: Wed, 22 Oct 2025 22:22:42 -0700
Subject: [PATCH 2/3] update logic to use current_platform

Signed-off-by: zhewenli
---
 tests/v1/kv_offload/test_cpu_gpu.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py
index de5cab9bb974..3c50456d036e 100644
--- a/tests/v1/kv_offload/test_cpu_gpu.py
+++ b/tests/v1/kv_offload/test_cpu_gpu.py
@@ -13,20 +13,23 @@
 
 BACKENDS_TO_TEST = [FlashAttentionBackend]
 
-try:
+if current_platform.is_cuda():
     from vllm.v1.attention.backends.flashinfer import FlashInferBackend
 
     BACKENDS_TO_TEST.append(FlashInferBackend)
-except ImportError:
-    pass
 
-try:
     from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
 
     BACKENDS_TO_TEST.append(FlashAttnMLABackend)
-except ImportError:
-    pass
 
+if current_platform.is_rocm():
+    from vllm.v1.attention.backends.rocm_attn import RocmAttentionBackend
+
+    BACKENDS_TO_TEST.append(RocmAttentionBackend)
+
+    from vllm.v1.attention.backends.mla.triton_mla import TritonMLABackend
+
+    BACKENDS_TO_TEST.append(TritonMLABackend)
 
 NUM_GPU_BLOCKS = [64]
 NUM_CPU_BLOCKS = [256]
@@ -41,6 +44,10 @@
 NUM_MAPPINGS = [3]
 
 
+@pytest.mark.skipif(
+    len(BACKENDS_TO_TEST) < 2,
+    reason="Need at least 2 backends to test heterogeneous KV cache layouts",
+)
 @pytest.mark.parametrize("gpu_to_cpu", [True, False])
 @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -70,7 +77,6 @@ def test_transfer(
 ) -> None:
     current_platform.seed_everything(seed)
 
-    # create per-layer GPU KV caches
     attn_backends_list = BACKENDS_TO_TEST
 
     gpu_caches = {}

From f8a912310949b7c5651032bd58caa5a53e484b01 Mon Sep 17 00:00:00 2001
From: zhewenli
Date: Wed, 22 Oct 2025 22:37:11 -0700
Subject: [PATCH 3/3] update

Signed-off-by: zhewenli
---
 tests/v1/kv_offload/test_cpu_gpu.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py
index 3c50456d036e..0d4fa344d298 100644
--- a/tests/v1/kv_offload/test_cpu_gpu.py
+++ b/tests/v1/kv_offload/test_cpu_gpu.py
@@ -13,7 +13,7 @@
 
 BACKENDS_TO_TEST = [FlashAttentionBackend]
 
-if current_platform.is_cuda():
+if not current_platform.is_rocm():
     from vllm.v1.attention.backends.flashinfer import FlashInferBackend
 
     BACKENDS_TO_TEST.append(FlashInferBackend)
@@ -22,15 +22,6 @@
 
     BACKENDS_TO_TEST.append(FlashAttnMLABackend)
 
-if current_platform.is_rocm():
-    from vllm.v1.attention.backends.rocm_attn import RocmAttentionBackend
-
-    BACKENDS_TO_TEST.append(RocmAttentionBackend)
-
-    from vllm.v1.attention.backends.mla.triton_mla import TritonMLABackend
-
-    BACKENDS_TO_TEST.append(TritonMLABackend)
-
 NUM_GPU_BLOCKS = [64]
 NUM_CPU_BLOCKS = [256]
 GPU_BLOCK_SIZES = [16]
@@ -44,10 +35,6 @@
 NUM_MAPPINGS = [3]
 
 
-@pytest.mark.skipif(
-    len(BACKENDS_TO_TEST) < 2,
-    reason="Need at least 2 backends to test heterogeneous KV cache layouts",
-)
 @pytest.mark.parametrize("gpu_to_cpu", [True, False])
 @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -77,6 +64,7 @@ def test_transfer(
 ) -> None:
     current_platform.seed_everything(seed)
 
+    # create per-layer GPU KV caches based on available attn_backends
    attn_backends_list = BACKENDS_TO_TEST
 
     gpu_caches = {}
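
For reviewers, a sketch of how the backend-selection block at the top of
tests/v1/kv_offload/test_cpu_gpu.py reads once all three patches are applied,
reconstructed from the hunks above (blank lines, indentation, and the inline
comment are inferred; this is illustrative, not part of the patch):

    from vllm.platforms import current_platform
    from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
    from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
    from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler

    # FlashAttention is always exercised; FlashInfer and FlashAttn MLA are
    # only imported and appended when not running on ROCm.
    BACKENDS_TO_TEST = [FlashAttentionBackend]

    if not current_platform.is_rocm():
        from vllm.v1.attention.backends.flashinfer import FlashInferBackend

        BACKENDS_TO_TEST.append(FlashInferBackend)

        from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend

        BACKENDS_TO_TEST.append(FlashAttnMLABackend)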