diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py
--- a/tests/v1/kv_offload/test_cpu_gpu.py
+++ b/tests/v1/kv_offload/test_cpu_gpu.py
@@ -8,11 +8,42 @@
 
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
-from vllm.v1.attention.backends.flashinfer import FlashInferBackend
-from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
 
+# FlashAttentionBackend is imported unconditionally above; each backend below
+# is appended only if its module imports successfully in this environment.
+BACKENDS_TO_TEST = [FlashAttentionBackend]
+
+try:
+    from vllm.v1.attention.backends.flashinfer import FlashInferBackend
+
+    BACKENDS_TO_TEST.append(FlashInferBackend)
+except ImportError:
+    pass
+
+try:
+    from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend
+
+    BACKENDS_TO_TEST.append(FlashAttnMLABackend)
+except ImportError:
+    pass
+
+if current_platform.is_rocm():
+    try:
+        from vllm.v1.attention.backends.rocm_attn import RocmAttentionBackend
+
+        BACKENDS_TO_TEST.append(RocmAttentionBackend)
+    except ImportError:
+        pass
+
+try:
+    from vllm.v1.attention.backends.triton_attn import TritonAttentionBackend
+
+    BACKENDS_TO_TEST.append(TritonAttentionBackend)
+except ImportError:
+    pass
+
 NUM_GPU_BLOCKS = [64]
 NUM_CPU_BLOCKS = [256]
 GPU_BLOCK_SIZES = [16]
@@ -26,6 +57,10 @@
 NUM_MAPPINGS = [3]
 
 
+@pytest.mark.skipif(
+    len(BACKENDS_TO_TEST) < 2,
+    reason="Need at least 2 backends to test heterogeneous KV cache layouts",
+)
 @pytest.mark.parametrize("gpu_to_cpu", [True, False])
 @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -55,8 +90,8 @@ def test_transfer(
 ) -> None:
     current_platform.seed_everything(seed)
 
     # create per-layer GPU KV caches
-    attn_backends_list = [FlashAttentionBackend, FlashInferBackend, FlashAttnMLABackend]
+    attn_backends_list = BACKENDS_TO_TEST
 
     gpu_caches = {}
     attn_backends = {}