vllm-project · LucasWilkinson · May 16, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
@@ -839,7 +839,7 @@ steps:
   num_gpus: 2
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - vllm/v1/worker/kv_connector_model_runner_mixin.py
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
@@ -866,7 +866,7 @@ steps:
   num_gpus: 4
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
   commands:
@@ -2341,7 +2341,7 @@ steps:
   num_gpus: 4
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
   commands:
@@ -2377,7 +2377,7 @@ steps:
   optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
   commands:
@@ -2391,7 +2391,7 @@ steps:
   num_gpus: 4
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
   commands:
@@ -2405,7 +2405,7 @@ steps:
   num_gpus: 4
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
   commands:
@@ -3353,7 +3353,7 @@ steps:
   optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - vllm/v1/worker/kv_connector_model_runner_mixin.py
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
@@ -3369,7 +3369,7 @@ steps:
   optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
   commands:
@@ -3384,7 +3384,7 @@ steps:
   optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   - vllm/platforms/rocm.py
   commands:

diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py
@@ -11,6 +11,7 @@
 import logging
 import types
 from contextlib import contextmanager
+from math import prod
 
 import numpy as np
 import torch
@@ -30,10 +31,13 @@
 )
 from vllm.v1.attention.backends.utils import (
     CommonAttentionMetadata,
-    get_kv_cache_layout,
-    set_kv_cache_layout,
+    resolve_kv_cache_layout,
+)
+from vllm.v1.kv_cache_interface import (
+    FullAttentionSpec,
+    compute_layer_kv_cache_shape_bytes,
+    reshape_kv_cache,
 )
-from vllm.v1.kv_cache_interface import FullAttentionSpec
 
 # ============================================================================
 # Backend Configuration
@@ -324,52 +328,23 @@ def _create_input_tensors(
 def _create_kv_cache(
     config: BenchmarkConfig,
     max_num_blocks: int,
-    backend_class,
     device: torch.device,
     dtype: torch.dtype,
 ) -> list:
-    """Create KV cache tensors for all layers using the backend's methods.
-
-    Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
-    to create the cache with the correct shape and memory layout.
-    """
-    # Get the logical shape from the backend
-    cache_shape = backend_class.get_kv_cache_shape(
-        num_blocks=max_num_blocks,
+    """Create KV cache tensors for all layers by reshaping one zeroed int8 buffer."""
+    spec = FullAttentionSpec(
         block_size=config.block_size,
         num_kv_heads=config.num_kv_heads,
         head_size=config.head_dim,
+        dtype=dtype,
     )
-
-    # Get the stride order for custom memory layout
-    try:
-        stride_order = backend_class.get_kv_cache_stride_order()
-        assert len(stride_order) == len(cache_shape)
-    except (AttributeError, NotImplementedError):
-        stride_order = tuple(range(len(cache_shape)))
-
-    # Permute shape to physical layout order
-    physical_shape = tuple(cache_shape[i] for i in stride_order)
-
-    # Compute inverse permutation to get back to logical view
-    inv_order = [stride_order.index(i) for i in range(len(stride_order))]
-
-    # Use fp8 dtype for cache when requested.
-    cache_dtype = dtype
-    if config.kv_cache_dtype == "fp8":
-        from vllm.platforms import current_platform
-
-        cache_dtype = current_platform.fp8_dtype()
-
-    cache_list = []
-    for _ in range(config.num_layers):
-        # Allocate in physical layout order (contiguous in memory)
-        cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
-        # Permute to logical view
-        cache = cache.permute(*inv_order)
-        cache_list.append(cache)
-
-    return cache_list
+    layout = resolve_kv_cache_layout()
+    total_bytes = (
+        prod(compute_layer_kv_cache_shape_bytes(spec, max_num_blocks))
+        * config.num_layers
+    )
+    buf = torch.zeros(total_bytes, device=device, dtype=torch.int8)
+    return reshape_kv_cache(buf, spec, max_num_blocks, config.num_layers, layout)
 
 
 # ============================================================================
@@ -514,13 +489,6 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
                 backend_cfg, config, device, dtype
             )
 
-            # Set KV cache layout if the backend requires a specific one
-            # (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
-            required_layout = backend_class.get_required_kv_cache_layout()
-            if required_layout is not None:
-                set_kv_cache_layout(required_layout)
-                get_kv_cache_layout.cache_clear()
-
             common_metadata = _build_common_attn_metadata(
                 q_lens, kv_lens, config.block_size, device
             )
@@ -549,9 +517,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
                 config, total_q, device, dtype, quantize_query=quantize_query
             )
 
-            cache_list = _create_kv_cache(
-                config, max_num_blocks, backend_class, device, dtype
-            )
+            cache_list = _create_kv_cache(config, max_num_blocks, device, dtype)
 
             times, mem_stats = _run_single_benchmark(
                 config,

diff --git a/docs/features/nixl_connector_compatibility.md b/docs/features/nixl_connector_compatibility.md
@@ -59,7 +59,7 @@ th:not(:first-child) {
 
 <sup>1</sup> P and D instances must use the same speculation configuration.
 
-<sup>2</sup> Requires `FLASH_ATTN` or `FLASHINFER` backend **and** `HND` KV cache layout. Enable via `--kv-transfer-config '{"kv_connector_extra_config": {"enable_cross_layers_blocks": "True"}}'`.
+<sup>2</sup> Cross-layer contiguity is achieved by using a `BLHNC` layout (set via `VLLM_KV_CACHE_LAYOUT=BLHNC` or `--enable-cross-layers`).
 
 <sup>3</sup> Supported only when HMA is **not** required (i.e., non-hybrid models). Block IDs are remapped automatically. Only P block size < D block size is supported.
 

diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md
@@ -389,15 +389,6 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
 --kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
 ```
 
-### Cross layers blocks
-
-By default, this feature is disabled. On attention backends that support this feature, each logical block is contiguous in physical memory. This reduces the number of buffers that need to be transferred.
-To enable this feature:
-
-```bash
---kv-transfer-config '{..., "kv_connector_extra_config": {"enable_cross_layers_blocks": "True"}}'
-```
-
 ## Example Scripts/Code
 
 Refer to these example scripts in the vLLM repository:

diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
+from math import prod
 
 import pytest
 import torch._dynamo
@@ -39,7 +40,12 @@
 from vllm.utils.flashinfer import has_flashinfer
 from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
-from vllm.v1.kv_cache_interface import AttentionSpec, get_kv_quant_mode
+from vllm.v1.attention.backends.utils import resolve_kv_cache_layout
+from vllm.v1.kv_cache_interface import (
+    AttentionSpec,
+    compute_layer_kv_cache_shape_bytes,
+    get_kv_quant_mode,
+)
 
 DEVICE_TYPE = current_platform.device_type
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -108,29 +114,28 @@ def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
         max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
         num_blocks = batch_size * max_blocks
 
-        # Fetch the attention backend and kv cache shape and stride order
-        attn_backend = self.attn.attn_backend
-        kv_cache_shape = attn_backend.get_kv_cache_shape(
-            num_blocks, self.block_size, self.num_kv_heads, self.head_size
+        spec = AttentionSpec(
+            block_size=self.block_size,
+            num_kv_heads=self.num_kv_heads,
+            head_size=self.head_size,
+            dtype=self.attn.kv_cache_torch_dtype,
+            kv_quant_mode=get_kv_quant_mode(self.attn.kv_cache_dtype),
         )
-        try:
-            kv_cache_stride_order = attn_backend.get_kv_cache_stride_order()
-        except (AttributeError, NotImplementedError):
-            kv_cache_stride_order = tuple(range(len(kv_cache_shape)))
-
-        kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order)
+        layout = resolve_kv_cache_layout()
+        kv_cache_shape = compute_layer_kv_cache_shape_bytes(spec, num_blocks)
+        kv_cache_stride_order = layout.layer_stride_order
+        physical_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order)
         inv_order = [
             kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order))
         ]
 
-        # Create dummy KV cache
         raw_tensor = torch.zeros(
-            2 * num_blocks * self.block_size * self.num_kv_heads * self.head_size,
-            dtype=self.attn.kv_cache_torch_dtype,
+            prod(kv_cache_shape),
+            dtype=torch.int8,
             device=self.device,
         )
-        raw_tensor = raw_tensor.view(kv_cache_shape)
-        kv_cache = raw_tensor.permute(*inv_order)
+        raw_tensor = raw_tensor.view(physical_shape)
+        kv_cache = raw_tensor.permute(*inv_order).view(self.attn.kv_cache_torch_dtype)
 
         self.attn.kv_cache = kv_cache
 

diff --git a/tests/compile/passes/test_mla_attn_quant_fusion.py b/tests/compile/passes/test_mla_attn_quant_fusion.py
@@ -150,27 +150,14 @@ def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
         max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
         num_blocks = batch_size * max_blocks
 
-        # MLA KV cache is 3D: (num_blocks, block_size, head_size)
-        attn_backend = self.mla_attn.attn_backend
-        kv_cache_shape = attn_backend.get_kv_cache_shape(
-            num_blocks, self.block_size, 1, self.head_size
-        )
-        try:
-            kv_cache_stride_order = attn_backend.get_kv_cache_stride_order()
-        except (AttributeError, NotImplementedError):
-            kv_cache_stride_order = tuple(range(len(kv_cache_shape)))
-
-        ordered_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order)
-        inv_order = [
-            kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order))
-        ]
-
-        raw_tensor = torch.zeros(
-            ordered_shape, dtype=self.kv_cache_dtype, device=self.device
+        # MLA KV cache is 4D: (num_blocks, num_heads=1, block_size, head_size)
+        kv_cache = torch.zeros(
+            (num_blocks, 1, self.block_size, self.head_size),
+            dtype=self.kv_cache_dtype,
+            device=self.device,
         )
-        kv_cache = raw_tensor.permute(*inv_order)
 
-        self.mla_attn.kv_cache = kv_cache
+        self.mla_attn.bind_kv_cache(kv_cache)
 
         self.attn_metadata = self.builder.build(
             common_prefix_len=0, common_attn_metadata=common_attn_metadata

diff --git a/tests/compile/passes/test_mla_rope_kvcache_cat_fusion.py b/tests/compile/passes/test_mla_rope_kvcache_cat_fusion.py
@@ -165,29 +165,15 @@ def build_attn_metadata(self, batch_size: int) -> CommonAttentionMetadata:
         max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
         num_blocks = batch_size * max_blocks
 
-        # Fetch the attention backend and kv cache shape and stride order
-        kv_cache_shape = self.attn_backend.get_kv_cache_shape(
-            num_blocks, self.block_size, self.num_kv_heads, self.head_size
-        )
-        try:
-            kv_cache_stride_order = self.attn_backend.get_kv_cache_stride_order()
-        except (AttributeError, NotImplementedError):
-            kv_cache_stride_order = tuple(range(len(kv_cache_shape)))
-
-        kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order)
-        inv_order = [
-            kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order))
-        ]
-
-        raw_tensor = torch.zeros(
-            num_blocks * self.block_size * self.num_kv_heads * self.head_size,
+        # MLA uses a 4D KV cache: (num_blocks, num_heads=1, block_size, head_size).
+        kv_cache_shape = (num_blocks, 1, self.block_size, self.head_size)
+        kv_cache = torch.zeros(
+            kv_cache_shape,
             dtype=self.kv_cache_dtype,
             device=self.device,
         )
-        raw_tensor = raw_tensor.view(kv_cache_shape)
-        kv_cache = raw_tensor.permute(*inv_order)
 
-        self.mla_attn.kv_cache = kv_cache
+        self.mla_attn.bind_kv_cache(kv_cache)
 
         # Build attn metadata
         attn_metadata = self.builder.build(