From 2a25c4bbea2f99cb45d05a0605b1464a6de74ade Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 10:40:06 -0500
Subject: [PATCH 01/25] Fix H2OVL test registry entry and MiniCPM eagle forward signature

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 tests/models/registry.py                    | 5 ++++-
 vllm/model_executor/models/minicpm_eagle.py | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 16e64ea9e6d8..1a2556d5c944 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -761,7 +761,10 @@ def check_available_online(
         trust_remote_code=True,
         extras={"2b": "h2oai/h2ovl-mississippi-2b"},
         max_transformers_version="4.48",
-        transformers_version_reason={"hf": "HF model is not compatible."},
+        transformers_version_reason={
+            "hf": "HF model is not compatible.",
+            "vllm": "Remote config code is not compatible.",
+        },
     ),
     "HCXVisionForCausalLM": _HfExamplesInfo(
         "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
index e9f1a91bfc4a..4334ad4567fa 100644
--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -360,7 +360,12 @@ def forward(
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        if inputs_embeds is not None:
+            raise NotImplementedError(
+                f"{type(self).__name__} does not support multimodal inputs yet."
+            )
         hidden_states, hidden_states2 = self.model(input_ids, positions, hidden_states)
         hidden_states = hidden_states / self.scale_width
         hidden_states2 = hidden_states2 / self.scale_width

From 1a4018693ddb222ca1d5453d39e898e03a7bc7d3 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 10:46:07 -0500
Subject: [PATCH 02/25] Handle router_n_groups in dummy_hf_overrides

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 tests/models/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/models/utils.py b/tests/models/utils.py
index 4830f18dccf5..d68d711fd1c0 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -457,6 +457,9 @@ def dummy_hf_overrides(
     # Kimi uses `num_expert_group` instead of `n_group`.
     if n_group is None:
         n_group = getattr(text_config, "num_expert_group", None)
+    # InternS1Pro uses `router_n_groups` instead of `n_group`.
+    if n_group is None:
+        n_group = getattr(text_config, "router_n_groups", None)
     num_experts = n_group * 2 if n_group is not None else 2
 
     # we use three layers for Gemma-3n to check

From c617228cdfd159901dcd4e3bf0d2612c9dec3d79 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 10:54:59 -0500
Subject: [PATCH 03/25] Clamp dummy MoE top-k to min(model top-k, 2)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 tests/models/utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/models/utils.py b/tests/models/utils.py
index d68d711fd1c0..8c1fb63d67ac 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -489,12 +489,14 @@ class DummyConfig:
     # Only set MoE related config when the model has MoE layers.
     # Otherwise all models detected as MoE by _get_transformers_backend_cls.
     if model_arch_config.num_experts > 0:
+        orig_topk = getattr(text_config, "num_experts_per_tok", 2)
+        topk = min(orig_topk, 2)
         update_dict.update(
             {
                 "num_experts": num_experts,
-                "num_experts_per_tok": 2,
+                "num_experts_per_tok": topk,
                 # Kimi uses `num_experts_per_token`.
-                "num_experts_per_token": 2,
+                "num_experts_per_token": topk,
                 "num_local_experts": num_experts,
                 # Otherwise there will not be any expert layers
                 "first_k_dense_replace": 0,

From 6ac74b88b39b8d2433f73f62f7ad148fec734946 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 11:04:21 -0500
Subject: [PATCH 04/25] Make _update_block_size_for_backend fault-tolerant

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 tests/models/registry.py |  5 +----
 vllm/platforms/cuda.py   | 20 ++++++++++++++++----
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 1a2556d5c944..16e64ea9e6d8 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -761,10 +761,7 @@ def check_available_online(
         trust_remote_code=True,
         extras={"2b": "h2oai/h2ovl-mississippi-2b"},
         max_transformers_version="4.48",
-        transformers_version_reason={
-            "hf": "HF model is not compatible.",
-            "vllm": "Remote config code is not compatible.",
-        },
+        transformers_version_reason={"hf": "HF model is not compatible."},
     ),
     "HCXVisionForCausalLM": _HfExamplesInfo(
         "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2314d0a8b675..0c3e43ee65bb 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -179,10 +179,22 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         # Skip hybrid (attention+mamba) models — their block_size is
         # managed by HybridAttentionMambaModelConfig
         if model_config is not None and not model_config.is_hybrid:
-            cls._update_block_size_for_backend(
-                vllm_config,
-                user_specified_block_size,
-            )
+            try:
+                cls._update_block_size_for_backend(
+                    vllm_config,
+                    user_specified_block_size,
+                )
+            except Exception:
+                # Some models (e.g. trust_remote_code models with
+                # incompatible transformers versions) may fail here.
+                # Fall back to the default block_size rather than
+                # crashing during config validation.
+                logger.debug(
+                    "Failed to update block size for attention backend, "
+                    "using default block_size=%d.",
+                    cache_config.block_size,
+                    exc_info=True,
+                )
 
         scheduler_config = vllm_config.scheduler_config
         # Note: model_config may be None during testing

From dfed2a16d848acf4b33787d1ea5081b9e1e79790 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 13:38:06 -0500
Subject: [PATCH 05/25] Workaround by limiting to MLA

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/model_executor/models/minicpm_eagle.py | 5 -----
 vllm/platforms/cuda.py                      | 8 +++++++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
index 4334ad4567fa..e9f1a91bfc4a 100644
--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -360,12 +360,7 @@ def forward(
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        if inputs_embeds is not None:
-            raise NotImplementedError(
-                f"{type(self).__name__} does not support multimodal inputs yet."
-            )
         hidden_states, hidden_states2 = self.model(input_ids, positions, hidden_states)
         hidden_states = hidden_states / self.scale_width
         hidden_states2 = hidden_states2 / self.scale_width
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 0c3e43ee65bb..b6ec4748bb93 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -178,7 +178,13 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         # Note: model_config may be None during testing.
         # Skip hybrid (attention+mamba) models — their block_size is
         # managed by HybridAttentionMambaModelConfig
-        if model_config is not None and not model_config.is_hybrid:
+        # TODO(matt): Limiting this to MLA models is a workaround to avoid
+        # CUDA initialization during testing. Fix this and remove the MLA check
+        if (
+            model_config is not None
+            and not model_config.is_hybrid
+            and model_config.use_mla
+        ):
             try:
                 cls._update_block_size_for_backend(
                     vllm_config,

From 406575751f7effc5ea9736051ad6002233f1d38b Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 13:44:19 -0500
Subject: [PATCH 06/25] Try-except should no longer be necessary

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/platforms/cuda.py | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index b6ec4748bb93..746f8282b1ad 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -185,22 +185,10 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             and not model_config.is_hybrid
             and model_config.use_mla
         ):
-            try:
-                cls._update_block_size_for_backend(
-                    vllm_config,
-                    user_specified_block_size,
-                )
-            except Exception:
-                # Some models (e.g. trust_remote_code models with
-                # incompatible transformers versions) may fail here.
-                # Fall back to the default block_size rather than
-                # crashing during config validation.
-                logger.debug(
-                    "Failed to update block size for attention backend, "
-                    "using default block_size=%d.",
-                    cache_config.block_size,
-                    exc_info=True,
-                )
+            cls._update_block_size_for_backend(
+                vllm_config,
+                user_specified_block_size,
+            )
 
         scheduler_config = vllm_config.scheduler_config
         # Note: model_config may be None during testing

From 40d3782bc130263371dd4aa025f827f2997d31ad Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 17:20:17 -0500
Subject: [PATCH 07/25] Lazy allocate workspaces

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/v1/attention/backends/mla/cutlass_mla.py | 12 ++++++++++--
 .../attention/backends/mla/flashinfer_mla.py  | 19 +++++++++++++------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 6d10a9d66e20..851188e0c240 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -95,7 +95,15 @@ def ensure_size(self, attn_metadata: MLACommonMetadata, num_kv_splits: int):
             self._workspace_buf.resize_(workspace_size)
 
 
-g_sm100_workspace = SM100Workspace(128 * 1024 * 1024)  # 128MB
+g_sm100_workspace: SM100Workspace | None = None
+
+
+def _get_sm100_workspace() -> SM100Workspace:
+    global g_sm100_workspace
+    if g_sm100_workspace is None:
+        g_sm100_workspace = SM100Workspace(128 * 1024 * 1024)  # 128MB
+    return g_sm100_workspace
+
 
 MAX_HEADS = 128
 
@@ -159,7 +167,7 @@ def __init__(
             self._num_kv_splits = -1  # => Auto-detect
 
         # Share workspace buffer across all executions
-        self._workspace = g_sm100_workspace
+        self._workspace = _get_sm100_workspace()
 
     def _sm100_cutlass_mla_decode(
         self,
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py
index 58d4bec7c92e..7b28207aa18b 100644
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@@ -94,11 +94,18 @@ def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
         return "HND"
 
 
-g_fi_workspace = torch.zeros(
-    FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
-    dtype=torch.uint8,
-    device="cuda",
-)
+g_fi_workspace: torch.Tensor | None = None
+
+
+def _get_fi_workspace() -> torch.Tensor:
+    global g_fi_workspace
+    if g_fi_workspace is None:
+        g_fi_workspace = torch.zeros(
+            FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
+            dtype=torch.uint8,
+            device="cuda",
+        )
+    return g_fi_workspace
 
 
 class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
@@ -146,7 +153,7 @@ def __init__(
                 "FlashInferMLAImpl"
             )
 
-        self._workspace_buffer = g_fi_workspace
+        self._workspace_buffer = _get_fi_workspace()
         self.bmm1_scale: float | None = None
         self.bmm2_scale: float | None = None
 

From d473952f622ef7e1c9c994938629c2f12c2c2101 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 21:31:26 -0500
Subject: [PATCH 08/25] Call immediately before initializing kv cache

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/config/cache.py                          |  4 +-
 vllm/config/vllm.py                           | 12 +++++-
 vllm/platforms/cuda.py                        | 38 +++++++++----------
 vllm/platforms/interface.py                   | 11 ++++++
 vllm/v1/attention/backends/mla/cutlass_mla.py | 12 +-----
 .../attention/backends/mla/flashinfer_mla.py  | 19 +++-------
 vllm/v1/engine/core.py                        |  3 ++
 7 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 0823b00a351c..313a4577b507 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -41,8 +41,8 @@ class CacheConfig:
     block_size: SkipValidation[int] = None  # type: ignore[assignment]
     """Size of a contiguous cache block in number of tokens.
 
-    This is None until `Platform.check_and_update_config()` sets it based on
-    the current platform. Always an int by the time the engine starts."""
+    This is None until the platform sets it. Always an int by the time
+    the engine starts."""
     gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
     """The fraction of GPU memory to be used for the model executor, which can
     range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index e951e6f2c8aa..9de4f51dd1d3 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -916,7 +916,12 @@ def has_blocked_weights():
         current_platform.check_and_update_config(self)
 
         # If DCP, ensure the block size is right.
-        if self.parallel_config.decode_context_parallel_size > 1:
+        # block_size may still be None here (set later by
+        # Platform.update_block_size_for_backend in EngineCore).
+        if (
+            self.cache_config.block_size is not None
+            and self.parallel_config.decode_context_parallel_size > 1
+        ):
             if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
                 self.parallel_config.cp_kv_cache_interleave_size
                 != self.parallel_config.dcp_kv_cache_interleave_size
@@ -1108,7 +1113,10 @@ def has_blocked_weights():
             # Default to enable HMA if not explicitly disabled by user or logic above.
             self.scheduler_config.disable_hybrid_kv_cache_manager = False
 
-        if self.cache_config.mamba_cache_mode == "align":
+        if (
+            self.cache_config.mamba_cache_mode == "align"
+            and self.cache_config.block_size is not None
+        ):
             assert (
                 self.cache_config.block_size
                 <= self.scheduler_config.max_num_batched_tokens
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 746f8282b1ad..d7c49733dda3 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -169,27 +169,6 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
 
-        cache_config = vllm_config.cache_config
-        user_specified_block_size = cache_config.block_size is not None
-        if not user_specified_block_size:
-            cache_config.block_size = 16
-
-        # Ensure block_size is compatible with the attention backend.
-        # Note: model_config may be None during testing.
-        # Skip hybrid (attention+mamba) models — their block_size is
-        # managed by HybridAttentionMambaModelConfig
-        # TODO(matt): Limiting this to MLA models is a workaround to avoid
-        # CUDA initialization during testing. Fix this and remove the MLA check
-        if (
-            model_config is not None
-            and not model_config.is_hybrid
-            and model_config.use_mla
-        ):
-            cls._update_block_size_for_backend(
-                vllm_config,
-                user_specified_block_size,
-            )
-
         scheduler_config = vllm_config.scheduler_config
         # Note: model_config may be None during testing
         if (
@@ -204,6 +183,23 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             )
             scheduler_config.disable_chunked_mm_input = True
 
+    @classmethod
+    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
+        cache_config = vllm_config.cache_config
+        user_specified_block_size = cache_config.block_size is not None
+        if not user_specified_block_size:
+            cache_config.block_size = 16
+
+        model_config = vllm_config.model_config
+        # Note: model_config may be None during testing.
+        # Skip hybrid (attention+mamba) models — their block_size is
+        # managed by HybridAttentionMambaModelConfig
+        if model_config is not None and not model_config.is_hybrid:
+            cls._update_block_size_for_backend(
+                vllm_config,
+                user_specified_block_size,
+            )
+
     @classmethod
     def _update_block_size_for_backend(
         cls,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 6794c05f5e52..07a73b5fdff9 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -406,6 +406,17 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         """
         pass
 
+    @classmethod
+    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
+        """Ensure block_size is compatible with the attention backend.
+
+        Called from EngineCore after CUDA is initialized and the model
+        executor is created, but before KV caches are allocated.
+        Platforms that need to adjust block_size based on the selected
+        attention backend should override this method.
+        """
+        pass
+
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
         """
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 851188e0c240..6d10a9d66e20 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -95,15 +95,7 @@ def ensure_size(self, attn_metadata: MLACommonMetadata, num_kv_splits: int):
             self._workspace_buf.resize_(workspace_size)
 
 
-g_sm100_workspace: SM100Workspace | None = None
-
-
-def _get_sm100_workspace() -> SM100Workspace:
-    global g_sm100_workspace
-    if g_sm100_workspace is None:
-        g_sm100_workspace = SM100Workspace(128 * 1024 * 1024)  # 128MB
-    return g_sm100_workspace
-
+g_sm100_workspace = SM100Workspace(128 * 1024 * 1024)  # 128MB
 
 MAX_HEADS = 128
 
@@ -167,7 +159,7 @@ def __init__(
             self._num_kv_splits = -1  # => Auto-detect
 
         # Share workspace buffer across all executions
-        self._workspace = _get_sm100_workspace()
+        self._workspace = g_sm100_workspace
 
     def _sm100_cutlass_mla_decode(
         self,
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py
index 7b28207aa18b..58d4bec7c92e 100644
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@@ -94,18 +94,11 @@ def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
         return "HND"
 
 
-g_fi_workspace: torch.Tensor | None = None
-
-
-def _get_fi_workspace() -> torch.Tensor:
-    global g_fi_workspace
-    if g_fi_workspace is None:
-        g_fi_workspace = torch.zeros(
-            FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
-            dtype=torch.uint8,
-            device="cuda",
-        )
-    return g_fi_workspace
+g_fi_workspace = torch.zeros(
+    FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
+    dtype=torch.uint8,
+    device="cuda",
+)
 
 
 class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
@@ -153,7 +146,7 @@ def __init__(
                 "FlashInferMLAImpl"
             )
 
-        self._workspace_buffer = _get_fi_workspace()
+        self._workspace_buffer = g_fi_workspace
         self.bmm1_scale: float | None = None
         self.bmm2_scale: float | None = None
 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 573a31027e7c..5a1170700719 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -23,6 +23,7 @@
 from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.platforms import current_platform
 from vllm.tasks import POOLING_TASKS, SupportedTask
 from vllm.tracing import instrument, maybe_init_worker_tracer
 from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
@@ -110,6 +111,8 @@ def __init__(
 
         self.available_gpu_memory_for_kv_cache = -1
 
+        current_platform.update_block_size_for_backend(vllm_config)
+
         # Setup KV Caches and update CacheConfig after profiling.
         num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
             vllm_config

From aa9c7c9a8b158d459999f1c04aca5179f1a4acfd Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 21:40:16 -0500
Subject: [PATCH 09/25] Preserve block size validation

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/config/vllm.py    | 105 ++++++++++++++++++++---------------------
 vllm/v1/engine/core.py |   4 ++
 2 files changed, 55 insertions(+), 54 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 9de4f51dd1d3..fffe769e7a4b 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -915,37 +915,6 @@ def has_blocked_weights():
             )
         current_platform.check_and_update_config(self)
 
-        # If DCP, ensure the block size is right.
-        # block_size may still be None here (set later by
-        # Platform.update_block_size_for_backend in EngineCore).
-        if (
-            self.cache_config.block_size is not None
-            and self.parallel_config.decode_context_parallel_size > 1
-        ):
-            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
-                self.parallel_config.cp_kv_cache_interleave_size
-                != self.parallel_config.dcp_kv_cache_interleave_size
-            ):
-                self.parallel_config.cp_kv_cache_interleave_size = (
-                    self.parallel_config.dcp_kv_cache_interleave_size
-                )
-                logger.warning_once(
-                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
-                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
-                    "deprecated when PCP is fully supported."
-                )
-            assert (
-                self.parallel_config.cp_kv_cache_interleave_size
-                <= self.cache_config.block_size
-                and self.cache_config.block_size
-                % self.parallel_config.cp_kv_cache_interleave_size
-                == 0
-            ), (
-                f"Block_size({self.cache_config.block_size}) should be greater "
-                "than or equal to and divisible by cp_kv_cache_interleave_size "
-                f"({self.parallel_config.cp_kv_cache_interleave_size})."
-            )
-
         # Do this after all the updates to compilation_config.mode
         effective_dp_size = (
             self.parallel_config.data_parallel_size
@@ -1113,29 +1082,6 @@ def has_blocked_weights():
             # Default to enable HMA if not explicitly disabled by user or logic above.
             self.scheduler_config.disable_hybrid_kv_cache_manager = False
 
-        if (
-            self.cache_config.mamba_cache_mode == "align"
-            and self.cache_config.block_size is not None
-        ):
-            assert (
-                self.cache_config.block_size
-                <= self.scheduler_config.max_num_batched_tokens
-            ), (
-                "In Mamba cache align mode, block_size "
-                f"({self.cache_config.block_size}) must be <= "
-                "max_num_batched_tokens "
-                f"({self.scheduler_config.max_num_batched_tokens})."
-            )
-            if self.scheduler_config.long_prefill_token_threshold > 0:
-                assert (
-                    self.scheduler_config.long_prefill_token_threshold
-                    >= self.cache_config.block_size
-                )
-            assert not self.scheduler_config.disable_chunked_mm_input, (
-                "Chunked MM input is required because we need the flexibility to "
-                "schedule a multiple of block_size tokens even if they are in the "
-                "middle of a mm input"
-            )
         if self.compilation_config.debug_dump_path:
             self.compilation_config.debug_dump_path = (
                 self.compilation_config.debug_dump_path.absolute().expanduser()
@@ -1496,6 +1442,57 @@ def __str__(self):
             f"compilation_config={self.compilation_config!r}"
         )
 
+    def validate_block_size(self) -> None:
+        """Validate block_size against DCP and mamba constraints.
+
+        Called after Platform.update_block_size_for_backend() has
+        finalised block_size, so that the checks see the real value
+        rather than the initial None sentinel.
+        """
+        block_size = self.cache_config.block_size
+        assert block_size is not None, (
+            "validate_block_size called before block_size was set"
+        )
+
+        # DCP interleave-size compatibility
+        if self.parallel_config.decode_context_parallel_size > 1:
+            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
+                self.parallel_config.cp_kv_cache_interleave_size
+                != self.parallel_config.dcp_kv_cache_interleave_size
+            ):
+                self.parallel_config.cp_kv_cache_interleave_size = (
+                    self.parallel_config.dcp_kv_cache_interleave_size
+                )
+                logger.warning_once(
+                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
+                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
+                    "deprecated when PCP is fully supported."
+                )
+            assert (
+                self.parallel_config.cp_kv_cache_interleave_size <= block_size
+                and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0
+            ), (
+                f"Block_size({block_size}) should be greater "
+                "than or equal to and divisible by cp_kv_cache_interleave_size "
+                f"({self.parallel_config.cp_kv_cache_interleave_size})."
+            )
+
+        # Mamba cache align-mode constraints
+        if self.cache_config.mamba_cache_mode == "align":
+            assert block_size <= self.scheduler_config.max_num_batched_tokens, (
+                "In Mamba cache align mode, block_size "
+                f"({block_size}) must be <= "
+                "max_num_batched_tokens "
+                f"({self.scheduler_config.max_num_batched_tokens})."
+            )
+            if self.scheduler_config.long_prefill_token_threshold > 0:
+                assert self.scheduler_config.long_prefill_token_threshold >= block_size
+            assert not self.scheduler_config.disable_chunked_mm_input, (
+                "Chunked MM input is required because we need the flexibility "
+                "to schedule a multiple of block_size tokens even if they are "
+                "in the middle of a mm input"
+            )
+
     @model_validator(mode="after")
     def validate_mamba_block_size(self) -> "VllmConfig":
         if self.model_config is None:
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 5a1170700719..ae267f42eaa9 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -111,7 +111,11 @@ def __init__(
 
         self.available_gpu_memory_for_kv_cache = -1
 
+        # Update block_size for the selected attention backend.
+        # Deferred from check_and_update_config to avoid premature
+        # CUDA initialization in the main process.
         current_platform.update_block_size_for_backend(vllm_config)
+        vllm_config.validate_block_size()
 
         # Setup KV Caches and update CacheConfig after profiling.
         num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(

From 819968f26630ebc96a023ca87bb82e5e1fe1047a Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 21:41:19 -0500
Subject: [PATCH 10/25] Cleanup

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/v1/engine/core.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index ae267f42eaa9..d00a64f17879 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -111,9 +111,6 @@ def __init__(
 
         self.available_gpu_memory_for_kv_cache = -1
 
-        # Update block_size for the selected attention backend.
-        # Deferred from check_and_update_config to avoid premature
-        # CUDA initialization in the main process.
         current_platform.update_block_size_for_backend(vllm_config)
         vllm_config.validate_block_size()
 

From 74bb7470e85e1347c7a7bdd01ec03cfe9abd594f Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 18 Feb 2026 23:08:35 -0500
Subject: [PATCH 11/25] Run before executor construction

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/v1/engine/core.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index d00a64f17879..53e98642167d 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -104,6 +104,9 @@ def __init__(
 
         self.log_stats = log_stats
 
+        current_platform.update_block_size_for_backend(vllm_config)
+        vllm_config.validate_block_size()
+
         # Setup Model.
         self.model_executor = executor_class(vllm_config)
         if executor_fail_callback is not None:
@@ -111,9 +114,6 @@ def __init__(
 
         self.available_gpu_memory_for_kv_cache = -1
 
-        current_platform.update_block_size_for_backend(vllm_config)
-        vllm_config.validate_block_size()
-
         # Setup KV Caches and update CacheConfig after profiling.
         num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
             vllm_config

From 37c252c56918670cc757d315cab83a6821d7a29a Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 08:54:01 -0500
Subject: [PATCH 12/25] Fix tests that bypass EngineCore

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 tests/models/multimodal/processing/test_tensor_schema.py | 5 ++++-
 tests/v1/spec_decode/test_eagle.py                       | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 8f79936478da..c81a8fe09d30 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -13,6 +13,7 @@
 from PIL import Image
 
 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
+from vllm.config.cache import CacheConfig
 from vllm.config.multimodal import (
     AudioDummyOptions,
     BaseDummyOptions,
@@ -131,7 +132,9 @@ def initialize_dummy_model(
 ):
     temp_file = tempfile.mkstemp()[1]
     current_device = torch.get_default_device()
-    vllm_config = VllmConfig(model_config=model_config)
+    vllm_config = VllmConfig(
+        model_config=model_config, cache_config=CacheConfig(block_size=16)
+    )
     with set_current_vllm_config(vllm_config=vllm_config):
         init_distributed_environment(
             world_size=1,
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 8b180168dffc..65e97b7ad5b0 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -78,7 +78,7 @@ def _create_proposer(
     device = current_platform.device_type
     vllm_config = VllmConfig(
         model_config=model_config,
-        cache_config=CacheConfig(),
+        cache_config=CacheConfig(block_size=16),
         speculative_config=speculative_config,
         device_config=DeviceConfig(device=device),
         parallel_config=ParallelConfig(),

From 273dec28987af1d3284464989988326a72b00c2d Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 11:25:36 -0500
Subject: [PATCH 13/25] Read backend selections from layers

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 .../layers/attention/mla_attention.py         |   6 +-
 vllm/platforms/cuda.py                        | 175 +++---------------
 vllm/v1/engine/core.py                        |   6 +-
 vllm/v1/worker/gpu_model_runner.py            |   5 +-
 4 files changed, 35 insertions(+), 157 deletions(-)

diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index 98ff02e9d4ae..1d381ae579c5 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -1262,12 +1262,14 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int:
         scheduler_config = vllm_config.scheduler_config
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
+        # Use 128 as conservative upper bound if not set by user
+        block_size = cache_config.block_size or 128
 
         chunked_prefill_workspace_size = min(
             # Try for 8 full length request or at least 4 pages per-request
             max(
                 8 * model_config.max_model_len,
-                4 * scheduler_config.max_num_seqs * cache_config.block_size,
+                4 * scheduler_config.max_num_seqs * block_size,
             ),
             # For long-context models try not to over-allocate limiting
             # kv-cache space, limiting it to 64k tokens,
@@ -1283,7 +1285,7 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int:
         # Enforce that we enough for at least 1 page per request
         chunked_prefill_workspace_size = max(
             chunked_prefill_workspace_size,
-            scheduler_config.max_num_seqs * cache_config.block_size,
+            scheduler_config.max_num_seqs * block_size,
         )
 
         return chunked_prefill_workspace_size
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index d7c49733dda3..02e9e3b424fe 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -14,7 +14,9 @@
 
 # import custom ops, trigger op registration
 import vllm._C  # noqa
+from vllm.config.vllm import get_layers_from_vllm_config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.utils.import_utils import import_pynvml
 from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
@@ -186,163 +188,36 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
     @classmethod
     def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
         cache_config = vllm_config.cache_config
-        user_specified_block_size = cache_config.block_size is not None
-        if not user_specified_block_size:
-            cache_config.block_size = 16
-
-        model_config = vllm_config.model_config
-        # Note: model_config may be None during testing.
-        # Skip hybrid (attention+mamba) models — their block_size is
-        # managed by HybridAttentionMambaModelConfig
-        if model_config is not None and not model_config.is_hybrid:
-            cls._update_block_size_for_backend(
-                vllm_config,
-                user_specified_block_size,
-            )
-
-    @classmethod
-    def _update_block_size_for_backend(
-        cls,
-        vllm_config: "VllmConfig",
-        user_specified_block_size: bool,
-    ) -> None:
-        """Ensure block_size is compatible with the attention backend.
-
-        If the user specified --block-size, the selector validates/filters
-        backends by that block size (raising on incompatibility). Otherwise,
-        the backend is selected unconstrained and block_size is set to the
-        backend's preferred value.
-        """
-        from vllm.config.vllm import set_current_vllm_config
-        from vllm.v1.attention.selector import AttentionSelectorConfig
+        if cache_config.block_size is not None:
+            # User specified --block-size; keep it.
+            return
 
         model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
-
-        device_capability = cls.get_device_capability()
-        if device_capability is None:
+        # model_config may be None during testing.
+        # Skip hybrid models — their block_size is managed by
+        # HybridAttentionMambaModelConfig.
+        if model_config is None or model_config.is_hybrid:
+            cache_config.block_size = 16
             return
 
-        use_mla = model_config.use_mla
-        attn_selector_config = AttentionSelectorConfig(
-            head_size=model_config.get_head_size(),
-            dtype=model_config.dtype,  # type: ignore[arg-type]
-            kv_cache_dtype=cache_config.cache_dtype,
-            block_size=cache_config.block_size if user_specified_block_size else None,
-            use_mla=use_mla,
-            has_sink=False,
-            use_sparse=use_mla and hasattr(model_config.hf_config, "index_topk"),
-            use_mm_prefix=model_config.is_mm_prefix_lm,
+        attn_layers = get_layers_from_vllm_config(
+            vllm_config,
+            AttentionLayerBase,
         )
+        if not attn_layers:
+            cache_config.block_size = 16
+            return
 
-        user_specified_backend = vllm_config.attention_config.backend
-        num_heads = model_config.get_num_attention_heads(
-            vllm_config.parallel_config,
-        )
-        with set_current_vllm_config(vllm_config):
-            chosen_backend = cls.select_attention_backend(
-                selected_backend=user_specified_backend,
-                attn_selector_config=attn_selector_config,
-                device_capability=device_capability,
-                # Don't raise here — we produce better errors below.
-                raise_on_invalid=False,
-                num_heads=num_heads,
+        first_layer = next(iter(attn_layers.values()))
+        backend_cls = first_layer.get_attn_backend()
+        preferred = backend_cls.get_preferred_block_size(16)
+        if preferred != 16:
+            logger.info(
+                "Setting kv cache block size to %d for %s backend.",
+                preferred,
+                backend_cls.get_name(),
             )
-
-            # If the user's --block-size forced a non-optimal backend,
-            # warn them. Only relevant when the user didn't also specify
-            # --attention-backend (in which case the choice is explicit).
-            if (
-                chosen_backend is not None
-                and user_specified_block_size
-                and user_specified_backend is None
-            ):
-                optimal = cls.select_attention_backend(
-                    selected_backend=None,
-                    attn_selector_config=attn_selector_config._replace(
-                        block_size=None,
-                    ),
-                    device_capability=device_capability,
-                    raise_on_invalid=False,
-                    num_heads=num_heads,
-                )
-                if optimal is not None and optimal != chosen_backend:
-                    logger.warning(
-                        "--block-size %d is not supported by the preferred "
-                        "%s backend. Using %s instead, which may result "
-                        "in reduced performance. Consider removing "
-                        "--block-size to auto-select the optimal "
-                        "block size.",
-                        cache_config.block_size,
-                        optimal.name,
-                        chosen_backend.name,
-                    )
-
-            if chosen_backend is not None:
-                if user_specified_block_size:
-                    # User's block_size is compatible with the chosen
-                    # backend.
-                    return
-                # User didn't specify --block-size, so auto-select the
-                # preferred block size for the chosen backend.
-                try:
-                    backend_class = chosen_backend.get_class()
-                except ImportError:
-                    return  # Will fail later with a better error
-                preferred = backend_class.get_preferred_block_size(
-                    cache_config.block_size,
-                )
-                if cache_config.block_size != preferred:
-                    logger.info(
-                        "Setting kv cache block size to %d for %s backend.",
-                        preferred,
-                        chosen_backend.name,
-                    )
-                    cache_config.block_size = preferred
-                return
-
-            # No valid backend found. If the user didn't constrain the
-            # selection, defer the error to get_attn_backend_cls where
-            # the full config (including per-layer settings) is
-            # available.
-            if not user_specified_block_size:
-                return
-
-            if user_specified_backend is not None:
-                # User specified --block-size and --attention-backend
-                # and they are incompatible.
-                try:
-                    backend_class = user_specified_backend.get_class()
-                    supported = backend_class.get_supported_kernel_block_sizes()
-                except ImportError:
-                    supported = None
-                raise ValueError(
-                    f"User-specified --block-size "
-                    f"{cache_config.block_size} is incompatible with "
-                    f"the specified --attention-backend "
-                    f"{user_specified_backend.name} (supported kernel "
-                    f"block sizes: {supported}). Either remove "
-                    f"--block-size to auto-select, or choose a "
-                    f"compatible value."
-                )
-            else:
-                # User specified --block-size but no backend supports
-                # it.
-                _, invalid_reasons = cls.get_valid_backends(
-                    device_capability=device_capability,
-                    attn_selector_config=attn_selector_config,
-                    num_heads=num_heads,
-                )
-                reasons_str = ", ".join(
-                    f"{b.name}: [{', '.join(r)}]" for b, r in invalid_reasons.items()
-                )
-                raise ValueError(
-                    f"No valid attention backend found for "
-                    f"--block-size {cache_config.block_size}. "
-                    f"Reasons: {{{reasons_str}}}. Either remove "
-                    f"--block-size to auto-select, or choose a "
-                    f"compatible value."
-                )
+        cache_config.block_size = preferred
 
     @classmethod
     def get_current_memory_usage(
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 53e98642167d..edcc5b4ab308 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -104,14 +104,14 @@ def __init__(
 
         self.log_stats = log_stats
 
-        current_platform.update_block_size_for_backend(vllm_config)
-        vllm_config.validate_block_size()
-
         # Setup Model.
         self.model_executor = executor_class(vllm_config)
         if executor_fail_callback is not None:
             self.model_executor.register_failure_callback(executor_fail_callback)
 
+        current_platform.update_block_size_for_backend(vllm_config)
+        vllm_config.validate_block_size()
+
         self.available_gpu_memory_for_kv_cache = -1
 
         # Setup KV Caches and update CacheConfig after profiling.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 41ec062305b5..ba1428c42ee4 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -513,6 +513,7 @@ def __init__(
         custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = (
             tuple(logits_processors) if logits_processors is not None else ()
         )
+        placeholder_block_size = self.cache_config.block_size or 16
         self.input_batch = InputBatch(
             max_num_reqs=self.max_num_reqs,
             # We need to use the encoder length for encoder-decoer
@@ -522,8 +523,8 @@ def __init__(
             device=self.device,
             pin_memory=self.pin_memory,
             vocab_size=self.model_config.get_vocab_size(),
-            block_sizes=[self.cache_config.block_size],
-            kernel_block_sizes=[self.cache_config.block_size],
+            block_sizes=[placeholder_block_size],
+            kernel_block_sizes=[placeholder_block_size],
             is_spec_decode=bool(self.vllm_config.speculative_config),
             logitsprocs=build_logitsprocs(
                 self.vllm_config,

From fab3ee5d237a79544b02da4c0c2a8a4538bd57f8 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 11:31:25 -0500
Subject: [PATCH 14/25] Move call from executor to worker

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/platforms/interface.py            |  8 ++------
 vllm/v1/engine/core.py                 | 13 ++++++++-----
 vllm/v1/executor/multiproc_executor.py |  6 ++++++
 vllm/v1/executor/uniproc_executor.py   |  6 ++++++
 4 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 07a73b5fdff9..ba44fa6d9672 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -408,12 +408,8 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
 
     @classmethod
     def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
-        """Ensure block_size is compatible with the attention backend.
-
-        Called from EngineCore after CUDA is initialized and the model
-        executor is created, but before KV caches are allocated.
-        Platforms that need to adjust block_size based on the selected
-        attention backend should override this method.
+        """
+        Ensure block_size is compatible with the attention backend.
         """
         pass
 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index edcc5b4ab308..85927ce7ed16 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -23,7 +23,6 @@
 from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.platforms import current_platform
 from vllm.tasks import POOLING_TASKS, SupportedTask
 from vllm.tracing import instrument, maybe_init_worker_tracer
 from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
@@ -109,16 +108,20 @@ def __init__(
         if executor_fail_callback is not None:
             self.model_executor.register_failure_callback(executor_fail_callback)
 
-        current_platform.update_block_size_for_backend(vllm_config)
-        vllm_config.validate_block_size()
-
         self.available_gpu_memory_for_kv_cache = -1
 
         # Setup KV Caches and update CacheConfig after profiling.
         num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
             vllm_config
         )
-
+        if kv_cache_config.kv_cache_groups:
+            vllm_config.cache_config.block_size = kv_cache_config.kv_cache_groups[
+                0
+            ].kv_cache_spec.block_size
+        elif vllm_config.cache_config.block_size is None:
+            # Attention-free models (encoder-only, SSM) — use default.
+            vllm_config.cache_config.block_size = 16
+        vllm_config.validate_block_size()
         vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
         vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
         self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks))
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index b63cbd6586f2..fcc096c27399 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -579,6 +579,12 @@ def __init__(
         self._init_message_queues(input_shm_handle, vllm_config)
         self.worker.load_model()
 
+        # Determine block size from the attention backends now that
+        # the model layers are constructed.
+        from vllm.platforms import current_platform
+
+        current_platform.update_block_size_for_backend(vllm_config)
+
         # Enable environment variable cache (e.g. assume no more
         # environment variable overrides after this point)
         enable_envs_cache()
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index b9c7b550170b..ab0f139c25e1 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -47,6 +47,12 @@ def _init_executor(self) -> None:
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
+        # Determine block size from the attention backends now that
+        # the model layers are constructed.
+        from vllm.platforms import current_platform
+
+        current_platform.update_block_size_for_backend(self.vllm_config)
+
     def _distributed_args(self) -> tuple[str, int, int]:
         """Return (distributed_init_method, rank, local_rank)."""
         distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())

From d3a07349d930dfe685a0f7ce2921a71c392d599d Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 11:52:56 -0500
Subject: [PATCH 15/25] Compute chunked prefill workspace size lazily instead
 of being conservative

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 .../layers/attention/mla_attention.py         | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index 1d381ae579c5..c2a9a4f1bb7d 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -407,17 +407,23 @@ def __init__(
         )
 
         # Attributes for forward_impl method
-        self.chunked_prefill_workspace_size = (
-            MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
-                get_current_vllm_config()
-            )
-        )
+        self._chunked_prefill_workspace_size: int | None = None
         self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8(
             static=True,
             group_shape=GroupShape.PER_TENSOR,
             compile_native=True,
         )
 
+    @property
+    def chunked_prefill_workspace_size(self) -> int:
+        if self._chunked_prefill_workspace_size is None:
+            self._chunked_prefill_workspace_size = (
+                MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
+                    get_current_vllm_config()
+                )
+            )
+        return self._chunked_prefill_workspace_size
+
     def forward(
         self,
         q: torch.Tensor,
@@ -1262,8 +1268,7 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int:
         scheduler_config = vllm_config.scheduler_config
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
-        # Use 128 as conservative upper bound if not set by user
-        block_size = cache_config.block_size or 128
+        block_size = cache_config.block_size
 
         chunked_prefill_workspace_size = min(
             # Try for 8 full length request or at least 4 pages per-request

From e856ffd50731ceaa6e071de0a8f8e12bf269f4b8 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 11:54:21 -0500
Subject: [PATCH 16/25] Make imports local to prevent circular dependency

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/platforms/cuda.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 02e9e3b424fe..c467a629fe5a 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -14,9 +14,7 @@
 
 # import custom ops, trigger op registration
 import vllm._C  # noqa
-from vllm.config.vllm import get_layers_from_vllm_config
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.utils.import_utils import import_pynvml
 from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
@@ -200,6 +198,11 @@ def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
             cache_config.block_size = 16
             return
 
+        from vllm.config.vllm import get_layers_from_vllm_config
+        from vllm.model_executor.layers.attention_layer_base import (
+            AttentionLayerBase,
+        )
+
         attn_layers = get_layers_from_vllm_config(
             vllm_config,
             AttentionLayerBase,

From ce3fc1cfbbf46be82a6036c3e3e4aff5b44eb964 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 12:17:05 -0500
Subject: [PATCH 17/25] Fix vllm config context

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/platforms/cuda.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index c467a629fe5a..302a6aa4bcbe 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -198,7 +198,10 @@ def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
             cache_config.block_size = 16
             return
 
-        from vllm.config.vllm import get_layers_from_vllm_config
+        from vllm.config.vllm import (
+            get_layers_from_vllm_config,
+            set_current_vllm_config,
+        )
         from vllm.model_executor.layers.attention_layer_base import (
             AttentionLayerBase,
         )
@@ -213,7 +216,8 @@ def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
 
         first_layer = next(iter(attn_layers.values()))
         backend_cls = first_layer.get_attn_backend()
-        preferred = backend_cls.get_preferred_block_size(16)
+        with set_current_vllm_config(vllm_config):
+            preferred = backend_cls.get_preferred_block_size(16)
         if preferred != 16:
             logger.info(
                 "Setting kv cache block size to %d for %s backend.",

From 511141847b5c56e54ab5f19ee5177b9a5e13a2cf Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 12:27:22 -0500
Subject: [PATCH 18/25] Fix chunked local attention

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 .../layers/attention/chunked_local_attention.py       | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py
index e33733c0cc1f..522981820d6a 100644
--- a/vllm/model_executor/layers/attention/chunked_local_attention.py
+++ b/vllm/model_executor/layers/attention/chunked_local_attention.py
@@ -30,9 +30,8 @@
 def create_chunked_local_attention_backend(
     underlying_attn_backend: AttentionBackend,
     attention_chunk_size: int,
-    block_size: int,
 ) -> type[AttentionBackend]:
-    prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"
+    prefix = f"ChunkedLocalAttention_{attention_chunk_size}_"
 
     underlying_builder = underlying_attn_backend.get_builder_cls()
     assert issubclass(underlying_builder, AttentionMetadataBuilder)
@@ -55,7 +54,9 @@ def build(
             fast_build: bool = False,
         ):
             cm, make_virtual_batches_block_table = make_local_attention_virtual_batches(
-                attention_chunk_size, common_attn_metadata, block_size
+                attention_chunk_size,
+                common_attn_metadata,
+                self.kv_cache_spec.block_size,
             )
             metadata = super().build(common_prefix_len, cm, fast_build)
             metadata.make_virtual_batches_block_table = make_virtual_batches_block_table
@@ -97,13 +98,13 @@ def __init__(
             block_size = cache_config.block_size
         else:
             kv_cache_dtype = "auto"
-            block_size = 16
+            block_size = None
 
         underlying_attn_backend = get_attn_backend(
             head_size, dtype, kv_cache_dtype, block_size
         )
         attn_backend = create_chunked_local_attention_backend(
-            underlying_attn_backend, attention_chunk_size, block_size
+            underlying_attn_backend, attention_chunk_size
         )
 
         super().__init__(

From 4bac453aad44f9403d0f81ca66f05a7cd82312a3 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 13:01:19 -0500
Subject: [PATCH 19/25] Fix config context

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/model_executor/layers/attention/mla_attention.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index c2a9a4f1bb7d..41c32c77be58 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -407,6 +407,7 @@ def __init__(
         )
 
         # Attributes for forward_impl method
+        self._vllm_config = get_current_vllm_config()
         self._chunked_prefill_workspace_size: int | None = None
         self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8(
             static=True,
@@ -419,7 +420,7 @@ def chunked_prefill_workspace_size(self) -> int:
         if self._chunked_prefill_workspace_size is None:
             self._chunked_prefill_workspace_size = (
                 MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
-                    get_current_vllm_config()
+                    self._vllm_config
                 )
             )
         return self._chunked_prefill_workspace_size

From e5ac83c2d29fdfe0f0590376153793ef2500a69f Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 13:33:25 -0500
Subject: [PATCH 20/25] Fix ray executor

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/v1/executor/ray_executor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
index ad51526ae941..6c939a593877 100644
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -385,6 +385,11 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")
 
+        def _update_block_size(worker):
+            current_platform.update_block_size_for_backend(worker.vllm_config)
+
+        self.collective_rpc(_update_block_size)
+
         for pp_rank in range(self.parallel_config.pipeline_parallel_size):
             self.pp_tp_workers.append([])
             for tp_rank in range(self.parallel_config.tensor_parallel_size):

From 99b3b3a189f6f83af6bfcc72981bc5217bcccfe8 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 13:45:28 -0500
Subject: [PATCH 21/25] Re-add warning

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/platforms/cuda.py | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 302a6aa4bcbe..9107dbd133ec 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -242,10 +242,10 @@ def get_valid_backends(
         num_heads: int | None = None,
     ) -> tuple[
         list[tuple["AttentionBackendEnum", int]],
-        dict["AttentionBackendEnum", list[str]],
+        dict["AttentionBackendEnum", tuple[int, list[str]]],
     ]:
         valid_backends_priorities = []
-        invalid_reasons = {}
+        invalid_reasons: dict[AttentionBackendEnum, tuple[int, list[str]]] = {}
 
         backend_priorities = _get_backend_priorities(
             attn_selector_config.use_mla,
@@ -262,7 +262,7 @@ def get_valid_backends(
             except ImportError:
                 invalid_reasons_i = ["ImportError"]
             if invalid_reasons_i:
-                invalid_reasons[backend] = invalid_reasons_i
+                invalid_reasons[backend] = (priority, invalid_reasons_i)
             else:
                 valid_backends_priorities.append((backend, priority))
 
@@ -323,7 +323,7 @@ def select_attention_backend(
                     "{"
                     + ", ".join(
                         f"{backend.name}: [{', '.join(reasons)}]"
-                        for backend, reasons in invalid_reasons.items()
+                        for backend, (_, reasons) in invalid_reasons.items()
                     )
                     + "}"
                 )
@@ -336,7 +336,30 @@ def select_attention_backend(
 
         # Select the one with the highest priority (lowest index).
         sorted_backends = sorted(valid_backends_priorities, key=lambda x: x[1])
-        return sorted_backends[0][0]
+        chosen_backend, chosen_priority = sorted_backends[0]
+
+        # If the user specified --block-size (but not --attention-backend),
+        # check whether that constraint excluded any higher-priority backends.
+        if attn_selector_config.block_size is not None:
+            excluded = [
+                backend
+                for backend, (priority, reasons) in invalid_reasons.items()
+                if priority < chosen_priority
+                and reasons == ["block_size not supported"]
+            ]
+            if excluded:
+                names = ", ".join(b.name for b in excluded)
+                logger.warning(
+                    "--block-size %d excluded higher-priority backend(s) "
+                    "%s. Using %s instead, which may result in reduced "
+                    "performance. Consider removing --block-size to "
+                    "auto-select the optimal block size.",
+                    attn_selector_config.block_size,
+                    names,
+                    chosen_backend.name,
+                )
+
+        return chosen_backend
 
     @classmethod
     def get_attn_backend_cls(
@@ -371,7 +394,7 @@ def get_attn_backend_cls(
                 "{"
                 + ", ".join(
                     f"{backend.name}: [{', '.join(reasons)}]"
-                    for backend, reasons in invalid_reasons.items()
+                    for backend, (_, reasons) in invalid_reasons.items()
                 )
                 + "}"
             )
@@ -383,7 +406,7 @@ def get_attn_backend_cls(
             logger.info_once(
                 "Using %s attention backend out of potential backends: %s",
                 chosen_backend.name,
-                tuple(b[0].name for b in valid_backends_priorities),
+                tuple(backend.name for backend, _ in valid_backends_priorities),
                 scope="local",
             )
 

From f7b337a82fb898a49cc9350e4ee4e273ace546c8 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 13:47:52 -0500
Subject: [PATCH 22/25] Clean up

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/model_executor/layers/attention/mla_attention.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index 41c32c77be58..4fe25b027793 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -1269,13 +1269,12 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int:
         scheduler_config = vllm_config.scheduler_config
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
-        block_size = cache_config.block_size
 
         chunked_prefill_workspace_size = min(
             # Try for 8 full length request or at least 4 pages per-request
             max(
                 8 * model_config.max_model_len,
-                4 * scheduler_config.max_num_seqs * block_size,
+                4 * scheduler_config.max_num_seqs * cache_config.block_size,
             ),
             # For long-context models try not to over-allocate limiting
             # kv-cache space, limiting it to 64k tokens,
@@ -1291,7 +1290,7 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int:
         # Enforce that we enough for at least 1 page per request
         chunked_prefill_workspace_size = max(
             chunked_prefill_workspace_size,
-            scheduler_config.max_num_seqs * block_size,
+            scheduler_config.max_num_seqs * cache_config.block_size,
         )
 
         return chunked_prefill_workspace_size

From 982a8920a0b68e4e25970ee0ed926f2c3d3261c9 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 13:49:26 -0500
Subject: [PATCH 23/25] Comment

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/platforms/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 9107dbd133ec..921054f73e6d 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -339,7 +339,7 @@ def select_attention_backend(
         chosen_backend, chosen_priority = sorted_backends[0]
 
         # If the user specified --block-size (but not --attention-backend),
-        # check whether that constraint excluded any higher-priority backends.
+        # check whether that constraint precluded any higher-priority backends.
         if attn_selector_config.block_size is not None:
             excluded = [
                 backend

From 9b741dbd0cbf127fa252c29b4d6c3c70c5d71fc7 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 13:52:34 -0500
Subject: [PATCH 24/25] Clean up

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/v1/executor/multiproc_executor.py | 6 ++----
 vllm/v1/executor/uniproc_executor.py   | 6 +-----
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index fcc096c27399..9cc7dc63ad8c 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -41,6 +41,7 @@
 )
 from vllm.envs import enable_envs_cache
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.tracing import instrument, maybe_init_worker_tracer
 from vllm.utils.network_utils import (
     get_distributed_init_method,
@@ -579,10 +580,7 @@ def __init__(
         self._init_message_queues(input_shm_handle, vllm_config)
         self.worker.load_model()
 
-        # Determine block size from the attention backends now that
-        # the model layers are constructed.
-        from vllm.platforms import current_platform
-
+        # Set block size from the attention backends now that the model is loaded
         current_platform.update_block_size_for_backend(vllm_config)
 
         # Enable environment variable cache (e.g. assume no more
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index ab0f139c25e1..290c4dc8bbc8 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -12,6 +12,7 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
@@ -46,11 +47,6 @@ def _init_executor(self) -> None:
         self.driver_worker.init_worker(all_kwargs=[kwargs])
         self.driver_worker.init_device()
         self.driver_worker.load_model()
-
-        # Determine block size from the attention backends now that
-        # the model layers are constructed.
-        from vllm.platforms import current_platform
-
         current_platform.update_block_size_for_backend(self.vllm_config)
 
     def _distributed_args(self) -> tuple[str, int, int]:

From 3bb724ac3359b8e727ebc1cc03d29ab53cd83915 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Thu, 19 Feb 2026 15:23:31 -0500
Subject: [PATCH 25/25] Use min across groups

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 vllm/v1/engine/core.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 85927ce7ed16..d7a52b090c07 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -115,9 +115,9 @@ def __init__(
             vllm_config
         )
         if kv_cache_config.kv_cache_groups:
-            vllm_config.cache_config.block_size = kv_cache_config.kv_cache_groups[
-                0
-            ].kv_cache_spec.block_size
+            vllm_config.cache_config.block_size = min(
+                g.kv_cache_spec.block_size for g in kv_cache_config.kv_cache_groups
+            )
         elif vllm_config.cache_config.block_size is None:
             # Attention-free models (encoder-only, SSM) — use default.
             vllm_config.cache_config.block_size = 16