From 2a25c4bbea2f99cb45d05a0605b1464a6de74ade Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 10:40:06 -0500 Subject: [PATCH 01/25] Fix Signed-off-by: Matthew Bonanni --- tests/models/registry.py | 5 ++++- vllm/model_executor/models/minicpm_eagle.py | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 16e64ea9e6d8..1a2556d5c944 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -761,7 +761,10 @@ def check_available_online( trust_remote_code=True, extras={"2b": "h2oai/h2ovl-mississippi-2b"}, max_transformers_version="4.48", - transformers_version_reason={"hf": "HF model is not compatible."}, + transformers_version_reason={ + "hf": "HF model is not compatible.", + "vllm": "Remote config code is not compatible.", + }, ), "HCXVisionForCausalLM": _HfExamplesInfo( "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index e9f1a91bfc4a..4334ad4567fa 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -360,7 +360,12 @@ def forward( input_ids: torch.Tensor, positions: torch.Tensor, hidden_states: torch.Tensor, + inputs_embeds: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: + if inputs_embeds is not None: + raise NotImplementedError( + f"{type(self).__name__} does not support multimodal inputs yet." + ) hidden_states, hidden_states2 = self.model(input_ids, positions, hidden_states) hidden_states = hidden_states / self.scale_width hidden_states2 = hidden_states2 / self.scale_width From 1a4018693ddb222ca1d5453d39e898e03a7bc7d3 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 10:46:07 -0500 Subject: [PATCH 02/25] Fix Signed-off-by: Matthew Bonanni --- tests/models/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/utils.py b/tests/models/utils.py index 4830f18dccf5..d68d711fd1c0 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -457,6 +457,9 @@ def dummy_hf_overrides( # Kimi uses `num_expert_group` instead of `n_group`. if n_group is None: n_group = getattr(text_config, "num_expert_group", None) + # InternS1Pro uses `router_n_groups` instead of `n_group`. + if n_group is None: + n_group = getattr(text_config, "router_n_groups", None) num_experts = n_group * 2 if n_group is not None else 2 # we use three layers for Gemma-3n to check From c617228cdfd159901dcd4e3bf0d2612c9dec3d79 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 10:54:59 -0500 Subject: [PATCH 03/25] Fix Signed-off-by: Matthew Bonanni --- tests/models/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index d68d711fd1c0..8c1fb63d67ac 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -489,12 +489,14 @@ class DummyConfig: # Only set MoE related config when the model has MoE layers. # Otherwise all models detected as MoE by _get_transformers_backend_cls. if model_arch_config.num_experts > 0: + orig_topk = getattr(text_config, "num_experts_per_tok", 2) + topk = min(orig_topk, 2) update_dict.update( { "num_experts": num_experts, - "num_experts_per_tok": 2, + "num_experts_per_tok": topk, # Kimi uses `num_experts_per_token`. - "num_experts_per_token": 2, + "num_experts_per_token": topk, "num_local_experts": num_experts, # Otherwise there will not be any expert layers "first_k_dense_replace": 0, From 6ac74b88b39b8d2433f73f62f7ad148fec734946 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 11:04:21 -0500 Subject: [PATCH 04/25] Make _update_block_size_for_backend fault-tolerant Signed-off-by: Matthew Bonanni --- tests/models/registry.py | 5 +---- vllm/platforms/cuda.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 1a2556d5c944..16e64ea9e6d8 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -761,10 +761,7 @@ def check_available_online( trust_remote_code=True, extras={"2b": "h2oai/h2ovl-mississippi-2b"}, max_transformers_version="4.48", - transformers_version_reason={ - "hf": "HF model is not compatible.", - "vllm": "Remote config code is not compatible.", - }, + transformers_version_reason={"hf": "HF model is not compatible."}, ), "HCXVisionForCausalLM": _HfExamplesInfo( "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2314d0a8b675..0c3e43ee65bb 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -179,10 +179,22 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: # Skip hybrid (attention+mamba) models — their block_size is # managed by HybridAttentionMambaModelConfig if model_config is not None and not model_config.is_hybrid: - cls._update_block_size_for_backend( - vllm_config, - user_specified_block_size, - ) + try: + cls._update_block_size_for_backend( + vllm_config, + user_specified_block_size, + ) + except Exception: + # Some models (e.g. trust_remote_code models with + # incompatible transformers versions) may fail here. + # Fall back to the default block_size rather than + # crashing during config validation. + logger.debug( + "Failed to update block size for attention backend, " + "using default block_size=%d.", + cache_config.block_size, + exc_info=True, + ) scheduler_config = vllm_config.scheduler_config # Note: model_config may be None during testing From dfed2a16d848acf4b33787d1ea5081b9e1e79790 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 13:38:06 -0500 Subject: [PATCH 05/25] Workaround by limiting to MLA Signed-off-by: Matthew Bonanni --- vllm/model_executor/models/minicpm_eagle.py | 5 ----- vllm/platforms/cuda.py | 8 +++++++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 4334ad4567fa..e9f1a91bfc4a 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -360,12 +360,7 @@ def forward( input_ids: torch.Tensor, positions: torch.Tensor, hidden_states: torch.Tensor, - inputs_embeds: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: - if inputs_embeds is not None: - raise NotImplementedError( - f"{type(self).__name__} does not support multimodal inputs yet." - ) hidden_states, hidden_states2 = self.model(input_ids, positions, hidden_states) hidden_states = hidden_states / self.scale_width hidden_states2 = hidden_states2 / self.scale_width diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 0c3e43ee65bb..b6ec4748bb93 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -178,7 +178,13 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: # Note: model_config may be None during testing. # Skip hybrid (attention+mamba) models — their block_size is # managed by HybridAttentionMambaModelConfig - if model_config is not None and not model_config.is_hybrid: + # TODO(matt): Limiting this to MLA models is a workaround to avoid + # CUDA initialization during testing. Fix this and remove the MLA check + if ( + model_config is not None + and not model_config.is_hybrid + and model_config.use_mla + ): try: cls._update_block_size_for_backend( vllm_config, From 406575751f7effc5ea9736051ad6002233f1d38b Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 13:44:19 -0500 Subject: [PATCH 06/25] Try-except should no longer be necessary Signed-off-by: Matthew Bonanni --- vllm/platforms/cuda.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b6ec4748bb93..746f8282b1ad 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -185,22 +185,10 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: and not model_config.is_hybrid and model_config.use_mla ): - try: - cls._update_block_size_for_backend( - vllm_config, - user_specified_block_size, - ) - except Exception: - # Some models (e.g. trust_remote_code models with - # incompatible transformers versions) may fail here. - # Fall back to the default block_size rather than - # crashing during config validation. - logger.debug( - "Failed to update block size for attention backend, " - "using default block_size=%d.", - cache_config.block_size, - exc_info=True, - ) + cls._update_block_size_for_backend( + vllm_config, + user_specified_block_size, + ) scheduler_config = vllm_config.scheduler_config # Note: model_config may be None during testing From 40d3782bc130263371dd4aa025f827f2997d31ad Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 17:20:17 -0500 Subject: [PATCH 07/25] Lazy allocate workspaces Signed-off-by: Matthew Bonanni --- vllm/v1/attention/backends/mla/cutlass_mla.py | 12 ++++++++++-- .../attention/backends/mla/flashinfer_mla.py | 19 +++++++++++++------ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 6d10a9d66e20..851188e0c240 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -95,7 +95,15 @@ def ensure_size(self, attn_metadata: MLACommonMetadata, num_kv_splits: int): self._workspace_buf.resize_(workspace_size) -g_sm100_workspace = SM100Workspace(128 * 1024 * 1024) # 128MB +g_sm100_workspace: SM100Workspace | None = None + + +def _get_sm100_workspace() -> SM100Workspace: + global g_sm100_workspace + if g_sm100_workspace is None: + g_sm100_workspace = SM100Workspace(128 * 1024 * 1024) # 128MB + return g_sm100_workspace + MAX_HEADS = 128 @@ -159,7 +167,7 @@ def __init__( self._num_kv_splits = -1 # => Auto-detect # Share workspace buffer across all executions - self._workspace = g_sm100_workspace + self._workspace = _get_sm100_workspace() def _sm100_cutlass_mla_decode( self, diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index 58d4bec7c92e..7b28207aa18b 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -94,11 +94,18 @@ def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None": return "HND" -g_fi_workspace = torch.zeros( - FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE, - dtype=torch.uint8, - device="cuda", -) +g_fi_workspace: torch.Tensor | None = None + + +def _get_fi_workspace() -> torch.Tensor: + global g_fi_workspace + if g_fi_workspace is None: + g_fi_workspace = torch.zeros( + FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE, + dtype=torch.uint8, + device="cuda", + ) + return g_fi_workspace class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): @@ -146,7 +153,7 @@ def __init__( "FlashInferMLAImpl" ) - self._workspace_buffer = g_fi_workspace + self._workspace_buffer = _get_fi_workspace() self.bmm1_scale: float | None = None self.bmm2_scale: float | None = None From d473952f622ef7e1c9c994938629c2f12c2c2101 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 21:31:26 -0500 Subject: [PATCH 08/25] Call immediately before initializing kv cache Signed-off-by: Matthew Bonanni --- vllm/config/cache.py | 4 +- vllm/config/vllm.py | 12 +++++- vllm/platforms/cuda.py | 38 +++++++++---------- vllm/platforms/interface.py | 11 ++++++ vllm/v1/attention/backends/mla/cutlass_mla.py | 12 +----- .../attention/backends/mla/flashinfer_mla.py | 19 +++------- vllm/v1/engine/core.py | 3 ++ 7 files changed, 51 insertions(+), 48 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 0823b00a351c..313a4577b507 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -41,8 +41,8 @@ class CacheConfig: block_size: SkipValidation[int] = None # type: ignore[assignment] """Size of a contiguous cache block in number of tokens. - This is None until `Platform.check_and_update_config()` sets it based on - the current platform. Always an int by the time the engine starts.""" + This is None until the platform sets it. Always an int by the time + the engine starts.""" gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1) """The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index e951e6f2c8aa..9de4f51dd1d3 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -916,7 +916,12 @@ def has_blocked_weights(): current_platform.check_and_update_config(self) # If DCP, ensure the block size is right. - if self.parallel_config.decode_context_parallel_size > 1: + # block_size may still be None here (set later by + # Platform.update_block_size_for_backend in EngineCore). + if ( + self.cache_config.block_size is not None + and self.parallel_config.decode_context_parallel_size > 1 + ): if self.parallel_config.dcp_kv_cache_interleave_size > 1 and ( self.parallel_config.cp_kv_cache_interleave_size != self.parallel_config.dcp_kv_cache_interleave_size @@ -1108,7 +1113,10 @@ def has_blocked_weights(): # Default to enable HMA if not explicitly disabled by user or logic above. self.scheduler_config.disable_hybrid_kv_cache_manager = False - if self.cache_config.mamba_cache_mode == "align": + if ( + self.cache_config.mamba_cache_mode == "align" + and self.cache_config.block_size is not None + ): assert ( self.cache_config.block_size <= self.scheduler_config.max_num_batched_tokens diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 746f8282b1ad..d7c49733dda3 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -169,27 +169,6 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: if parallel_config.worker_cls == "auto": parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" - cache_config = vllm_config.cache_config - user_specified_block_size = cache_config.block_size is not None - if not user_specified_block_size: - cache_config.block_size = 16 - - # Ensure block_size is compatible with the attention backend. - # Note: model_config may be None during testing. - # Skip hybrid (attention+mamba) models — their block_size is - # managed by HybridAttentionMambaModelConfig - # TODO(matt): Limiting this to MLA models is a workaround to avoid - # CUDA initialization during testing. Fix this and remove the MLA check - if ( - model_config is not None - and not model_config.is_hybrid - and model_config.use_mla - ): - cls._update_block_size_for_backend( - vllm_config, - user_specified_block_size, - ) - scheduler_config = vllm_config.scheduler_config # Note: model_config may be None during testing if ( @@ -204,6 +183,23 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: ) scheduler_config.disable_chunked_mm_input = True + @classmethod + def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None: + cache_config = vllm_config.cache_config + user_specified_block_size = cache_config.block_size is not None + if not user_specified_block_size: + cache_config.block_size = 16 + + model_config = vllm_config.model_config + # Note: model_config may be None during testing. + # Skip hybrid (attention+mamba) models — their block_size is + # managed by HybridAttentionMambaModelConfig + if model_config is not None and not model_config.is_hybrid: + cls._update_block_size_for_backend( + vllm_config, + user_specified_block_size, + ) + @classmethod def _update_block_size_for_backend( cls, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 6794c05f5e52..07a73b5fdff9 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -406,6 +406,17 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: """ pass + @classmethod + def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None: + """Ensure block_size is compatible with the attention backend. + + Called from EngineCore after CUDA is initialized and the model + executor is created, but before KV caches are allocated. + Platforms that need to adjust block_size based on the selected + attention backend should override this method. + """ + pass + @classmethod def verify_model_arch(cls, model_arch: str) -> None: """ diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 851188e0c240..6d10a9d66e20 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -95,15 +95,7 @@ def ensure_size(self, attn_metadata: MLACommonMetadata, num_kv_splits: int): self._workspace_buf.resize_(workspace_size) -g_sm100_workspace: SM100Workspace | None = None - - -def _get_sm100_workspace() -> SM100Workspace: - global g_sm100_workspace - if g_sm100_workspace is None: - g_sm100_workspace = SM100Workspace(128 * 1024 * 1024) # 128MB - return g_sm100_workspace - +g_sm100_workspace = SM100Workspace(128 * 1024 * 1024) # 128MB MAX_HEADS = 128 @@ -167,7 +159,7 @@ def __init__( self._num_kv_splits = -1 # => Auto-detect # Share workspace buffer across all executions - self._workspace = _get_sm100_workspace() + self._workspace = g_sm100_workspace def _sm100_cutlass_mla_decode( self, diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index 7b28207aa18b..58d4bec7c92e 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -94,18 +94,11 @@ def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None": return "HND" -g_fi_workspace: torch.Tensor | None = None - - -def _get_fi_workspace() -> torch.Tensor: - global g_fi_workspace - if g_fi_workspace is None: - g_fi_workspace = torch.zeros( - FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE, - dtype=torch.uint8, - device="cuda", - ) - return g_fi_workspace +g_fi_workspace = torch.zeros( + FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE, + dtype=torch.uint8, + device="cuda", +) class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): @@ -153,7 +146,7 @@ def __init__( "FlashInferMLAImpl" ) - self._workspace_buffer = _get_fi_workspace() + self._workspace_buffer = g_fi_workspace self.bmm1_scale: float | None = None self.bmm2_scale: float | None = None diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 573a31027e7c..5a1170700719 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -23,6 +23,7 @@ from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.platforms import current_platform from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.tracing import instrument, maybe_init_worker_tracer from vllm.transformers_utils.config import maybe_register_config_serialize_by_value @@ -110,6 +111,8 @@ def __init__( self.available_gpu_memory_for_kv_cache = -1 + current_platform.update_block_size_for_backend(vllm_config) + # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( vllm_config From aa9c7c9a8b158d459999f1c04aca5179f1a4acfd Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 21:40:16 -0500 Subject: [PATCH 09/25] Preserve block size validation Signed-off-by: Matthew Bonanni --- vllm/config/vllm.py | 105 ++++++++++++++++++++--------------------- vllm/v1/engine/core.py | 4 ++ 2 files changed, 55 insertions(+), 54 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 9de4f51dd1d3..fffe769e7a4b 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -915,37 +915,6 @@ def has_blocked_weights(): ) current_platform.check_and_update_config(self) - # If DCP, ensure the block size is right. - # block_size may still be None here (set later by - # Platform.update_block_size_for_backend in EngineCore). - if ( - self.cache_config.block_size is not None - and self.parallel_config.decode_context_parallel_size > 1 - ): - if self.parallel_config.dcp_kv_cache_interleave_size > 1 and ( - self.parallel_config.cp_kv_cache_interleave_size - != self.parallel_config.dcp_kv_cache_interleave_size - ): - self.parallel_config.cp_kv_cache_interleave_size = ( - self.parallel_config.dcp_kv_cache_interleave_size - ) - logger.warning_once( - "cp_kv_cache_interleave_size is overridden by dcp_kv_cache" - "_interleave_size. And dcp-kv-cache-interleave-size will be " - "deprecated when PCP is fully supported." - ) - assert ( - self.parallel_config.cp_kv_cache_interleave_size - <= self.cache_config.block_size - and self.cache_config.block_size - % self.parallel_config.cp_kv_cache_interleave_size - == 0 - ), ( - f"Block_size({self.cache_config.block_size}) should be greater " - "than or equal to and divisible by cp_kv_cache_interleave_size " - f"({self.parallel_config.cp_kv_cache_interleave_size})." - ) - # Do this after all the updates to compilation_config.mode effective_dp_size = ( self.parallel_config.data_parallel_size @@ -1113,29 +1082,6 @@ def has_blocked_weights(): # Default to enable HMA if not explicitly disabled by user or logic above. self.scheduler_config.disable_hybrid_kv_cache_manager = False - if ( - self.cache_config.mamba_cache_mode == "align" - and self.cache_config.block_size is not None - ): - assert ( - self.cache_config.block_size - <= self.scheduler_config.max_num_batched_tokens - ), ( - "In Mamba cache align mode, block_size " - f"({self.cache_config.block_size}) must be <= " - "max_num_batched_tokens " - f"({self.scheduler_config.max_num_batched_tokens})." - ) - if self.scheduler_config.long_prefill_token_threshold > 0: - assert ( - self.scheduler_config.long_prefill_token_threshold - >= self.cache_config.block_size - ) - assert not self.scheduler_config.disable_chunked_mm_input, ( - "Chunked MM input is required because we need the flexibility to " - "schedule a multiple of block_size tokens even if they are in the " - "middle of a mm input" - ) if self.compilation_config.debug_dump_path: self.compilation_config.debug_dump_path = ( self.compilation_config.debug_dump_path.absolute().expanduser() @@ -1496,6 +1442,57 @@ def __str__(self): f"compilation_config={self.compilation_config!r}" ) + def validate_block_size(self) -> None: + """Validate block_size against DCP and mamba constraints. + + Called after Platform.update_block_size_for_backend() has + finalised block_size, so that the checks see the real value + rather than the initial None sentinel. + """ + block_size = self.cache_config.block_size + assert block_size is not None, ( + "validate_block_size called before block_size was set" + ) + + # DCP interleave-size compatibility + if self.parallel_config.decode_context_parallel_size > 1: + if self.parallel_config.dcp_kv_cache_interleave_size > 1 and ( + self.parallel_config.cp_kv_cache_interleave_size + != self.parallel_config.dcp_kv_cache_interleave_size + ): + self.parallel_config.cp_kv_cache_interleave_size = ( + self.parallel_config.dcp_kv_cache_interleave_size + ) + logger.warning_once( + "cp_kv_cache_interleave_size is overridden by dcp_kv_cache" + "_interleave_size. And dcp-kv-cache-interleave-size will be " + "deprecated when PCP is fully supported." + ) + assert ( + self.parallel_config.cp_kv_cache_interleave_size <= block_size + and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0 + ), ( + f"Block_size({block_size}) should be greater " + "than or equal to and divisible by cp_kv_cache_interleave_size " + f"({self.parallel_config.cp_kv_cache_interleave_size})." + ) + + # Mamba cache align-mode constraints + if self.cache_config.mamba_cache_mode == "align": + assert block_size <= self.scheduler_config.max_num_batched_tokens, ( + "In Mamba cache align mode, block_size " + f"({block_size}) must be <= " + "max_num_batched_tokens " + f"({self.scheduler_config.max_num_batched_tokens})." + ) + if self.scheduler_config.long_prefill_token_threshold > 0: + assert self.scheduler_config.long_prefill_token_threshold >= block_size + assert not self.scheduler_config.disable_chunked_mm_input, ( + "Chunked MM input is required because we need the flexibility " + "to schedule a multiple of block_size tokens even if they are " + "in the middle of a mm input" + ) + @model_validator(mode="after") def validate_mamba_block_size(self) -> "VllmConfig": if self.model_config is None: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5a1170700719..ae267f42eaa9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -111,7 +111,11 @@ def __init__( self.available_gpu_memory_for_kv_cache = -1 + # Update block_size for the selected attention backend. + # Deferred from check_and_update_config to avoid premature + # CUDA initialization in the main process. current_platform.update_block_size_for_backend(vllm_config) + vllm_config.validate_block_size() # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( From 819968f26630ebc96a023ca87bb82e5e1fe1047a Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 21:41:19 -0500 Subject: [PATCH 10/25] Cleanup Signed-off-by: Matthew Bonanni --- vllm/v1/engine/core.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ae267f42eaa9..d00a64f17879 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -111,9 +111,6 @@ def __init__( self.available_gpu_memory_for_kv_cache = -1 - # Update block_size for the selected attention backend. - # Deferred from check_and_update_config to avoid premature - # CUDA initialization in the main process. current_platform.update_block_size_for_backend(vllm_config) vllm_config.validate_block_size() From 74bb7470e85e1347c7a7bdd01ec03cfe9abd594f Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 18 Feb 2026 23:08:35 -0500 Subject: [PATCH 11/25] Run before executor construction Signed-off-by: Matthew Bonanni --- vllm/v1/engine/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index d00a64f17879..53e98642167d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -104,6 +104,9 @@ def __init__( self.log_stats = log_stats + current_platform.update_block_size_for_backend(vllm_config) + vllm_config.validate_block_size() + # Setup Model. self.model_executor = executor_class(vllm_config) if executor_fail_callback is not None: @@ -111,9 +114,6 @@ def __init__( self.available_gpu_memory_for_kv_cache = -1 - current_platform.update_block_size_for_backend(vllm_config) - vllm_config.validate_block_size() - # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( vllm_config From 37c252c56918670cc757d315cab83a6821d7a29a Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 08:54:01 -0500 Subject: [PATCH 12/25] Fix tests that bypass EngineCore Signed-off-by: Matthew Bonanni --- tests/models/multimodal/processing/test_tensor_schema.py | 5 ++++- tests/v1/spec_decode/test_eagle.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 8f79936478da..c81a8fe09d30 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -13,6 +13,7 @@ from PIL import Image from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config +from vllm.config.cache import CacheConfig from vllm.config.multimodal import ( AudioDummyOptions, BaseDummyOptions, @@ -131,7 +132,9 @@ def initialize_dummy_model( ): temp_file = tempfile.mkstemp()[1] current_device = torch.get_default_device() - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, cache_config=CacheConfig(block_size=16) + ) with set_current_vllm_config(vllm_config=vllm_config): init_distributed_environment( world_size=1, diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 8b180168dffc..65e97b7ad5b0 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -78,7 +78,7 @@ def _create_proposer( device = current_platform.device_type vllm_config = VllmConfig( model_config=model_config, - cache_config=CacheConfig(), + cache_config=CacheConfig(block_size=16), speculative_config=speculative_config, device_config=DeviceConfig(device=device), parallel_config=ParallelConfig(), From 273dec28987af1d3284464989988326a72b00c2d Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 11:25:36 -0500 Subject: [PATCH 13/25] Read backend selections from layers Signed-off-by: Matthew Bonanni --- .../layers/attention/mla_attention.py | 6 +- vllm/platforms/cuda.py | 175 +++--------------- vllm/v1/engine/core.py | 6 +- vllm/v1/worker/gpu_model_runner.py | 5 +- 4 files changed, 35 insertions(+), 157 deletions(-) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 98ff02e9d4ae..1d381ae579c5 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -1262,12 +1262,14 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int: scheduler_config = vllm_config.scheduler_config cache_config = vllm_config.cache_config model_config = vllm_config.model_config + # Use 128 as conservative upper bound if not set by user + block_size = cache_config.block_size or 128 chunked_prefill_workspace_size = min( # Try for 8 full length request or at least 4 pages per-request max( 8 * model_config.max_model_len, - 4 * scheduler_config.max_num_seqs * cache_config.block_size, + 4 * scheduler_config.max_num_seqs * block_size, ), # For long-context models try not to over-allocate limiting # kv-cache space, limiting it to 64k tokens, @@ -1283,7 +1285,7 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int: # Enforce that we enough for at least 1 page per request chunked_prefill_workspace_size = max( chunked_prefill_workspace_size, - scheduler_config.max_num_seqs * cache_config.block_size, + scheduler_config.max_num_seqs * block_size, ) return chunked_prefill_workspace_size diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index d7c49733dda3..02e9e3b424fe 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -14,7 +14,9 @@ # import custom ops, trigger op registration import vllm._C # noqa +from vllm.config.vllm import get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.utils.import_utils import import_pynvml from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.v1.attention.backends.registry import AttentionBackendEnum @@ -186,163 +188,36 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: @classmethod def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None: cache_config = vllm_config.cache_config - user_specified_block_size = cache_config.block_size is not None - if not user_specified_block_size: - cache_config.block_size = 16 - - model_config = vllm_config.model_config - # Note: model_config may be None during testing. - # Skip hybrid (attention+mamba) models — their block_size is - # managed by HybridAttentionMambaModelConfig - if model_config is not None and not model_config.is_hybrid: - cls._update_block_size_for_backend( - vllm_config, - user_specified_block_size, - ) - - @classmethod - def _update_block_size_for_backend( - cls, - vllm_config: "VllmConfig", - user_specified_block_size: bool, - ) -> None: - """Ensure block_size is compatible with the attention backend. - - If the user specified --block-size, the selector validates/filters - backends by that block size (raising on incompatibility). Otherwise, - the backend is selected unconstrained and block_size is set to the - backend's preferred value. - """ - from vllm.config.vllm import set_current_vllm_config - from vllm.v1.attention.selector import AttentionSelectorConfig + if cache_config.block_size is not None: + # User specified --block-size; keep it. + return model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - - device_capability = cls.get_device_capability() - if device_capability is None: + # model_config may be None during testing. + # Skip hybrid models — their block_size is managed by + # HybridAttentionMambaModelConfig. + if model_config is None or model_config.is_hybrid: + cache_config.block_size = 16 return - use_mla = model_config.use_mla - attn_selector_config = AttentionSelectorConfig( - head_size=model_config.get_head_size(), - dtype=model_config.dtype, # type: ignore[arg-type] - kv_cache_dtype=cache_config.cache_dtype, - block_size=cache_config.block_size if user_specified_block_size else None, - use_mla=use_mla, - has_sink=False, - use_sparse=use_mla and hasattr(model_config.hf_config, "index_topk"), - use_mm_prefix=model_config.is_mm_prefix_lm, + attn_layers = get_layers_from_vllm_config( + vllm_config, + AttentionLayerBase, ) + if not attn_layers: + cache_config.block_size = 16 + return - user_specified_backend = vllm_config.attention_config.backend - num_heads = model_config.get_num_attention_heads( - vllm_config.parallel_config, - ) - with set_current_vllm_config(vllm_config): - chosen_backend = cls.select_attention_backend( - selected_backend=user_specified_backend, - attn_selector_config=attn_selector_config, - device_capability=device_capability, - # Don't raise here — we produce better errors below. - raise_on_invalid=False, - num_heads=num_heads, + first_layer = next(iter(attn_layers.values())) + backend_cls = first_layer.get_attn_backend() + preferred = backend_cls.get_preferred_block_size(16) + if preferred != 16: + logger.info( + "Setting kv cache block size to %d for %s backend.", + preferred, + backend_cls.get_name(), ) - - # If the user's --block-size forced a non-optimal backend, - # warn them. Only relevant when the user didn't also specify - # --attention-backend (in which case the choice is explicit). - if ( - chosen_backend is not None - and user_specified_block_size - and user_specified_backend is None - ): - optimal = cls.select_attention_backend( - selected_backend=None, - attn_selector_config=attn_selector_config._replace( - block_size=None, - ), - device_capability=device_capability, - raise_on_invalid=False, - num_heads=num_heads, - ) - if optimal is not None and optimal != chosen_backend: - logger.warning( - "--block-size %d is not supported by the preferred " - "%s backend. Using %s instead, which may result " - "in reduced performance. Consider removing " - "--block-size to auto-select the optimal " - "block size.", - cache_config.block_size, - optimal.name, - chosen_backend.name, - ) - - if chosen_backend is not None: - if user_specified_block_size: - # User's block_size is compatible with the chosen - # backend. - return - # User didn't specify --block-size, so auto-select the - # preferred block size for the chosen backend. - try: - backend_class = chosen_backend.get_class() - except ImportError: - return # Will fail later with a better error - preferred = backend_class.get_preferred_block_size( - cache_config.block_size, - ) - if cache_config.block_size != preferred: - logger.info( - "Setting kv cache block size to %d for %s backend.", - preferred, - chosen_backend.name, - ) - cache_config.block_size = preferred - return - - # No valid backend found. If the user didn't constrain the - # selection, defer the error to get_attn_backend_cls where - # the full config (including per-layer settings) is - # available. - if not user_specified_block_size: - return - - if user_specified_backend is not None: - # User specified --block-size and --attention-backend - # and they are incompatible. - try: - backend_class = user_specified_backend.get_class() - supported = backend_class.get_supported_kernel_block_sizes() - except ImportError: - supported = None - raise ValueError( - f"User-specified --block-size " - f"{cache_config.block_size} is incompatible with " - f"the specified --attention-backend " - f"{user_specified_backend.name} (supported kernel " - f"block sizes: {supported}). Either remove " - f"--block-size to auto-select, or choose a " - f"compatible value." - ) - else: - # User specified --block-size but no backend supports - # it. - _, invalid_reasons = cls.get_valid_backends( - device_capability=device_capability, - attn_selector_config=attn_selector_config, - num_heads=num_heads, - ) - reasons_str = ", ".join( - f"{b.name}: [{', '.join(r)}]" for b, r in invalid_reasons.items() - ) - raise ValueError( - f"No valid attention backend found for " - f"--block-size {cache_config.block_size}. " - f"Reasons: {{{reasons_str}}}. Either remove " - f"--block-size to auto-select, or choose a " - f"compatible value." - ) + cache_config.block_size = preferred @classmethod def get_current_memory_usage( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 53e98642167d..edcc5b4ab308 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -104,14 +104,14 @@ def __init__( self.log_stats = log_stats - current_platform.update_block_size_for_backend(vllm_config) - vllm_config.validate_block_size() - # Setup Model. self.model_executor = executor_class(vllm_config) if executor_fail_callback is not None: self.model_executor.register_failure_callback(executor_fail_callback) + current_platform.update_block_size_for_backend(vllm_config) + vllm_config.validate_block_size() + self.available_gpu_memory_for_kv_cache = -1 # Setup KV Caches and update CacheConfig after profiling. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 41ec062305b5..ba1428c42ee4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -513,6 +513,7 @@ def __init__( custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = ( tuple(logits_processors) if logits_processors is not None else () ) + placeholder_block_size = self.cache_config.block_size or 16 self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, # We need to use the encoder length for encoder-decoer @@ -522,8 +523,8 @@ def __init__( device=self.device, pin_memory=self.pin_memory, vocab_size=self.model_config.get_vocab_size(), - block_sizes=[self.cache_config.block_size], - kernel_block_sizes=[self.cache_config.block_size], + block_sizes=[placeholder_block_size], + kernel_block_sizes=[placeholder_block_size], is_spec_decode=bool(self.vllm_config.speculative_config), logitsprocs=build_logitsprocs( self.vllm_config, From fab3ee5d237a79544b02da4c0c2a8a4538bd57f8 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 11:31:25 -0500 Subject: [PATCH 14/25] Move call from executor to worker Signed-off-by: Matthew Bonanni --- vllm/platforms/interface.py | 8 ++------ vllm/v1/engine/core.py | 13 ++++++++----- vllm/v1/executor/multiproc_executor.py | 6 ++++++ vllm/v1/executor/uniproc_executor.py | 6 ++++++ 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 07a73b5fdff9..ba44fa6d9672 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -408,12 +408,8 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: @classmethod def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None: - """Ensure block_size is compatible with the attention backend. - - Called from EngineCore after CUDA is initialized and the model - executor is created, but before KV caches are allocated. - Platforms that need to adjust block_size based on the selected - attention backend should override this method. + """ + Ensure block_size is compatible with the attention backend. """ pass diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index edcc5b4ab308..85927ce7ed16 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -23,7 +23,6 @@ from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.platforms import current_platform from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.tracing import instrument, maybe_init_worker_tracer from vllm.transformers_utils.config import maybe_register_config_serialize_by_value @@ -109,16 +108,20 @@ def __init__( if executor_fail_callback is not None: self.model_executor.register_failure_callback(executor_fail_callback) - current_platform.update_block_size_for_backend(vllm_config) - vllm_config.validate_block_size() - self.available_gpu_memory_for_kv_cache = -1 # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( vllm_config ) - + if kv_cache_config.kv_cache_groups: + vllm_config.cache_config.block_size = kv_cache_config.kv_cache_groups[ + 0 + ].kv_cache_spec.block_size + elif vllm_config.cache_config.block_size is None: + # Attention-free models (encoder-only, SSM) — use default. + vllm_config.cache_config.block_size = 16 + vllm_config.validate_block_size() vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index b63cbd6586f2..fcc096c27399 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -579,6 +579,12 @@ def __init__( self._init_message_queues(input_shm_handle, vllm_config) self.worker.load_model() + # Determine block size from the attention backends now that + # the model layers are constructed. + from vllm.platforms import current_platform + + current_platform.update_block_size_for_backend(vllm_config) + # Enable environment variable cache (e.g. assume no more # environment variable overrides after this point) enable_envs_cache() diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py index b9c7b550170b..ab0f139c25e1 100644 --- a/vllm/v1/executor/uniproc_executor.py +++ b/vllm/v1/executor/uniproc_executor.py @@ -47,6 +47,12 @@ def _init_executor(self) -> None: self.driver_worker.init_device() self.driver_worker.load_model() + # Determine block size from the attention backends now that + # the model layers are constructed. + from vllm.platforms import current_platform + + current_platform.update_block_size_for_backend(self.vllm_config) + def _distributed_args(self) -> tuple[str, int, int]: """Return (distributed_init_method, rank, local_rank).""" distributed_init_method = get_distributed_init_method(get_ip(), get_open_port()) From d3a07349d930dfe685a0f7ce2921a71c392d599d Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 11:52:56 -0500 Subject: [PATCH 15/25] Compute chunked prefill workspace size lazily instead of being conservative Signed-off-by: Matthew Bonanni --- .../layers/attention/mla_attention.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 1d381ae579c5..c2a9a4f1bb7d 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -407,17 +407,23 @@ def __init__( ) # Attributes for forward_impl method - self.chunked_prefill_workspace_size = ( - MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size( - get_current_vllm_config() - ) - ) + self._chunked_prefill_workspace_size: int | None = None self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8( static=True, group_shape=GroupShape.PER_TENSOR, compile_native=True, ) + @property + def chunked_prefill_workspace_size(self) -> int: + if self._chunked_prefill_workspace_size is None: + self._chunked_prefill_workspace_size = ( + MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size( + get_current_vllm_config() + ) + ) + return self._chunked_prefill_workspace_size + def forward( self, q: torch.Tensor, @@ -1262,8 +1268,7 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int: scheduler_config = vllm_config.scheduler_config cache_config = vllm_config.cache_config model_config = vllm_config.model_config - # Use 128 as conservative upper bound if not set by user - block_size = cache_config.block_size or 128 + block_size = cache_config.block_size chunked_prefill_workspace_size = min( # Try for 8 full length request or at least 4 pages per-request From e856ffd50731ceaa6e071de0a8f8e12bf269f4b8 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 11:54:21 -0500 Subject: [PATCH 16/25] Make imports local to prevent circular dependency Signed-off-by: Matthew Bonanni --- vllm/platforms/cuda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 02e9e3b424fe..c467a629fe5a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -14,9 +14,7 @@ # import custom ops, trigger op registration import vllm._C # noqa -from vllm.config.vllm import get_layers_from_vllm_config from vllm.logger import init_logger -from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.utils.import_utils import import_pynvml from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.v1.attention.backends.registry import AttentionBackendEnum @@ -200,6 +198,11 @@ def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None: cache_config.block_size = 16 return + from vllm.config.vllm import get_layers_from_vllm_config + from vllm.model_executor.layers.attention_layer_base import ( + AttentionLayerBase, + ) + attn_layers = get_layers_from_vllm_config( vllm_config, AttentionLayerBase, From ce3fc1cfbbf46be82a6036c3e3e4aff5b44eb964 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 12:17:05 -0500 Subject: [PATCH 17/25] Fix vllm config context Signed-off-by: Matthew Bonanni --- vllm/platforms/cuda.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c467a629fe5a..302a6aa4bcbe 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -198,7 +198,10 @@ def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None: cache_config.block_size = 16 return - from vllm.config.vllm import get_layers_from_vllm_config + from vllm.config.vllm import ( + get_layers_from_vllm_config, + set_current_vllm_config, + ) from vllm.model_executor.layers.attention_layer_base import ( AttentionLayerBase, ) @@ -213,7 +216,8 @@ def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None: first_layer = next(iter(attn_layers.values())) backend_cls = first_layer.get_attn_backend() - preferred = backend_cls.get_preferred_block_size(16) + with set_current_vllm_config(vllm_config): + preferred = backend_cls.get_preferred_block_size(16) if preferred != 16: logger.info( "Setting kv cache block size to %d for %s backend.", From 511141847b5c56e54ab5f19ee5177b9a5e13a2cf Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 12:27:22 -0500 Subject: [PATCH 18/25] Fix chunked local attention Signed-off-by: Matthew Bonanni --- .../layers/attention/chunked_local_attention.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py index e33733c0cc1f..522981820d6a 100644 --- a/vllm/model_executor/layers/attention/chunked_local_attention.py +++ b/vllm/model_executor/layers/attention/chunked_local_attention.py @@ -30,9 +30,8 @@ def create_chunked_local_attention_backend( underlying_attn_backend: AttentionBackend, attention_chunk_size: int, - block_size: int, ) -> type[AttentionBackend]: - prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_" + prefix = f"ChunkedLocalAttention_{attention_chunk_size}_" underlying_builder = underlying_attn_backend.get_builder_cls() assert issubclass(underlying_builder, AttentionMetadataBuilder) @@ -55,7 +54,9 @@ def build( fast_build: bool = False, ): cm, make_virtual_batches_block_table = make_local_attention_virtual_batches( - attention_chunk_size, common_attn_metadata, block_size + attention_chunk_size, + common_attn_metadata, + self.kv_cache_spec.block_size, ) metadata = super().build(common_prefix_len, cm, fast_build) metadata.make_virtual_batches_block_table = make_virtual_batches_block_table @@ -97,13 +98,13 @@ def __init__( block_size = cache_config.block_size else: kv_cache_dtype = "auto" - block_size = 16 + block_size = None underlying_attn_backend = get_attn_backend( head_size, dtype, kv_cache_dtype, block_size ) attn_backend = create_chunked_local_attention_backend( - underlying_attn_backend, attention_chunk_size, block_size + underlying_attn_backend, attention_chunk_size ) super().__init__( From 4bac453aad44f9403d0f81ca66f05a7cd82312a3 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 13:01:19 -0500 Subject: [PATCH 19/25] Fix config context Signed-off-by: Matthew Bonanni --- vllm/model_executor/layers/attention/mla_attention.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index c2a9a4f1bb7d..41c32c77be58 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -407,6 +407,7 @@ def __init__( ) # Attributes for forward_impl method + self._vllm_config = get_current_vllm_config() self._chunked_prefill_workspace_size: int | None = None self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8( static=True, @@ -419,7 +420,7 @@ def chunked_prefill_workspace_size(self) -> int: if self._chunked_prefill_workspace_size is None: self._chunked_prefill_workspace_size = ( MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size( - get_current_vllm_config() + self._vllm_config ) ) return self._chunked_prefill_workspace_size From e5ac83c2d29fdfe0f0590376153793ef2500a69f Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 13:33:25 -0500 Subject: [PATCH 20/25] Fix ray executor Signed-off-by: Matthew Bonanni --- vllm/v1/executor/ray_executor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index ad51526ae941..6c939a593877 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -385,6 +385,11 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): self.collective_rpc("init_device") self.collective_rpc("load_model") + def _update_block_size(worker): + current_platform.update_block_size_for_backend(worker.vllm_config) + + self.collective_rpc(_update_block_size) + for pp_rank in range(self.parallel_config.pipeline_parallel_size): self.pp_tp_workers.append([]) for tp_rank in range(self.parallel_config.tensor_parallel_size): From 99b3b3a189f6f83af6bfcc72981bc5217bcccfe8 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 13:45:28 -0500 Subject: [PATCH 21/25] Re-add warning Signed-off-by: Matthew Bonanni --- vllm/platforms/cuda.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 302a6aa4bcbe..9107dbd133ec 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -242,10 +242,10 @@ def get_valid_backends( num_heads: int | None = None, ) -> tuple[ list[tuple["AttentionBackendEnum", int]], - dict["AttentionBackendEnum", list[str]], + dict["AttentionBackendEnum", tuple[int, list[str]]], ]: valid_backends_priorities = [] - invalid_reasons = {} + invalid_reasons: dict[AttentionBackendEnum, tuple[int, list[str]]] = {} backend_priorities = _get_backend_priorities( attn_selector_config.use_mla, @@ -262,7 +262,7 @@ def get_valid_backends( except ImportError: invalid_reasons_i = ["ImportError"] if invalid_reasons_i: - invalid_reasons[backend] = invalid_reasons_i + invalid_reasons[backend] = (priority, invalid_reasons_i) else: valid_backends_priorities.append((backend, priority)) @@ -323,7 +323,7 @@ def select_attention_backend( "{" + ", ".join( f"{backend.name}: [{', '.join(reasons)}]" - for backend, reasons in invalid_reasons.items() + for backend, (_, reasons) in invalid_reasons.items() ) + "}" ) @@ -336,7 +336,30 @@ def select_attention_backend( # Select the one with the highest priority (lowest index). sorted_backends = sorted(valid_backends_priorities, key=lambda x: x[1]) - return sorted_backends[0][0] + chosen_backend, chosen_priority = sorted_backends[0] + + # If the user specified --block-size (but not --attention-backend), + # check whether that constraint excluded any higher-priority backends. + if attn_selector_config.block_size is not None: + excluded = [ + backend + for backend, (priority, reasons) in invalid_reasons.items() + if priority < chosen_priority + and reasons == ["block_size not supported"] + ] + if excluded: + names = ", ".join(b.name for b in excluded) + logger.warning( + "--block-size %d excluded higher-priority backend(s) " + "%s. Using %s instead, which may result in reduced " + "performance. Consider removing --block-size to " + "auto-select the optimal block size.", + attn_selector_config.block_size, + names, + chosen_backend.name, + ) + + return chosen_backend @classmethod def get_attn_backend_cls( @@ -371,7 +394,7 @@ def get_attn_backend_cls( "{" + ", ".join( f"{backend.name}: [{', '.join(reasons)}]" - for backend, reasons in invalid_reasons.items() + for backend, (_, reasons) in invalid_reasons.items() ) + "}" ) @@ -383,7 +406,7 @@ def get_attn_backend_cls( logger.info_once( "Using %s attention backend out of potential backends: %s", chosen_backend.name, - tuple(b[0].name for b in valid_backends_priorities), + tuple(backend.name for backend, _ in valid_backends_priorities), scope="local", ) From f7b337a82fb898a49cc9350e4ee4e273ace546c8 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 13:47:52 -0500 Subject: [PATCH 22/25] Clean up Signed-off-by: Matthew Bonanni --- vllm/model_executor/layers/attention/mla_attention.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 41c32c77be58..4fe25b027793 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -1269,13 +1269,12 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int: scheduler_config = vllm_config.scheduler_config cache_config = vllm_config.cache_config model_config = vllm_config.model_config - block_size = cache_config.block_size chunked_prefill_workspace_size = min( # Try for 8 full length request or at least 4 pages per-request max( 8 * model_config.max_model_len, - 4 * scheduler_config.max_num_seqs * block_size, + 4 * scheduler_config.max_num_seqs * cache_config.block_size, ), # For long-context models try not to over-allocate limiting # kv-cache space, limiting it to 64k tokens, @@ -1291,7 +1290,7 @@ def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int: # Enforce that we enough for at least 1 page per request chunked_prefill_workspace_size = max( chunked_prefill_workspace_size, - scheduler_config.max_num_seqs * block_size, + scheduler_config.max_num_seqs * cache_config.block_size, ) return chunked_prefill_workspace_size From 982a8920a0b68e4e25970ee0ed926f2c3d3261c9 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 13:49:26 -0500 Subject: [PATCH 23/25] Comment Signed-off-by: Matthew Bonanni --- vllm/platforms/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 9107dbd133ec..921054f73e6d 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -339,7 +339,7 @@ def select_attention_backend( chosen_backend, chosen_priority = sorted_backends[0] # If the user specified --block-size (but not --attention-backend), - # check whether that constraint excluded any higher-priority backends. + # check whether that constraint precluded any higher-priority backends. if attn_selector_config.block_size is not None: excluded = [ backend From 9b741dbd0cbf127fa252c29b4d6c3c70c5d71fc7 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 13:52:34 -0500 Subject: [PATCH 24/25] Clean up Signed-off-by: Matthew Bonanni --- vllm/v1/executor/multiproc_executor.py | 6 ++---- vllm/v1/executor/uniproc_executor.py | 6 +----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index fcc096c27399..9cc7dc63ad8c 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -41,6 +41,7 @@ ) from vllm.envs import enable_envs_cache from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.tracing import instrument, maybe_init_worker_tracer from vllm.utils.network_utils import ( get_distributed_init_method, @@ -579,10 +580,7 @@ def __init__( self._init_message_queues(input_shm_handle, vllm_config) self.worker.load_model() - # Determine block size from the attention backends now that - # the model layers are constructed. - from vllm.platforms import current_platform - + # Set block size based on the attention backends current_platform.update_block_size_for_backend(vllm_config) # Enable environment variable cache (e.g. assume no more diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py index ab0f139c25e1..290c4dc8bbc8 100644 --- a/vllm/v1/executor/uniproc_executor.py +++ b/vllm/v1/executor/uniproc_executor.py @@ -12,6 +12,7 @@ import vllm.envs as envs from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType @@ -46,11 +47,6 @@ def _init_executor(self) -> None: self.driver_worker.init_worker(all_kwargs=[kwargs]) self.driver_worker.init_device() self.driver_worker.load_model() - - # Determine block size from the attention backends now that - # the model layers are constructed. - from vllm.platforms import current_platform - current_platform.update_block_size_for_backend(self.vllm_config) def _distributed_args(self) -> tuple[str, int, int]: From 3bb724ac3359b8e727ebc1cc03d29ab53cd83915 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 19 Feb 2026 15:23:31 -0500 Subject: [PATCH 25/25] Use min across groups Signed-off-by: Matthew Bonanni --- vllm/v1/engine/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 85927ce7ed16..d7a52b090c07 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -115,9 +115,9 @@ def __init__( vllm_config ) if kv_cache_config.kv_cache_groups: - vllm_config.cache_config.block_size = kv_cache_config.kv_cache_groups[ - 0 - ].kv_cache_spec.block_size + vllm_config.cache_config.block_size = min( + g.kv_cache_spec.block_size for g in kv_cache_config.kv_cache_groups + ) elif vllm_config.cache_config.block_size is None: # Attention-free models (encoder-only, SSM) — use default. vllm_config.cache_config.block_size = 16