Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2a25c4b
Fix
MatthewBonanni Feb 18, 2026
1a40186
Fix
MatthewBonanni Feb 18, 2026
c617228
Fix
MatthewBonanni Feb 18, 2026
6ac74b8
Make _update_block_size_for_backend fault-tolerant
MatthewBonanni Feb 18, 2026
dfed2a1
Workaround by limiting to MLA
MatthewBonanni Feb 18, 2026
4065757
Try-except should no longer be necessary
MatthewBonanni Feb 18, 2026
40d3782
Lazy allocate workspaces
MatthewBonanni Feb 18, 2026
d473952
Call immediately before initializing kv cache
MatthewBonanni Feb 19, 2026
aa9c7c9
Preserve block size validation
MatthewBonanni Feb 19, 2026
819968f
Cleanup
MatthewBonanni Feb 19, 2026
1f5281e
Merge branch 'main' into fix_basic_extra_init
robertgshaw2-redhat Feb 19, 2026
74bb747
Run before executor construction
MatthewBonanni Feb 19, 2026
37c252c
Fix tests that bypass EngineCore
MatthewBonanni Feb 19, 2026
273dec2
Read backend selections from layers
MatthewBonanni Feb 19, 2026
fab3ee5
Move call from executor to worker
MatthewBonanni Feb 19, 2026
d3a0734
Compute chunked prefill workspace size lazily instead of being conservative
MatthewBonanni Feb 19, 2026
e856ffd
Make imports local to prevent circular dependency
MatthewBonanni Feb 19, 2026
ce3fc1c
Fix vllm config context
MatthewBonanni Feb 19, 2026
5111418
Fix chunked local attention
MatthewBonanni Feb 19, 2026
4bac453
Fix config context
MatthewBonanni Feb 19, 2026
e5ac83c
Fix ray executor
MatthewBonanni Feb 19, 2026
99b3b3a
Re-add warning
MatthewBonanni Feb 19, 2026
f7b337a
Clean up
MatthewBonanni Feb 19, 2026
982a892
Comment
MatthewBonanni Feb 19, 2026
9b741db
Clean up
MatthewBonanni Feb 19, 2026
3bb724a
Use min across groups
MatthewBonanni Feb 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion tests/models/multimodal/processing/test_tensor_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from PIL import Image

from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.cache import CacheConfig
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
Expand Down Expand Up @@ -131,7 +132,9 @@ def initialize_dummy_model(
):
temp_file = tempfile.mkstemp()[1]
current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
vllm_config = VllmConfig(
model_config=model_config, cache_config=CacheConfig(block_size=16)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it important that we set the block size for these tests? Basically does this work around some conditional CUDA init issue?

Copy link
Collaborator Author

@MatthewBonanni MatthewBonanni Feb 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, these tests bypass the EngineCore, which is where the update now happens. As an alternative, we could set the default in the CacheConfig constructor, but we'd need to add complexity to CacheConfig to keep track of whether the block size was set by default or by the user (one is overridable, the other is not). I figured this was a valid workaround for these two tests

)
with set_current_vllm_config(vllm_config=vllm_config):
init_distributed_environment(
world_size=1,
Expand Down
9 changes: 7 additions & 2 deletions tests/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,9 @@ def dummy_hf_overrides(
# Kimi uses `num_expert_group` instead of `n_group`.
if n_group is None:
n_group = getattr(text_config, "num_expert_group", None)
# InternS1Pro uses `router_n_groups` instead of `n_group`.
if n_group is None:
n_group = getattr(text_config, "router_n_groups", None)
num_experts = n_group * 2 if n_group is not None else 2

# we use three layers for Gemma-3n to check
Expand Down Expand Up @@ -486,12 +489,14 @@ class DummyConfig:
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if model_arch_config.num_experts > 0:
orig_topk = getattr(text_config, "num_experts_per_tok", 2)
topk = min(orig_topk, 2)
update_dict.update(
{
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_experts_per_tok": topk,
# Kimi uses `num_experts_per_token`.
"num_experts_per_token": 2,
"num_experts_per_token": topk,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
Expand Down
2 changes: 1 addition & 1 deletion tests/v1/spec_decode/test_eagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _create_proposer(
device = current_platform.device_type
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(),
cache_config=CacheConfig(block_size=16),
speculative_config=speculative_config,
device_config=DeviceConfig(device=device),
parallel_config=ParallelConfig(),
Expand Down
4 changes: 2 additions & 2 deletions vllm/config/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ class CacheConfig:
block_size: SkipValidation[int] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens.

This is None until `Platform.check_and_update_config()` sets it based on
the current platform. Always an int by the time the engine starts."""
This is None until the platform sets it. Always an int by the time
the engine starts."""
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
Expand Down
97 changes: 51 additions & 46 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -915,32 +915,6 @@ def has_blocked_weights():
)
current_platform.check_and_update_config(self)

# If DCP, ensure the block size is right.
if self.parallel_config.decode_context_parallel_size > 1:
if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
self.parallel_config.cp_kv_cache_interleave_size
!= self.parallel_config.dcp_kv_cache_interleave_size
):
self.parallel_config.cp_kv_cache_interleave_size = (
self.parallel_config.dcp_kv_cache_interleave_size
)
logger.warning_once(
"cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
"_interleave_size. And dcp-kv-cache-interleave-size will be "
"deprecated when PCP is fully supported."
)
assert (
self.parallel_config.cp_kv_cache_interleave_size
<= self.cache_config.block_size
and self.cache_config.block_size
% self.parallel_config.cp_kv_cache_interleave_size
== 0
), (
f"Block_size({self.cache_config.block_size}) should be greater "
"than or equal to and divisible by cp_kv_cache_interleave_size "
f"({self.parallel_config.cp_kv_cache_interleave_size})."
)

# Do this after all the updates to compilation_config.mode
effective_dp_size = (
self.parallel_config.data_parallel_size
Expand Down Expand Up @@ -1108,26 +1082,6 @@ def has_blocked_weights():
# Default to enable HMA if not explicitly disabled by user or logic above.
self.scheduler_config.disable_hybrid_kv_cache_manager = False

if self.cache_config.mamba_cache_mode == "align":
assert (
self.cache_config.block_size
<= self.scheduler_config.max_num_batched_tokens
), (
"In Mamba cache align mode, block_size "
f"({self.cache_config.block_size}) must be <= "
"max_num_batched_tokens "
f"({self.scheduler_config.max_num_batched_tokens})."
)
if self.scheduler_config.long_prefill_token_threshold > 0:
assert (
self.scheduler_config.long_prefill_token_threshold
>= self.cache_config.block_size
)
assert not self.scheduler_config.disable_chunked_mm_input, (
"Chunked MM input is required because we need the flexibility to "
"schedule a multiple of block_size tokens even if they are in the "
"middle of a mm input"
)
if self.compilation_config.debug_dump_path:
self.compilation_config.debug_dump_path = (
self.compilation_config.debug_dump_path.absolute().expanduser()
Expand Down Expand Up @@ -1488,6 +1442,57 @@ def __str__(self):
f"compilation_config={self.compilation_config!r}"
)

def validate_block_size(self) -> None:
    """Validate block_size against DCP and mamba constraints.

    Runs after ``Platform.update_block_size_for_backend()`` has
    finalised ``cache_config.block_size``, so every check below sees
    a concrete int rather than the initial ``None`` sentinel.
    """
    block_size = self.cache_config.block_size
    assert block_size is not None, (
        "validate_block_size called before block_size was set"
    )

    parallel = self.parallel_config
    scheduler = self.scheduler_config

    # DCP interleave-size compatibility.
    if parallel.decode_context_parallel_size > 1:
        dcp_interleave = parallel.dcp_kv_cache_interleave_size
        if dcp_interleave > 1 and (
            parallel.cp_kv_cache_interleave_size != dcp_interleave
        ):
            # The dcp_* knob takes precedence until PCP is fully supported.
            parallel.cp_kv_cache_interleave_size = dcp_interleave
            logger.warning_once(
                "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
                "_interleave_size. And dcp-kv-cache-interleave-size will be "
                "deprecated when PCP is fully supported."
            )
        cp_interleave = parallel.cp_kv_cache_interleave_size
        assert cp_interleave <= block_size and block_size % cp_interleave == 0, (
            f"Block_size({block_size}) should be greater "
            "than or equal to and divisible by cp_kv_cache_interleave_size "
            f"({parallel.cp_kv_cache_interleave_size})."
        )

    # Mamba cache align-mode constraints.
    if self.cache_config.mamba_cache_mode == "align":
        assert block_size <= scheduler.max_num_batched_tokens, (
            "In Mamba cache align mode, block_size "
            f"({block_size}) must be <= "
            "max_num_batched_tokens "
            f"({scheduler.max_num_batched_tokens})."
        )
        if scheduler.long_prefill_token_threshold > 0:
            assert scheduler.long_prefill_token_threshold >= block_size
        assert not scheduler.disable_chunked_mm_input, (
            "Chunked MM input is required because we need the flexibility "
            "to schedule a multiple of block_size tokens even if they are "
            "in the middle of a mm input"
        )

@model_validator(mode="after")
def validate_mamba_block_size(self) -> "VllmConfig":
if self.model_config is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@
def create_chunked_local_attention_backend(
underlying_attn_backend: AttentionBackend,
attention_chunk_size: int,
block_size: int,
) -> type[AttentionBackend]:
prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"
prefix = f"ChunkedLocalAttention_{attention_chunk_size}_"

underlying_builder = underlying_attn_backend.get_builder_cls()
assert issubclass(underlying_builder, AttentionMetadataBuilder)
Expand All @@ -55,7 +54,9 @@ def build(
fast_build: bool = False,
):
cm, make_virtual_batches_block_table = make_local_attention_virtual_batches(
attention_chunk_size, common_attn_metadata, block_size
attention_chunk_size,
common_attn_metadata,
self.kv_cache_spec.block_size,
)
metadata = super().build(common_prefix_len, cm, fast_build)
metadata.make_virtual_batches_block_table = make_virtual_batches_block_table
Expand Down Expand Up @@ -97,13 +98,13 @@ def __init__(
block_size = cache_config.block_size
else:
kv_cache_dtype = "auto"
block_size = 16
block_size = None

underlying_attn_backend = get_attn_backend(
head_size, dtype, kv_cache_dtype, block_size
)
attn_backend = create_chunked_local_attention_backend(
underlying_attn_backend, attention_chunk_size, block_size
underlying_attn_backend, attention_chunk_size
)

super().__init__(
Expand Down
17 changes: 12 additions & 5 deletions vllm/model_executor/layers/attention/mla_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,17 +407,24 @@ def __init__(
)

# Attributes for forward_impl method
self.chunked_prefill_workspace_size = (
MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
get_current_vllm_config()
)
)
self._vllm_config = get_current_vllm_config()
self._chunked_prefill_workspace_size: int | None = None
self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8(
static=True,
group_shape=GroupShape.PER_TENSOR,
compile_native=True,
)

@property
def chunked_prefill_workspace_size(self) -> int:
    """Chunked-prefill workspace size, computed lazily on first access.

    The value is derived from the stored vLLM config via
    ``MLACommonMetadataBuilder`` and memoized so subsequent reads are
    cheap.
    """
    cached = self._chunked_prefill_workspace_size
    if cached is None:
        cached = MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
            self._vllm_config
        )
        self._chunked_prefill_workspace_size = cached
    return cached

def forward(
self,
q: torch.Tensor,
Expand Down
Loading