vllm-project · vllm-bot · Feb 18, 2026 · Feb 2, 2026 · Feb 2, 2026 · Feb 12, 2026
@@ -19,7 +19,6 @@
 
 logger = init_logger(__name__)
 
-BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]
 CacheDType = Literal[
     "auto",
     "bfloat16",
@@ -39,13 +38,11 @@
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: SkipValidation[BlockSize] = None  # type: ignore[assignment]
-    """Size of a contiguous cache block in number of tokens. On CUDA devices,
-    only block sizes up to 32 are supported.
+    block_size: SkipValidation[int] = None  # type: ignore[assignment]
+    """Size of a contiguous cache block in number of tokens.
 
-    This config has no static default. If left unspecified by the user, it will
-    be set in `Platform.check_and_update_config()` based on the current
-    platform."""
+    This is None until `Platform.check_and_update_config()` sets it based on
+    the current platform. Always an int by the time the engine starts."""
     gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
     """The fraction of GPU memory to be used for the model executor, which can
     range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -59,7 +59,6 @@
     get_attr_docs,
 )
 from vllm.config.cache import (
-    BlockSize,
     CacheDType,
     KVOffloadingBackend,
     MambaCacheMode,
@@ -431,7 +430,7 @@ class EngineArgs:
     max_parallel_loading_workers: int | None = (
         ParallelConfig.max_parallel_loading_workers
     )
-    block_size: BlockSize = CacheConfig.block_size
+    block_size: int = None  # type: ignore[assignment]
     enable_prefix_caching: bool | None = None
     prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo