Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2a25c4b
Fix
MatthewBonanni Feb 18, 2026
1a40186
Fix
MatthewBonanni Feb 18, 2026
c617228
Fix
MatthewBonanni Feb 18, 2026
6ac74b8
Make _update_block_size_for_backend fault-tolerant
MatthewBonanni Feb 18, 2026
dfed2a1
Workaround by limiting to MLA
MatthewBonanni Feb 18, 2026
4065757
Try-except should no longer be necessary
MatthewBonanni Feb 18, 2026
40d3782
Lazy allocate workspaces
MatthewBonanni Feb 18, 2026
d473952
Call immediately before initializing kv cache
MatthewBonanni Feb 19, 2026
aa9c7c9
Preserve block size validation
MatthewBonanni Feb 19, 2026
819968f
Cleanup
MatthewBonanni Feb 19, 2026
1f5281e
Merge branch 'main' into fix_basic_extra_init
robertgshaw2-redhat Feb 19, 2026
74bb747
Run before executor construction
MatthewBonanni Feb 19, 2026
37c252c
Fix tests that bypass EngineCore
MatthewBonanni Feb 19, 2026
273dec2
Read backend selections from layers
MatthewBonanni Feb 19, 2026
fab3ee5
Move call from executor to worker
MatthewBonanni Feb 19, 2026
d3a0734
Compute chunked prefill workspace size lazily instead of being conservative
MatthewBonanni Feb 19, 2026
e856ffd
Make imports local to prevent circular dependency
MatthewBonanni Feb 19, 2026
ce3fc1c
Fix vllm config context
MatthewBonanni Feb 19, 2026
5111418
Fix chunked local attention
MatthewBonanni Feb 19, 2026
4bac453
Fix config context
MatthewBonanni Feb 19, 2026
e5ac83c
Fix ray executor
MatthewBonanni Feb 19, 2026
99b3b3a
Re-add warning
MatthewBonanni Feb 19, 2026
f7b337a
Clean up
MatthewBonanni Feb 19, 2026
982a892
Comment
MatthewBonanni Feb 19, 2026
9b741db
Clean up
MatthewBonanni Feb 19, 2026
3bb724a
Use min across groups
MatthewBonanni Feb 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion tests/models/multimodal/processing/test_tensor_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from PIL import Image

from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.cache import CacheConfig
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
Expand Down Expand Up @@ -131,7 +132,9 @@ def initialize_dummy_model(
):
temp_file = tempfile.mkstemp()[1]
current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
vllm_config = VllmConfig(
model_config=model_config, cache_config=CacheConfig(block_size=16)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it important that we set the block size for these tests? Basically does this work around some conditional CUDA init issue?

Copy link
Collaborator Author

@MatthewBonanni MatthewBonanni Feb 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, these tests bypass the EngineCore, which is where the update now happens. As an alternative, we could set the default in the CacheConfig constructor, but we'd need to add complexity to CacheConfig to keep track of whether the block size was set by default or by the user (one is overridable, the other is not). I figured this was a valid workaround for these two tests

)
with set_current_vllm_config(vllm_config=vllm_config):
init_distributed_environment(
world_size=1,
Expand Down
9 changes: 7 additions & 2 deletions tests/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,9 @@ def dummy_hf_overrides(
# Kimi uses `num_expert_group` instead of `n_group`.
if n_group is None:
n_group = getattr(text_config, "num_expert_group", None)
# InternS1Pro uses `router_n_groups` instead of `n_group`.
if n_group is None:
n_group = getattr(text_config, "router_n_groups", None)
num_experts = n_group * 2 if n_group is not None else 2

# we use three layers for Gemma-3n to check
Expand Down Expand Up @@ -486,12 +489,14 @@ class DummyConfig:
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if model_arch_config.num_experts > 0:
orig_topk = getattr(text_config, "num_experts_per_tok", 2)
topk = min(orig_topk, 2)
update_dict.update(
{
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_experts_per_tok": topk,
# Kimi uses `num_experts_per_token`.
"num_experts_per_token": 2,
"num_experts_per_token": topk,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
Expand Down
2 changes: 1 addition & 1 deletion tests/v1/spec_decode/test_eagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _create_proposer(
device = current_platform.device_type
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(),
cache_config=CacheConfig(block_size=16),
speculative_config=speculative_config,
device_config=DeviceConfig(device=device),
parallel_config=ParallelConfig(),
Expand Down
4 changes: 2 additions & 2 deletions vllm/config/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ class CacheConfig:
block_size: SkipValidation[int] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens.

This is None until `Platform.check_and_update_config()` sets it based on
the current platform. Always an int by the time the engine starts."""
This is None until the platform sets it. Always an int by the time
the engine starts."""
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
Expand Down
97 changes: 51 additions & 46 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -915,32 +915,6 @@ def has_blocked_weights():
)
current_platform.check_and_update_config(self)

# If DCP, ensure the block size is right.
if self.parallel_config.decode_context_parallel_size > 1:
if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
self.parallel_config.cp_kv_cache_interleave_size
!= self.parallel_config.dcp_kv_cache_interleave_size
):
self.parallel_config.cp_kv_cache_interleave_size = (
self.parallel_config.dcp_kv_cache_interleave_size
)
logger.warning_once(
"cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
"_interleave_size. And dcp-kv-cache-interleave-size will be "
"deprecated when PCP is fully supported."
)
assert (
self.parallel_config.cp_kv_cache_interleave_size
<= self.cache_config.block_size
and self.cache_config.block_size
% self.parallel_config.cp_kv_cache_interleave_size
== 0
), (
f"Block_size({self.cache_config.block_size}) should be greater "
"than or equal to and divisible by cp_kv_cache_interleave_size "
f"({self.parallel_config.cp_kv_cache_interleave_size})."
)

# Do this after all the updates to compilation_config.mode
effective_dp_size = (
self.parallel_config.data_parallel_size
Expand Down Expand Up @@ -1108,26 +1082,6 @@ def has_blocked_weights():
# Default to enable HMA if not explicitly disabled by user or logic above.
self.scheduler_config.disable_hybrid_kv_cache_manager = False

if self.cache_config.mamba_cache_mode == "align":
assert (
self.cache_config.block_size
<= self.scheduler_config.max_num_batched_tokens
), (
"In Mamba cache align mode, block_size "
f"({self.cache_config.block_size}) must be <= "
"max_num_batched_tokens "
f"({self.scheduler_config.max_num_batched_tokens})."
)
if self.scheduler_config.long_prefill_token_threshold > 0:
assert (
self.scheduler_config.long_prefill_token_threshold
>= self.cache_config.block_size
)
assert not self.scheduler_config.disable_chunked_mm_input, (
"Chunked MM input is required because we need the flexibility to "
"schedule a multiple of block_size tokens even if they are in the "
"middle of a mm input"
)
if self.compilation_config.debug_dump_path:
self.compilation_config.debug_dump_path = (
self.compilation_config.debug_dump_path.absolute().expanduser()
Expand Down Expand Up @@ -1488,6 +1442,57 @@ def __str__(self):
f"compilation_config={self.compilation_config!r}"
)

def validate_block_size(self) -> None:
    """Validate block_size against DCP and mamba constraints.

    Runs after ``Platform.update_block_size_for_backend()`` has
    finalised ``cache_config.block_size``, so every check below sees
    a concrete int rather than the initial ``None`` sentinel.
    """
    block_size = self.cache_config.block_size
    assert block_size is not None, (
        "validate_block_size called before block_size was set"
    )

    parallel = self.parallel_config
    scheduler = self.scheduler_config

    # DCP interleave-size compatibility.
    if parallel.decode_context_parallel_size > 1:
        dcp_interleave = parallel.dcp_kv_cache_interleave_size
        if dcp_interleave > 1 and (
            parallel.cp_kv_cache_interleave_size != dcp_interleave
        ):
            # The dcp_* knob takes precedence until PCP is fully supported.
            parallel.cp_kv_cache_interleave_size = dcp_interleave
            logger.warning_once(
                "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
                "_interleave_size. And dcp-kv-cache-interleave-size will be "
                "deprecated when PCP is fully supported."
            )
        cp_interleave = parallel.cp_kv_cache_interleave_size
        assert cp_interleave <= block_size and block_size % cp_interleave == 0, (
            f"Block_size({block_size}) should be greater "
            "than or equal to and divisible by cp_kv_cache_interleave_size "
            f"({parallel.cp_kv_cache_interleave_size})."
        )

    # Mamba cache align-mode constraints.
    if self.cache_config.mamba_cache_mode == "align":
        assert block_size <= scheduler.max_num_batched_tokens, (
            "In Mamba cache align mode, block_size "
            f"({block_size}) must be <= "
            "max_num_batched_tokens "
            f"({scheduler.max_num_batched_tokens})."
        )
        if scheduler.long_prefill_token_threshold > 0:
            assert scheduler.long_prefill_token_threshold >= block_size
        assert not scheduler.disable_chunked_mm_input, (
            "Chunked MM input is required because we need the flexibility "
            "to schedule a multiple of block_size tokens even if they are "
            "in the middle of a mm input"
        )

@model_validator(mode="after")
def validate_mamba_block_size(self) -> "VllmConfig":
if self.model_config is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@
def create_chunked_local_attention_backend(
underlying_attn_backend: AttentionBackend,
attention_chunk_size: int,
block_size: int,
) -> type[AttentionBackend]:
prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"
prefix = f"ChunkedLocalAttention_{attention_chunk_size}_"

underlying_builder = underlying_attn_backend.get_builder_cls()
assert issubclass(underlying_builder, AttentionMetadataBuilder)
Expand All @@ -55,7 +54,9 @@ def build(
fast_build: bool = False,
):
cm, make_virtual_batches_block_table = make_local_attention_virtual_batches(
attention_chunk_size, common_attn_metadata, block_size
attention_chunk_size,
common_attn_metadata,
self.kv_cache_spec.block_size,
)
metadata = super().build(common_prefix_len, cm, fast_build)
metadata.make_virtual_batches_block_table = make_virtual_batches_block_table
Expand Down Expand Up @@ -97,13 +98,13 @@ def __init__(
block_size = cache_config.block_size
else:
kv_cache_dtype = "auto"
block_size = 16
block_size = None

underlying_attn_backend = get_attn_backend(
head_size, dtype, kv_cache_dtype, block_size
)
attn_backend = create_chunked_local_attention_backend(
underlying_attn_backend, attention_chunk_size, block_size
underlying_attn_backend, attention_chunk_size
)

super().__init__(
Expand Down
17 changes: 12 additions & 5 deletions vllm/model_executor/layers/attention/mla_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,17 +407,24 @@ def __init__(
)

# Attributes for forward_impl method
self.chunked_prefill_workspace_size = (
MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
get_current_vllm_config()
)
)
self._vllm_config = get_current_vllm_config()
self._chunked_prefill_workspace_size: int | None = None
self._decode_concat_quant_fp8_op = _DecodeConcatQuantFP8(
static=True,
group_shape=GroupShape.PER_TENSOR,
compile_native=True,
)

@property
def chunked_prefill_workspace_size(self) -> int:
    """Chunked-prefill workspace size, computed lazily on first access.

    The value is derived from the stored vLLM config via
    ``MLACommonMetadataBuilder`` and memoized so subsequent reads are
    cheap.
    """
    cached = self._chunked_prefill_workspace_size
    if cached is None:
        cached = MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
            self._vllm_config
        )
        self._chunked_prefill_workspace_size = cached
    return cached

def forward(
self,
q: torch.Tensor,
Expand Down
Loading