Merged
29 commits
ef71a0e
move block_size update to platform update_block_size_for_backend
xuechendi Mar 23, 2026
52f18b7
enable is_act_and_mul for xpu
xuechendi Mar 18, 2026
778cf5f
Refactor: move hybrid block_size alignment into base Platform
MatthewBonanni Mar 24, 2026
1954d71
Merge commit 'refs/pull/16/head' of https://github.com/xuechendi/vllm…
xuechendi Mar 24, 2026
d06a35a
Revert "enable is_act_and_mul for xpu"
xuechendi Mar 18, 2026
c827335
Fix kernel_block_alignment_size
xuechendi Mar 24, 2026
2d78e87
Add get_preferred_block_size to fa and update for xpu
xuechendi Mar 24, 2026
c8a020f
Merge remote-tracking branch 'origin/main' into wip_nemotron_h_xpu
xuechendi Mar 24, 2026
2f66345
remove default_block_size and fix multi_backend
xuechendi Mar 24, 2026
d80ab76
move ssm check to attn_backend
xuechendi Mar 24, 2026
c7f188a
Update vllm/platforms/interface.py
xuechendi Mar 24, 2026
347bbad
clean up codes
xuechendi Mar 24, 2026
128ada2
update the way to get kernel_block_alignment_size
xuechendi Mar 24, 2026
8e9ce79
Merge remote-tracking branch 'origin/main' into wip_nemotron_h_xpu
xuechendi Mar 25, 2026
d614874
Fix pytest error
xuechendi Mar 25, 2026
cb0596f
Fix pytest
xuechendi Mar 26, 2026
7362163
Fix mamba_attn_chunk_size ignore issue due to pre mamba_block_size
xuechendi Mar 26, 2026
a234a85
skip return default for user_specified_block_size
xuechendi Mar 26, 2026
67f1195
fix last fix
xuechendi Mar 26, 2026
aa44d02
Merge branch 'main' into wip_nemotron_h_xpu
NickLucche Mar 26, 2026
04249bb
Refactor update_block_size_for_backend for clarity
MatthewBonanni Mar 26, 2026
26d5bac
Comments
MatthewBonanni Mar 26, 2026
8f1f8d8
Simplify
MatthewBonanni Mar 26, 2026
6a9ba62
Add user_specified_mamba_block_size
xuechendi Mar 26, 2026
e07c3bd
check backend_cls for non_attn_layer case
xuechendi Mar 26, 2026
6ca84a4
fix comments
xuechendi Mar 27, 2026
6765e45
Merge branch 'main' into wip_nemotron_h_xpu
xuechendi Mar 27, 2026
a83006b
Merge branch 'main' into wip_nemotron_h_xpu
xuechendi Mar 28, 2026
613428e
Merge branch 'main' into wip_nemotron_h_xpu
MatthewBonanni Mar 30, 2026
2 changes: 2 additions & 0 deletions tests/v1/worker/test_gpu_model_runner.py
@@ -851,6 +851,7 @@ def test_hybrid_attention_mamba_tensor_shapes():
vllm_ctx = vllm_config.compilation_config.static_forward_context

runner = GPUModelRunner(vllm_config, DEVICE)
current_platform.update_block_size_for_backend(vllm_config)
kv_cache_spec = runner.get_kv_cache_spec()

available_memory = 5 * GiB_bytes
@@ -1306,6 +1307,7 @@ def test_mamba_cache_raises_when_max_num_seqs_exceeds_blocks():
assert fwd_context is not None

runner = GPUModelRunner(vllm_config, DEVICE)
current_platform.update_block_size_for_backend(vllm_config)
kv_cache_spec = runner.get_kv_cache_spec()

available_memory = 5 * GiB_bytes
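Both test hunks add the same call: block-size alignment now happens in the platform hook rather than during model config verification, so the tests must invoke it after the runner is constructed and before the KV cache spec is derived. A minimal sketch of the ordering, assuming a hypothetical helper that builds a hybrid-model config (only the two calls around it come from the diff):

```python
from vllm.platforms import current_platform
from vllm.v1.worker.gpu_model_runner import GPUModelRunner

DEVICE = "cuda"  # illustrative; the tests use their own DEVICE constant
vllm_config = make_hybrid_vllm_config()  # hypothetical helper, not part of the PR

runner = GPUModelRunner(vllm_config, DEVICE)
# Alignment runs here, once the attention backend for each layer is known,
# so it must precede the KV cache spec query.
current_platform.update_block_size_for_backend(vllm_config)
kv_cache_spec = runner.get_kv_cache_spec()
```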
5 changes: 5 additions & 0 deletions vllm/config/cache.py
@@ -38,6 +38,8 @@ class CacheConfig:
Accepts None (meaning "use default"). After construction, always int."""
user_specified_block_size: bool = field(default=False, init=False)
"""Whether block_size was explicitly provided. Derived automatically."""
user_specified_mamba_block_size: bool = field(default=False, init=False)
"""Whether mamba_block_size was explicitly provided. Derived automatically."""
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
@@ -174,6 +176,7 @@ def compute_hash(self) -> str:
"cpu_kvcache_space_bytes",
"mamba_page_size_padded",
"user_specified_block_size",
"user_specified_mamba_block_size",
"_block_size_resolved",
# Post-init/derived counters
"num_gpu_blocks",
@@ -206,6 +209,8 @@ def _apply_block_size_default(self) -> "CacheConfig":
object.__setattr__(self, "block_size", self.DEFAULT_BLOCK_SIZE)
else:
object.__setattr__(self, "user_specified_block_size", True)
if self.mamba_block_size is not None:
object.__setattr__(self, "user_specified_mamba_block_size", True)
return self

@field_validator("calculate_kv_scales", mode="after")
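The new flag follows the same pattern as the existing `user_specified_block_size`: it is never passed by the caller but derived in the post-init validator, so later code can tell an explicit `mamba_block_size` apart from the default. A minimal standalone sketch of that behaviour (a plain-dataclass approximation, not the actual config class; the default value of 16 is illustrative):

```python
from dataclasses import dataclass, field

@dataclass
class CacheConfigSketch:
    block_size: int | None = None        # None means "use the default"
    mamba_block_size: int | None = None
    user_specified_block_size: bool = field(default=False, init=False)
    user_specified_mamba_block_size: bool = field(default=False, init=False)

    DEFAULT_BLOCK_SIZE = 16  # illustrative default

    def __post_init__(self) -> None:
        # Mirrors _apply_block_size_default in the diff: record whether the
        # user set the values explicitly before defaults are filled in.
        if self.block_size is None:
            self.block_size = self.DEFAULT_BLOCK_SIZE
        else:
            self.user_specified_block_size = True
        if self.mamba_block_size is not None:
            self.user_specified_mamba_block_size = True

assert CacheConfigSketch().user_specified_mamba_block_size is False
assert CacheConfigSketch(mamba_block_size=512).user_specified_mamba_block_size is True
```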
149 changes: 7 additions & 142 deletions vllm/model_executor/models/config.py
@@ -1,15 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from copy import deepcopy
from math import lcm
from typing import TYPE_CHECKING

from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
from vllm.utils.math_utils import round_up

if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig
@@ -104,11 +99,11 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
"""
Ensure that page size of attention layers is greater than or
equal to the mamba layers. If not, automatically set the attention
block size to ensure that it is. If the attention page size is
strictly greater than the mamba page size, we pad the mamba page size
to make them equal.
Perform early validation and setup for hybrid attention/mamba models.

Block size alignment with mamba page sizes is handled later by
Platform.update_block_size_for_backend(), which runs after model
layers are constructed and the attention backend is known.

Args:
vllm_config: vLLM Config
@@ -118,6 +113,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
# Disable calculate_kv_scales for hybrid models: uninitialized
# recurrent state corrupts scales during the calibration pass.
# See issue: https://github.com/vllm-project/vllm/issues/37554

if cache_config.calculate_kv_scales:
logger.warning(
"Disabling calculate_kv_scales for hybrid model '%s'. "
Expand All @@ -129,140 +125,9 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
)
cache_config.calculate_kv_scales = False

# Save the user input before it gets modified by MambaModelConfig
mamba_block_size = cache_config.mamba_block_size
# Enable FULL_AND_PIECEWISE by default
MambaModelConfig.verify_and_update_config(vllm_config)

attention_config = vllm_config.attention_config
cache_config = vllm_config.cache_config
model_config = vllm_config.model_config
parallel_config = vllm_config.parallel_config

if cache_config.cache_dtype == "auto":
kv_cache_dtype = model_config.dtype
else:
kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]

# get attention page size (for 1 token)
# Attention backend constraints:
# - FlashAttention (FA) requires block size to be multiple of 16
# - MLA (Multi-head Latent Attention) requires larger alignment:
# * CUTLASS_MLA backend: kernel_block_size 128 alignment
# * Other MLA backends: kernel_block_size 64 alignment
if model_config.use_mla:
use_cutlass_mla = (
attention_config.backend == AttentionBackendEnum.CUTLASS_MLA
)
kernel_block_alignment_size = 128 if use_cutlass_mla else 64
attn_page_size_1_token = MLAAttentionSpec(
block_size=1,
num_kv_heads=model_config.get_num_kv_heads(parallel_config),
head_size=model_config.get_head_size(),
dtype=kv_cache_dtype,
).page_size_bytes
else:
kernel_block_alignment_size = 16
attn_page_size_1_token = FullAttentionSpec(
block_size=1,
num_kv_heads=model_config.get_num_kv_heads(parallel_config),
head_size=model_config.get_head_size(),
dtype=kv_cache_dtype,
).page_size_bytes

model_cls, _ = ModelRegistry.resolve_model_cls(
model_config.architecture,
model_config=model_config,
)

# get mamba page size
mamba_page_size = MambaSpec(
shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),
dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),
block_size=-1, # block_size doesn't matter for mamba page size
).page_size_bytes

# Model may be marked as is_hybrid
# but mamba is skipped via config,
# return directly
if mamba_page_size == 0:
return

if cache_config.mamba_cache_mode == "all":
# With prefix caching, select attention block size to
# optimize for mamba kernel performance

# Mamba2 SSD kernel uses a chunk_size, e.g. 256
# Align the block to the kernel: use lowest multiple of chunk_size
# of attention tokens that would fit mamba_page_size:
# e.g. for mamba page size = 788kB
# attn_1_token = 2kB -> fits ~394 tokens
# then round up to a multiple of 256 -> 512 tokens
# End result:
# attn_block_size = 512
# mamba_block_size = 512 (aligned to a multiple of chunk_size)
# TODO(tdoublep): this constraint can be relaxed fairly
# easily by changing the way we layout chunks in the
# mamba2 kernels.

base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size()
attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)
chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
cache_config.mamba_block_size = attn_block_size
else:
# Without prefix caching, select minimum valid attention block size
# to minimize mamba state padding

# Calculate minimum attention block size that satisfies both:
# 1. Backend alignment requirements (kernel_block_alignment_size)
# 2. Mamba page size compatibility (attn_page_size >= mamba_page_size)
attn_block_size = kernel_block_alignment_size * cdiv(
mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
)

# override attention block size if it is too small,
# even if the user has explicitly set it
if cache_config.block_size < attn_block_size:
cache_config.block_size = attn_block_size
logger.info(
"Setting attention block size to %d tokens "
"to ensure that attention page size is >= mamba page size.",
attn_block_size,
)

# By default, mamba block size will be set to max_model_len.
# When enabling prefix caching and using align mamba cache
# mode, we align mamba block size to the block size as the
# basic granularity for prefix caching.
if cache_config.mamba_cache_mode == "align":
cache_config.mamba_block_size = cache_config.block_size

# compute new attention page size
attn_page_size = cache_config.block_size * attn_page_size_1_token

assert attn_page_size >= mamba_page_size

if attn_page_size == mamba_page_size:
# don't need to pad mamba page size
return

# pad mamba page size to exactly match attention
if (
cache_config.mamba_page_size_padded is None
or cache_config.mamba_page_size_padded != attn_page_size
):
cache_config.mamba_page_size_padded = attn_page_size
mamba_padding_pct = (
100 * (attn_page_size - mamba_page_size) / mamba_page_size
)
logger.info(
"Padding mamba page size by %.2f%% to ensure "
"that mamba page size and attention page size are "
"exactly equal.",
mamba_padding_pct,
)


class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig):
@staticmethod
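The 142 deleted lines are relocated rather than dropped: the page-size comparison and block-size alignment now live in `Platform.update_block_size_for_backend()`, where the real per-layer attention backend is known. For reference, a minimal sketch of the calculation the removed code performed, using the figures from its own comment (~788 kB of mamba state, ~2 kB of KV cache per attention token, chunk size 256) and assuming a FlashAttention-style 16-token alignment:

```python
from math import lcm

def cdiv(a: int, b: int) -> int:
    """Ceiling division, as used by the removed code."""
    return -(-a // b)

# Example values from the removed comment:
mamba_page_size = 788 * 1024        # ~788 kB of mamba state per sequence
attn_page_size_1_token = 2 * 1024   # ~2 kB of KV cache per attention token
kernel_block_alignment_size = 16    # FlashAttention block-size alignment
base_chunk_size = 256               # Mamba2 SSD kernel chunk_size

# With prefix caching ("all" mode): round the attention block up to a
# multiple of the chunk size so one mamba state maps onto whole chunks.
attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)   # 394
chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)                # 256
attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)  # 512

# Without prefix caching: smallest aligned block whose page is >= mamba page.
min_attn_block_size = kernel_block_alignment_size * cdiv(
    mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
)  # 400

print(attn_block_size, min_attn_block_size)
```

With prefix caching the attention block rounds up to a whole number of mamba chunks (512 tokens here); without it, only the 16-token kernel alignment applies, giving the smaller 400-token block.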