Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 20 additions & 9 deletions vllm/model_executor/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,15 +325,26 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:

if cache_config.enable_prefix_caching:
if cache_config.mamba_cache_mode == "none":
cache_config.mamba_cache_mode = (
"all" if model_config.supports_mamba_prefix_caching else "align"
)
logger.warning(
"Mamba cache mode is set to '%s' for %s by default "
"when prefix caching is enabled",
cache_config.mamba_cache_mode,
model_config.architecture,
)
if (
model_config.supports_mamba_prefix_caching
and vllm_config.speculative_config is not None
):
cache_config.mamba_cache_mode = "align"
logger.warning(
"Mamba cache mode is set to 'align' for %s by default "
"when prefix caching and speculative decoding are enabled",
model_config.architecture,
)
else:
cache_config.mamba_cache_mode = (
"all" if model_config.supports_mamba_prefix_caching else "align"
)
logger.warning(
"Mamba cache mode is set to '%s' for %s by default "
"when prefix caching is enabled",
cache_config.mamba_cache_mode,
model_config.architecture,
)
Comment on lines +328 to +347

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Defaulting to `align` mode for the Mamba cache will crash the server when chunked prefill is not enabled, because of the strict assertion at line 359. Since this PR increases the number of cases in which `align` is used as the default (specifically, when speculative decoding is enabled), we should ensure that `enable_chunked_prefill` is turned on automatically to avoid this usability regression.

Note that this requirement applies whenever `mamba_cache_mode` is set to `align`. Ideally, chunked prefill would be enabled consistently on every code path that leads to `align` mode.

                if (
                    model_config.supports_mamba_prefix_caching
                    and vllm_config.speculative_config is not None
                ):
                    cache_config.mamba_cache_mode = "align"
                    vllm_config.scheduler_config.enable_chunked_prefill = True
                    logger.warning(
                        "Mamba cache mode is set to 'align' for %s by default "
                        "when prefix caching and speculative decoding are enabled. "
                        "Chunked prefill has been enabled as it is required for 'align' mode.",
                        model_config.architecture,
                    )
                else:
                    cache_config.mamba_cache_mode = (
                        "all" if model_config.supports_mamba_prefix_caching else "align"
                    )
                    if cache_config.mamba_cache_mode == "align":
                        vllm_config.scheduler_config.enable_chunked_prefill = True
                    logger.warning(
                        "Mamba cache mode is set to '%s' for %s by default "
                        "when prefix caching is enabled",
                        cache_config.mamba_cache_mode,
                        model_config.architecture,
                    )

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably isn't needed

if (
cache_config.mamba_cache_mode == "all"
and not model_config.supports_mamba_prefix_caching
Expand Down
Loading