diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 521184e4c686..856f4b33ed3b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -325,15 +325,26 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: if cache_config.enable_prefix_caching: if cache_config.mamba_cache_mode == "none": - cache_config.mamba_cache_mode = ( - "all" if model_config.supports_mamba_prefix_caching else "align" - ) - logger.warning( - "Mamba cache mode is set to '%s' for %s by default " - "when prefix caching is enabled", - cache_config.mamba_cache_mode, - model_config.architecture, - ) + if ( + model_config.supports_mamba_prefix_caching + and vllm_config.speculative_config is not None + ): + cache_config.mamba_cache_mode = "align" + logger.warning( + "Mamba cache mode is set to 'align' for %s by default " + "when prefix caching and speculative decoding are enabled", + model_config.architecture, + ) + else: + cache_config.mamba_cache_mode = ( + "all" if model_config.supports_mamba_prefix_caching else "align" + ) + logger.warning( + "Mamba cache mode is set to '%s' for %s by default " + "when prefix caching is enabled", + cache_config.mamba_cache_mode, + model_config.architecture, + ) if ( cache_config.mamba_cache_mode == "all" and not model_config.supports_mamba_prefix_caching