From 3c99c85c8af1fe9353b9c7b6734ecb2a8334ba2f Mon Sep 17 00:00:00 2001 From: Roi Koren Date: Mon, 20 Apr 2026 14:54:34 +0300 Subject: [PATCH 1/3] Default to "align" prefix caching mode for NemotronH+MTP Signed-off-by: Roi Koren --- vllm/model_executor/models/config.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 521184e4c686..298d47cb60a5 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -399,6 +399,19 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: hf_config=vllm_config.model_config.hf_config, ) + cache_config = vllm_config.cache_config + speculative_config = vllm_config.speculative_config + if ( + cache_config.enable_prefix_caching + and cache_config.mamba_cache_mode == "none" + and speculative_config is not None + and speculative_config.method == "mtp" + ): + cache_config.mamba_cache_mode = "align" + logger.info( + "Defaulting mamba_cache_mode to 'align' for NemotronH with MTP." + ) + class NemotronHNanoVLV2Config(VerifyAndUpdateConfig): @classmethod From a8a085f26de752c1d220b2a54a609eec416f5028 Mon Sep 17 00:00:00 2001 From: Roi Koren Date: Tue, 21 Apr 2026 12:59:50 +0300 Subject: [PATCH 2/3] Default to "align" prefix caching mode for Mamba-based+SpecDec instead Signed-off-by: Roi Koren --- vllm/model_executor/models/config.py | 29 +++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 298d47cb60a5..5c293b1f06f3 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -325,15 +325,26 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: if cache_config.enable_prefix_caching: if cache_config.mamba_cache_mode == "none": - cache_config.mamba_cache_mode = ( - "all" if model_config.supports_mamba_prefix_caching else "align" - ) - logger.warning( - "Mamba cache mode is set to '%s' for %s by default " - "when prefix caching is enabled", - cache_config.mamba_cache_mode, - model_config.architecture, - ) + if ( + model_config.supports_mamba_prefix_caching + and vllm_config.speculative_config is not None + ): + cache_config.mamba_cache_mode = "align" + logger.warning( + "Mamba cache mode is set to 'align' for %s by default " + "when prefix caching and speculative decoding are enabled", + model_config.architecture, + ) + else: + cache_config.mamba_cache_mode = ( + "all" if model_config.supports_mamba_prefix_caching else "align" + ) + logger.warning( + "Mamba cache mode is set to '%s' for %s by default " + "when prefix caching is enabled", + cache_config.mamba_cache_mode, + model_config.architecture, + ) if ( cache_config.mamba_cache_mode == "all" and not model_config.supports_mamba_prefix_caching From f4ca4f5ed6c92508e08d55d1d703d5fe720d7c82 Mon Sep 17 00:00:00 2001 From: Roi Koren Date: Tue, 21 Apr 2026 13:00:09 +0300 Subject: [PATCH 3/3] Delete duplicate code Signed-off-by: Roi Koren --- vllm/model_executor/models/config.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 5c293b1f06f3..856f4b33ed3b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -410,19 +410,6 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: hf_config=vllm_config.model_config.hf_config, ) - cache_config = vllm_config.cache_config - speculative_config = vllm_config.speculative_config - if ( - cache_config.enable_prefix_caching - and cache_config.mamba_cache_mode == "none" - and speculative_config is not None - and speculative_config.method == "mtp" - ): - cache_config.mamba_cache_mode = "align" - logger.info( - "Defaulting mamba_cache_mode to 'align' for NemotronH with MTP." - ) - class NemotronHNanoVLV2Config(VerifyAndUpdateConfig): @classmethod