From 5ed3486219e19584fb9d1a1dd6c5badac387730d Mon Sep 17 00:00:00 2001 From: Andrii Skliar Date: Wed, 4 Mar 2026 13:48:26 +0100 Subject: [PATCH] [Feature] Add new model configurations and update cache handling for DeepSeekV32 and GptOss (#36000) - Introduced `DeepseekV32ForCausalLM` class to update fp8 cache format. - Added `Ernie4_5_VLMoeForConditionalGenerationConfig` to manage MoE execution order. - Implemented `GptOssForCausalLMConfig` to enhance structured output handling and increase max CUDA graph capture size. - Added `HybridAttentionMambaModelConfig` to ensure compatibility between attention and mamba layer sizes. Signed-off-by: Andrii Skliar --- vllm/model_executor/models/config.py | 740 +++++++++++++-------------- 1 file changed, 370 insertions(+), 370 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index ec03d283fed1..734e3ad2339f 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -28,6 +28,36 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: return +class DeepseekV32ForCausalLM(VerifyAndUpdateConfig): + @classmethod + def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: + """ + Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32 + """ + hf_config = vllm_config.model_config.hf_config + + # Mirror the check in vllm/model_executor/models/deepseek_v2.py + is_v32 = hasattr(hf_config, "index_topk") + assert is_v32 + + # For DeepSeekV3.2, a custom fp8 format is used when fp8 kv-cache is enabled. 
+ cache_config = vllm_config.cache_config + if cache_config.cache_dtype.startswith("fp8"): + cache_config.cache_dtype = "fp8_ds_mla" + logger.info("Using custom fp8 kv-cache format for DeepSeekV3.2") + if cache_config.cache_dtype == "bfloat16": + cache_config.cache_dtype = "auto" + logger.info("Using bfloat16 kv-cache for DeepSeekV3.2") + + +class Ernie4_5_VLMoeForConditionalGenerationConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + # Ernie4.5-VL conditionally executes text/vision MoE branches, so + # fast_moe_cold_start can silently produce incorrect execution order. + vllm_config.compilation_config.fast_moe_cold_start = False + + class Gemma3TextModelConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_model_config(model_config: "ModelConfig") -> None: @@ -35,6 +65,29 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: hf_config.is_causal = not hf_config.use_bidirectional_attention +class GptOssForCausalLMConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + structured_outputs_config = vllm_config.structured_outputs_config + if structured_outputs_config.reasoning_parser == "": + structured_outputs_config.reasoning_parser = "openai_gptoss" + + # Increase the max capture size from 512 to 1024 for performance. + # NOTE(woosuk): This will increase the number of CUDA graphs + # from 67 to 83. + compilation_config = vllm_config.compilation_config + # Only override when the user has not set either of + # cudagraph_capture_sizes or max_cudagraph_capture_size. 
+ if ( + compilation_config.cudagraph_capture_sizes is None + and compilation_config.max_cudagraph_capture_size is None + ): + compilation_config.max_cudagraph_capture_size = 1024 + logger.info( + "Overriding max cuda graph capture size to %d for performance.", 1024 + ) + + class GteNewModelConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_model_config(model_config: "ModelConfig") -> None: @@ -55,6 +108,166 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: } +class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): + @classmethod + def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: + """ + Ensure that page size of attention layers is greater than or + equal to the mamba layers. If not, automatically set the attention + block size to ensure that it is. If the attention page size is + strictly greater than the mamba page size, we pad the mamba page size + to make them equal. + + Args: + vllm_config: vLLM Config + """ + # Save the user input before it gets modified by MambaModelConfig + mamba_block_size = vllm_config.cache_config.mamba_block_size + # Enable FULL_AND_PIECEWISE by default + MambaModelConfig.verify_and_update_config(vllm_config) + + attention_config = vllm_config.attention_config + cache_config = vllm_config.cache_config + model_config = vllm_config.model_config + parallel_config = vllm_config.parallel_config + + if cache_config.cache_dtype == "auto": + kv_cache_dtype = model_config.dtype + else: + kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + + # get attention page size (for 1 token) + # Attention backend constraints: + # - FlashAttention (FA) requires block size to be multiple of 16 + # - MLA (Multi-head Latent Attention) requires larger alignment: + # * CUTLASS_MLA backend: kernel_block_size 128 alignment + # * Other MLA backends: kernel_block_size 64 alignment + if model_config.use_mla: + use_cutlass_mla = ( + attention_config.backend == 
AttentionBackendEnum.CUTLASS_MLA + ) + kernel_block_alignment_size = 128 if use_cutlass_mla else 64 + attn_page_size_1_token = MLAAttentionSpec( + block_size=1, + num_kv_heads=model_config.get_num_kv_heads(parallel_config), + head_size=model_config.get_head_size(), + dtype=kv_cache_dtype, + ).page_size_bytes + else: + kernel_block_alignment_size = 16 + if ( + current_platform.is_device_capability_family(100) + and model_config.get_head_size() == 256 + and ( + attention_config.backend is None + or attention_config.backend == AttentionBackendEnum.FLASHINFER + ) + ): + # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that + # head size 256 and block size 16 is not supported on blackwell. + kernel_block_alignment_size = 32 + attn_page_size_1_token = FullAttentionSpec( + block_size=1, + num_kv_heads=model_config.get_num_kv_heads(parallel_config), + head_size=model_config.get_head_size(), + dtype=kv_cache_dtype, + ).page_size_bytes + + model_cls, _ = ModelRegistry.resolve_model_cls( + model_config.architecture, + model_config=model_config, + ) + + # get mamba page size + mamba_page_size = MambaSpec( + shapes=model_cls.get_mamba_state_shape_from_config(vllm_config), + dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config), + block_size=-1, # block_size doesn't matter for mamba page size + ).page_size_bytes + + # Model may be marked as is_hybrid + # but mamba is skipped via config, + # return directly + if mamba_page_size == 0: + return + + if cache_config.mamba_cache_mode == "all": + # With prefix caching, select attention block size to + # optimize for mamba kernel performance + + # Mamba2 SSD kernel uses a chunk_size, e.g. 256 + # Align the block to the kernel: use lowest multiple of chunk_size + # of attention tokens that would fit mamba_page_size: + # e.g. 
for mamba page size = 788kB + # attn_1_token = 2kB -> fits ~394 tokens + # then round up to a multiple of 256 -> 512 tokens + # End result: + # attn_block_size = 512 + # mamba_block_size = 512 (aligned to a multiple of chunk_size) + # TODO(tdoublep): this constraint can be relaxed fairly + # easily by changing the way we layout chunks in the + # mamba2 kernels. + + base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size() + attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token) + chunk_size = lcm(base_chunk_size, kernel_block_alignment_size) + attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size) + cache_config.mamba_block_size = attn_block_size + else: + # Without prefix caching, select minimum valid attention block size + # to minimize mamba state padding + + # Calculate minimum attention block size that satisfies both: + # 1. Backend alignment requirements (kernel_block_alignment_size) + # 2. Mamba page size compatibility (attn_page_size >= mamba_page_size) + attn_block_size = kernel_block_alignment_size * cdiv( + mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token + ) + + # override attention block size if either (a) the + # user has not set it or (b) the user has set it + # too small. + if cache_config.block_size is None or cache_config.block_size < attn_block_size: + cache_config.block_size = attn_block_size + logger.info( + "Setting attention block size to %d tokens " + "to ensure that attention page size is >= mamba page size.", + attn_block_size, + ) + + # By default, mamba block size will be set to max_model_len. + # When enabling prefix caching and using align mamba cache + # mode, we align mamba block size to the block size as the + # basic granularity for prefix caching. 
+ if cache_config.mamba_cache_mode == "align": + cache_config.mamba_block_size = cache_config.block_size + + # compute new attention page size + attn_page_size = cache_config.block_size * attn_page_size_1_token + + assert attn_page_size >= mamba_page_size + + if attn_page_size == mamba_page_size: + # don't need to pad mamba page size + return + + # pad mamba page size to exactly match attention + if ( + cache_config.mamba_page_size_padded is None + or cache_config.mamba_page_size_padded != attn_page_size + ): + cache_config.mamba_page_size_padded = attn_page_size + mamba_padding_pct = ( + 100 * (attn_page_size - mamba_page_size) / mamba_page_size + ) + logger.info( + "Padding mamba page size by %.2f%% to ensure " + "that mamba page size and attention page size are " + "exactly equal.", + mamba_padding_pct, + ) + + class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_model_config(model_config: "ModelConfig") -> None: @@ -91,6 +304,16 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: } +class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config + config.num_labels = 1 + pooler_config = model_config.pooler_config + if pooler_config.logit_bias is None: + pooler_config.logit_bias = 2.65 + + class LlamaBidirectionalConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_model_config(model_config: "ModelConfig") -> None: @@ -148,30 +371,119 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: model_config.pooler_config.seq_pooling_type = pooling_type -class NomicBertModelConfig(VerifyAndUpdateConfig): - @staticmethod - def verify_and_update_model_config(model_config: "ModelConfig") -> None: - config = model_config.hf_config - - assert config.__class__.__name__ == "NomicBertConfig" - assert config.activation_function in ["swiglu", 
"gelu"] - config.position_embedding_type = getattr( - config, "position_embedding_type", "rope" - ) - - if config.activation_function == "swiglu": - config.hidden_act = "silu" - else: - config.hidden_act = config.activation_function - - assert config.mlp_fc1_bias == config.mlp_fc2_bias == config.qkv_proj_bias - config.bias = config.qkv_proj_bias +class MambaModelConfig(VerifyAndUpdateConfig): + @classmethod + def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: + """ + Enable FULL_AND_PIECEWISE cuda graph mode by default (required + to get good performance for mamba layers in V1). - assert config.rotary_emb_scale_base is None - assert not config.rotary_emb_interleaved + Args: + vllm_config: vLLM Config + """ + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config - config.layer_norm_eps = config.layer_norm_epsilon - config.intermediate_size = config.n_inner + if cache_config.enable_prefix_caching: + if cache_config.mamba_cache_mode == "none": + cache_config.mamba_cache_mode = ( + "all" if model_config.supports_mamba_prefix_caching else "align" + ) + logger.warning( + "Mamba cache mode is set to '%s' for %s by default " + "when prefix caching is enabled", + cache_config.mamba_cache_mode, + model_config.architecture, + ) + if ( + cache_config.mamba_cache_mode == "all" + and not model_config.supports_mamba_prefix_caching + ): + cache_config.mamba_cache_mode = "align" + logger.warning( + "Hybrid or mamba-based model detected without support " + "for prefix caching with Mamba cache 'all' mode: " + "falling back to 'align' mode." + ) + if cache_config.mamba_cache_mode == "align": + assert vllm_config.scheduler_config.enable_chunked_prefill, ( + "Chunked prefill is required for mamba cache mode 'align'." + ) + logger.info( + "Warning: Prefix caching in Mamba cache '%s' " + "mode is currently enabled. " + "Its support for Mamba layers is experimental. 
" + "Please report any issues you may observe.", + cache_config.mamba_cache_mode, + ) + # By default, mamba block size will be set to max_model_len (see + # below). When enabling prefix caching, we align mamba block size + # to the block size as the basic granularity for prefix caching. + if cache_config.mamba_block_size is None: + cache_config.mamba_block_size = cache_config.block_size + else: + if cache_config.mamba_cache_mode != "none": + cache_config.mamba_cache_mode = "none" + logger.warning( + "Mamba cache mode is set to 'none' when prefix caching is disabled" + ) + if cache_config.mamba_block_size is None: + cache_config.mamba_block_size = model_config.max_model_len + + +class NemotronHForCausalLMConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + """Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto' + (or not explicitly set), to the value specified in the HF config, or to + float16 if not specified. 
+ """ + cache_config = vllm_config.cache_config + if cache_config.mamba_ssm_cache_dtype == "auto": + hf_config = vllm_config.model_config.hf_config + mamba_ssm_cache_dtype = getattr( + hf_config, "mamba_ssm_cache_dtype", "float16" + ) + logger.info( + "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model", + mamba_ssm_cache_dtype, + ) + cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype + + +class NemotronHNanoVLV2Config(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + mm_config = model_config.multimodal_config + if mm_config is not None: + video_kwargs = mm_config.media_io_kwargs.setdefault("video", {}) + video_kwargs.setdefault("video_backend", "nemotron_vl") + + +class NomicBertModelConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config + + assert config.__class__.__name__ == "NomicBertConfig" + assert config.activation_function in ["swiglu", "gelu"] + config.position_embedding_type = getattr( + config, "position_embedding_type", "rope" + ) + + if config.activation_function == "swiglu": + config.hidden_act = "silu" + else: + config.hidden_act = config.activation_function + + assert config.mlp_fc1_bias == config.mlp_fc2_bias == config.qkv_proj_bias + config.bias = config.qkv_proj_bias + + assert config.rotary_emb_scale_base is None + assert not config.rotary_emb_interleaved + + config.layer_norm_eps = config.layer_norm_epsilon + config.intermediate_size = config.n_inner config.hidden_size = config.n_embd config.num_hidden_layers = config.n_layer model_config.model_arch_config.hidden_size = config.hidden_size @@ -299,338 +611,6 @@ class Qwen3VLForSequenceClassificationConfig(Qwen3ForSequenceClassificationConfi pass -class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): - @staticmethod - def verify_and_update_model_config(model_config: "ModelConfig") -> None: - config = 
model_config.hf_config - config.num_labels = 1 - pooler_config = model_config.pooler_config - if pooler_config.logit_bias is None: - pooler_config.logit_bias = 2.65 - - -class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): - @staticmethod - def verify_and_update_model_config(model_config: "ModelConfig") -> None: - config = model_config.hf_config - - assert config.__class__.__name__ == "GteConfig" - assert config.hidden_act == "gelu" - - config.hidden_act = "geglu" - - head_dim = config.hidden_size // config.num_attention_heads - rotary_dim = getattr(config, "rotary_emb_dim", head_dim) - config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim - config.rotary_kwargs = { - "head_size": head_dim, - "max_position": config.max_position_embeddings, - "rope_parameters": config.rope_parameters, - } - - -class Ernie4_5_VLMoeForConditionalGenerationConfig(VerifyAndUpdateConfig): - @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - # Ernie4.5-VL conditionally executes text/vision MoE branches, so - # fast_moe_cold_start can silently produce incorrect execution order. - vllm_config.compilation_config.fast_moe_cold_start = False - - -class GptOssForCausalLMConfig(VerifyAndUpdateConfig): - @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - structured_outputs_config = vllm_config.structured_outputs_config - if structured_outputs_config.reasoning_parser == "": - structured_outputs_config.reasoning_parser = "openai_gptoss" - - # Increase the max capture size from 512 to 1024 for performance. - # NOTE(woosuk): This will increase the number of CUDA graphs - # from 67 to 83. - compilation_config = vllm_config.compilation_config - # Only override when the user has not set either of - # cudagraph_capture_sizes or max_cudagraph_capture_size. 
- if ( - compilation_config.cudagraph_capture_sizes is None - and compilation_config.max_cudagraph_capture_size is None - ): - compilation_config.max_cudagraph_capture_size = 1024 - logger.info( - "Overriding max cuda graph capture size to %d for performance.", 1024 - ) - - -class MambaModelConfig(VerifyAndUpdateConfig): - @classmethod - def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: - """ - Enable FULL_AND_PIECEWISE cuda graph mode by default (required - to get good performance for mamba layers in V1). - - Args: - vllm_config: vLLM Config - """ - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - - if cache_config.enable_prefix_caching: - if cache_config.mamba_cache_mode == "none": - cache_config.mamba_cache_mode = ( - "all" if model_config.supports_mamba_prefix_caching else "align" - ) - logger.warning( - "Mamba cache mode is set to '%s' for %s by default " - "when prefix caching is enabled", - cache_config.mamba_cache_mode, - model_config.architecture, - ) - if ( - cache_config.mamba_cache_mode == "all" - and not model_config.supports_mamba_prefix_caching - ): - cache_config.mamba_cache_mode = "align" - logger.warning( - "Hybrid or mamba-based model detected without support " - "for prefix caching with Mamba cache 'all' mode: " - "falling back to 'align' mode." - ) - if cache_config.mamba_cache_mode == "align": - assert vllm_config.scheduler_config.enable_chunked_prefill, ( - "Chunked prefill is required for mamba cache mode 'align'." - ) - logger.info( - "Warning: Prefix caching in Mamba cache '%s' " - "mode is currently enabled. " - "Its support for Mamba layers is experimental. " - "Please report any issues you may observe.", - cache_config.mamba_cache_mode, - ) - # By default, mamba block size will be set to max_model_len (see - # below). When enabling prefix caching, we align mamba block size - # to the block size as the basic granularity for prefix caching. 
- if cache_config.mamba_block_size is None: - cache_config.mamba_block_size = cache_config.block_size - else: - if cache_config.mamba_cache_mode != "none": - cache_config.mamba_cache_mode = "none" - logger.warning( - "Mamba cache mode is set to 'none' when prefix caching is disabled" - ) - if cache_config.mamba_block_size is None: - cache_config.mamba_block_size = model_config.max_model_len - - -class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): - @classmethod - def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: - """ - Ensure that page size of attention layers is greater than or - equal to the mamba layers. If not, automatically set the attention - block size to ensure that it is. If the attention page size is - strictly greater than the mamba page size, we pad the mamba page size - to make them equal. - - Args: - vllm_config: vLLM Config - """ - # Save the user input before it gets modified by MambaModelConfig - mamba_block_size = vllm_config.cache_config.mamba_block_size - # Enable FULL_AND_PIECEWISE by default - MambaModelConfig.verify_and_update_config(vllm_config) - - attention_config = vllm_config.attention_config - cache_config = vllm_config.cache_config - model_config = vllm_config.model_config - parallel_config = vllm_config.parallel_config - - if cache_config.cache_dtype == "auto": - kv_cache_dtype = model_config.dtype - else: - kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - # get attention page size (for 1 token) - # Attention backend constraints: - # - FlashAttention (FA) requires block size to be multiple of 16 - # - MLA (Multi-head Latent Attention) requires larger alignment: - # * CUTLASS_MLA backend: kernel_block_size 128 alignment - # * Other MLA backends: kernel_block_size 64 alignment - if model_config.use_mla: - use_cutlass_mla = ( - attention_config.backend == AttentionBackendEnum.CUTLASS_MLA - ) - kernel_block_alignment_size = 128 if use_cutlass_mla else 64 - attn_page_size_1_token = 
MLAAttentionSpec( - block_size=1, - num_kv_heads=model_config.get_num_kv_heads(parallel_config), - head_size=model_config.get_head_size(), - dtype=kv_cache_dtype, - ).page_size_bytes - else: - kernel_block_alignment_size = 16 - if ( - current_platform.is_device_capability_family(100) - and model_config.get_head_size() == 256 - and ( - attention_config.backend is None - or attention_config.backend == AttentionBackendEnum.FLASHINFER - ) - ): - # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that` - # head size 256 and block size 16 is not supported on blackwell. - kernel_block_alignment_size = 32 - attn_page_size_1_token = FullAttentionSpec( - block_size=1, - num_kv_heads=model_config.get_num_kv_heads(parallel_config), - head_size=model_config.get_head_size(), - dtype=kv_cache_dtype, - ).page_size_bytes - - model_cls, _ = ModelRegistry.resolve_model_cls( - model_config.architecture, - model_config=model_config, - ) - - # get mamba page size - mamba_page_size = MambaSpec( - shapes=model_cls.get_mamba_state_shape_from_config(vllm_config), - dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config), - block_size=-1, # block_size doesn't matter for mamba page size - ).page_size_bytes - - # Model may be marked as is_hybrid - # but mamba is skipped via config, - # return directly - if mamba_page_size == 0: - return - - if cache_config.mamba_cache_mode == "all": - # With prefix caching, select attention block size to - # optimize for mamba kernel performance - - # Mamba2 SSD kernel uses a chunk_size, e.g. 256 - # Align the block to the kernel: use lowest multiple of chunk_size - # of attention tokens that would fit mamba_page_size: - # e.g. 
for mamba page size = 788kB - # attn_1_token = 2kB -> fits ~394 tokens - # then round up to a multiple of 256 -> 512 tokens - # End result: - # attn_block_size = 512 - # mamba_block_size = 512 (aligned to a multiple of chunk_size) - # TODO(tdoublep): this constraint can be relaxed fairly - # easily by changing the way we layout chunks in the - # mamba2 kernels. - - base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size() - attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token) - chunk_size = lcm(base_chunk_size, kernel_block_alignment_size) - attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size) - cache_config.mamba_block_size = attn_block_size - else: - # Without prefix caching, select minimum valid attention block size - # to minimize mamba state padding - - # Calculate minimum attention block size that satisfies both: - # 1. Backend alignment requirements (kernel_block_alignment_size) - # 2. Mamba page size compatibility (attn_page_size >= mamba_page_size) - attn_block_size = kernel_block_alignment_size * cdiv( - mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token - ) - - # override attention block size if either (a) the - # user has not set it or (b) the user has set it - # too small. - if cache_config.block_size is None or cache_config.block_size < attn_block_size: - cache_config.block_size = attn_block_size - logger.info( - "Setting attention block size to %d tokens " - "to ensure that attention page size is >= mamba page size.", - attn_block_size, - ) - - # By default, mamba block size will be set to max_model_len. - # When enabling prefix caching and using align mamba cache - # mode, we align mamba block size to the block size as the - # basic granularity for prefix caching. 
- if cache_config.mamba_cache_mode == "align": - cache_config.mamba_block_size = cache_config.block_size - - # compute new attention page size - attn_page_size = cache_config.block_size * attn_page_size_1_token - - assert attn_page_size >= mamba_page_size - - if attn_page_size == mamba_page_size: - # don't need to pad mamba page size - return - - # pad mamba page size to exactly match attention - if ( - cache_config.mamba_page_size_padded is None - or cache_config.mamba_page_size_padded != attn_page_size - ): - cache_config.mamba_page_size_padded = attn_page_size - mamba_padding_pct = ( - 100 * (attn_page_size - mamba_page_size) / mamba_page_size - ) - logger.info( - "Padding mamba page size by %.2f%% to ensure " - "that mamba page size and attention page size are " - "exactly equal.", - mamba_padding_pct, - ) - - -class DeepseekV32ForCausalLM(VerifyAndUpdateConfig): - @classmethod - def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: - """ - Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32 - """ - hf_config = vllm_config.model_config.hf_config - - # Mirror the check in vllm/model_executor/models/deepseek_v2.py - is_v32 = hasattr(hf_config, "index_topk") - assert is_v32 - - # For DeepSeekV3.2, a custom fp8 format is used when fp8 kv-cache is enabled. - cache_config = vllm_config.cache_config - if cache_config.cache_dtype.startswith("fp8"): - cache_config.cache_dtype = "fp8_ds_mla" - logger.info("Using custom fp8 kv-cache format for DeepSeekV3.2") - if cache_config.cache_dtype == "bfloat16": - cache_config.cache_dtype = "auto" - logger.info("Using bfloat16 kv-cache for DeepSeekV3.2") - - -class NemotronHForCausalLMConfig(VerifyAndUpdateConfig): - @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - """Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto' - (or not explicitly set), to the value specified in the HF config, or to - float16 if not specified. 
- """ - cache_config = vllm_config.cache_config - if cache_config.mamba_ssm_cache_dtype == "auto": - hf_config = vllm_config.model_config.hf_config - mamba_ssm_cache_dtype = getattr( - hf_config, "mamba_ssm_cache_dtype", "float16" - ) - logger.info( - "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model", - mamba_ssm_cache_dtype, - ) - cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype - - -class NemotronHNanoVLV2Config(VerifyAndUpdateConfig): - @staticmethod - def verify_and_update_model_config(model_config: "ModelConfig") -> None: - mm_config = model_config.multimodal_config - if mm_config is not None: - video_kwargs = mm_config.media_io_kwargs.setdefault("video", {}) - video_kwargs.setdefault("video_backend", "nemotron_vl") - - class Qwen3_5ForConditionalGenerationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: @@ -658,6 +638,26 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: ) +class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config + + assert config.__class__.__name__ == "GteConfig" + assert config.hidden_act == "gelu" + + config.hidden_act = "geglu" + + head_dim = config.hidden_size // config.num_attention_heads + rotary_dim = getattr(config, "rotary_emb_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim + config.rotary_kwargs = { + "head_size": head_dim, + "max_position": config.max_position_embeddings, + "rope_parameters": config.rope_parameters, + } + + class VoyageQwen3BidirectionalEmbedModelConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_model_config(model_config: "ModelConfig") -> None: @@ -666,33 +666,33 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { + "ColBERTJinaRobertaModel": 
JinaRobertaModelConfig, + "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM, + "Ernie4_5_VLMoeForConditionalGeneration": Ernie4_5_VLMoeForConditionalGenerationConfig, # noqa: E501 + "FalconMambaForCausalLM": MambaModelConfig, + "Gemma3TextModel": Gemma3TextModelConfig, + "GptOssForCausalLM": GptOssForCausalLMConfig, "GteModel": SnowflakeGteNewModelConfig, - "GteNewModel": GteNewModelConfig, "GteNewForSequenceClassification": GteNewModelConfig, - "Gemma3TextModel": Gemma3TextModelConfig, - "NemotronH_Nano_VL_V2": NemotronHNanoVLV2Config, + "GteNewModel": GteNewModelConfig, + "JambaForSequenceClassification": JambaForSequenceClassificationConfig, + "JinaVLForRanking": JinaVLForSequenceClassificationConfig, "LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig, "LlamaBidirectionalModel": LlamaBidirectionalConfig, - "LlamaNemotronVLModel": LlamaNemotronVLConfig, "LlamaNemotronVLForSequenceClassification": LlamaNemotronVLConfig, + "LlamaNemotronVLModel": LlamaNemotronVLConfig, + "Mamba2ForCausalLM": MambaModelConfig, + "MambaForCausalLM": MambaModelConfig, + "NemotronHForCausalLM": NemotronHForCausalLMConfig, + "NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig, + "NemotronH_Nano_VL_V2": NemotronHNanoVLV2Config, "NomicBertModel": NomicBertModelConfig, "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, "Qwen2ForRewardModel": Qwen2ForRewardModelConfig, "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig, "Qwen3VLForSequenceClassification": Qwen3VLForSequenceClassificationConfig, - "Ernie4_5_VLMoeForConditionalGeneration": Ernie4_5_VLMoeForConditionalGenerationConfig, # noqa: E501 - "XLMRobertaModel": JinaRobertaModelConfig, - "ColBERTJinaRobertaModel": JinaRobertaModelConfig, - "JinaVLForRanking": JinaVLForSequenceClassificationConfig, - "JambaForSequenceClassification": JambaForSequenceClassificationConfig, - "GptOssForCausalLM": GptOssForCausalLMConfig, - "MambaForCausalLM": MambaModelConfig, - "Mamba2ForCausalLM": 
MambaModelConfig, - "FalconMambaForCausalLM": MambaModelConfig, - "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM, - "NemotronHForCausalLM": NemotronHForCausalLMConfig, - "NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig, "Qwen3_5ForConditionalGeneration": Qwen3_5ForConditionalGenerationConfig, "Qwen3_5MoeForConditionalGeneration": Qwen3_5ForConditionalGenerationConfig, "VoyageQwen3BidirectionalEmbedModel": VoyageQwen3BidirectionalEmbedModelConfig, + "XLMRobertaModel": JinaRobertaModelConfig, }