From 5ed3486219e19584fb9d1a1dd6c5badac387730d Mon Sep 17 00:00:00 2001
From: Andrii Skliar <askliar@nvidia.com>
Date: Wed, 4 Mar 2026 13:48:26 +0100
Subject: [PATCH] [Feature] Add new model configurations and update cache
 handling for DeepSeekV32 and GptOss (#36000)

- Introduced `DeepseekV32ForCausalLM` class to update fp8 cache format.
- Added `Ernie4_5_VLMoeForConditionalGenerationConfig` to manage MoE execution order.
- Implemented `GptOssForCausalLMConfig` to enhance structured output handling and increase max CUDA graph capture size.
- Added `HybridAttentionMambaModelConfig` to ensure compatibility between attention and mamba layer sizes.

Signed-off-by: Andrii Skliar <askliar@nvidia.com>
---
 vllm/model_executor/models/config.py | 740 +++++++++++++--------------
 1 file changed, 370 insertions(+), 370 deletions(-)

diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index ec03d283fed1..734e3ad2339f 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -28,6 +28,36 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
         return
 
 
+class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
+    @classmethod
+    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
+        """
+        hf_config = vllm_config.model_config.hf_config
+
+        # Mirror the check in vllm/model_executor/models/deepseek_v2.py
+        is_v32 = hasattr(hf_config, "index_topk")
+        assert is_v32
+
+        # For DeepSeekV3.2, a custom fp8 format is used when fp8 kv-cache is enabled.
+        cache_config = vllm_config.cache_config
+        if cache_config.cache_dtype.startswith("fp8"):
+            cache_config.cache_dtype = "fp8_ds_mla"
+            logger.info("Using custom fp8 kv-cache format for DeepSeekV3.2")
+        if cache_config.cache_dtype == "bfloat16":
+            cache_config.cache_dtype = "auto"
+            logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
+
+
+class Ernie4_5_VLMoeForConditionalGenerationConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        # Ernie4.5-VL conditionally executes text/vision MoE branches, so
+        # fast_moe_cold_start can silently produce incorrect execution order.
+        vllm_config.compilation_config.fast_moe_cold_start = False
+
+
 class Gemma3TextModelConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_model_config(model_config: "ModelConfig") -> None:
@@ -35,6 +65,29 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
         hf_config.is_causal = not hf_config.use_bidirectional_attention
 
 
+class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        structured_outputs_config = vllm_config.structured_outputs_config
+        if structured_outputs_config.reasoning_parser == "":
+            structured_outputs_config.reasoning_parser = "openai_gptoss"
+
+        # Increase the max capture size from 512 to 1024 for performance.
+        # NOTE(woosuk): This will increase the number of CUDA graphs
+        # from 67 to 83.
+        compilation_config = vllm_config.compilation_config
+        # Only override when the user has not set either of
+        # cudagraph_capture_sizes or max_cudagraph_capture_size.
+        if (
+            compilation_config.cudagraph_capture_sizes is None
+            and compilation_config.max_cudagraph_capture_size is None
+        ):
+            compilation_config.max_cudagraph_capture_size = 1024
+            logger.info(
+                "Overriding max cuda graph capture size to %d for performance.", 1024
+            )
+
+
 class GteNewModelConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_model_config(model_config: "ModelConfig") -> None:
@@ -55,6 +108,166 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
         }
 
 
+class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
+    @classmethod
+    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Ensure that page size of attention layers is greater than or
+        equal to the mamba layers. If not, automatically set the attention
+        block size to ensure that it is. If the attention page size is
+        strictly greater than the mamba page size, we pad the mamba page size
+        to make them equal.
+
+        Args:
+            vllm_config: vLLM Config
+        """
+        # Save the user input before it gets modified by MambaModelConfig
+        mamba_block_size = vllm_config.cache_config.mamba_block_size
+        # Enable FULL_AND_PIECEWISE by default
+        MambaModelConfig.verify_and_update_config(vllm_config)
+
+        attention_config = vllm_config.attention_config
+        cache_config = vllm_config.cache_config
+        model_config = vllm_config.model_config
+        parallel_config = vllm_config.parallel_config
+
+        if cache_config.cache_dtype == "auto":
+            kv_cache_dtype = model_config.dtype
+        else:
+            kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
+
+        # get attention page size (for 1 token)
+        # Attention backend constraints:
+        # - FlashAttention (FA) requires block size to be multiple of 16
+        # - MLA (Multi-head Latent Attention) requires larger alignment:
+        #   * CUTLASS_MLA backend: kernel_block_size 128 alignment
+        #   * Other MLA backends: kernel_block_size 64 alignment
+        if model_config.use_mla:
+            use_cutlass_mla = (
+                attention_config.backend == AttentionBackendEnum.CUTLASS_MLA
+            )
+            kernel_block_alignment_size = 128 if use_cutlass_mla else 64
+            attn_page_size_1_token = MLAAttentionSpec(
+                block_size=1,
+                num_kv_heads=model_config.get_num_kv_heads(parallel_config),
+                head_size=model_config.get_head_size(),
+                dtype=kv_cache_dtype,
+            ).page_size_bytes
+        else:
+            kernel_block_alignment_size = 16
+            if (
+                current_platform.is_device_capability_family(100)
+                and model_config.get_head_size() == 256
+                and (
+                    attention_config.backend is None
+                    or attention_config.backend == AttentionBackendEnum.FLASHINFER
+                )
+            ):
+                # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that`
+                # head size 256 and block size 16 is not supported on blackwell.
+                kernel_block_alignment_size = 32
+            attn_page_size_1_token = FullAttentionSpec(
+                block_size=1,
+                num_kv_heads=model_config.get_num_kv_heads(parallel_config),
+                head_size=model_config.get_head_size(),
+                dtype=kv_cache_dtype,
+            ).page_size_bytes
+
+        model_cls, _ = ModelRegistry.resolve_model_cls(
+            model_config.architecture,
+            model_config=model_config,
+        )
+
+        # get mamba page size
+        mamba_page_size = MambaSpec(
+            shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),
+            dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),
+            block_size=-1,  # block_size doesn't matter for mamba page size
+        ).page_size_bytes
+
+        # Model may be marked as is_hybrid
+        #  but mamba is skipped via config,
+        #  return directly
+        if mamba_page_size == 0:
+            return
+
+        if cache_config.mamba_cache_mode == "all":
+            # With prefix caching, select attention block size to
+            # optimize for mamba kernel performance
+
+            # Mamba2 SSD kernel uses a chunk_size, e.g. 256
+            # Align the block to the kernel: use lowest multiple of chunk_size
+            # of attention tokens that would fit mamba_page_size:
+            # e.g. for mamba page size = 788kB
+            #          attn_1_token = 2kB -> fits ~394 tokens
+            #      then round up to a multiple of 256 -> 512 tokens
+            # End result:
+            #  attn_block_size = 512
+            #  mamba_block_size = 512 (aligned to a multiple of chunk_size)
+            # TODO(tdoublep): this constraint can be relaxed fairly
+            # easily by changing the way we layout chunks in the
+            # mamba2 kernels.
+
+            base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size()
+            attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)
+            chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
+            attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
+            cache_config.mamba_block_size = attn_block_size
+        else:
+            # Without prefix caching, select minimum valid attention block size
+            # to minimize mamba state padding
+
+            # Calculate minimum attention block size that satisfies both:
+            # 1. Backend alignment requirements (kernel_block_alignment_size)
+            # 2. Mamba page size compatibility (attn_page_size >= mamba_page_size)
+            attn_block_size = kernel_block_alignment_size * cdiv(
+                mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
+            )
+
+        # override attention block size if either (a) the
+        # user has not set it or (b) the user has set it
+        # too small.
+        if cache_config.block_size is None or cache_config.block_size < attn_block_size:
+            cache_config.block_size = attn_block_size
+            logger.info(
+                "Setting attention block size to %d tokens "
+                "to ensure that attention page size is >= mamba page size.",
+                attn_block_size,
+            )
+
+        # By default, mamba block size will be set to max_model_len.
+        # When enabling prefix caching and using align mamba cache
+        # mode, we align mamba block size to the block size as the
+        # basic granularity for prefix caching.
+        if cache_config.mamba_cache_mode == "align":
+            cache_config.mamba_block_size = cache_config.block_size
+
+        # compute new attention page size
+        attn_page_size = cache_config.block_size * attn_page_size_1_token
+
+        assert attn_page_size >= mamba_page_size
+
+        if attn_page_size == mamba_page_size:
+            # don't need to pad mamba page size
+            return
+
+        # pad mamba page size to exactly match attention
+        if (
+            cache_config.mamba_page_size_padded is None
+            or cache_config.mamba_page_size_padded != attn_page_size
+        ):
+            cache_config.mamba_page_size_padded = attn_page_size
+            mamba_padding_pct = (
+                100 * (attn_page_size - mamba_page_size) / mamba_page_size
+            )
+            logger.info(
+                "Padding mamba page size by %.2f%% to ensure "
+                "that mamba page size and attention page size are "
+                "exactly equal.",
+                mamba_padding_pct,
+            )
+
+
 class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_model_config(model_config: "ModelConfig") -> None:
@@ -91,6 +304,16 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
             }
 
 
+class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
+        config.num_labels = 1
+        pooler_config = model_config.pooler_config
+        if pooler_config.logit_bias is None:
+            pooler_config.logit_bias = 2.65
+
+
 class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_model_config(model_config: "ModelConfig") -> None:
@@ -148,30 +371,119 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
         model_config.pooler_config.seq_pooling_type = pooling_type
 
 
-class NomicBertModelConfig(VerifyAndUpdateConfig):
-    @staticmethod
-    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
-        config = model_config.hf_config
-
-        assert config.__class__.__name__ == "NomicBertConfig"
-        assert config.activation_function in ["swiglu", "gelu"]
-        config.position_embedding_type = getattr(
-            config, "position_embedding_type", "rope"
-        )
-
-        if config.activation_function == "swiglu":
-            config.hidden_act = "silu"
-        else:
-            config.hidden_act = config.activation_function
-
-        assert config.mlp_fc1_bias == config.mlp_fc2_bias == config.qkv_proj_bias
-        config.bias = config.qkv_proj_bias
+class MambaModelConfig(VerifyAndUpdateConfig):
+    @classmethod
+    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Enable FULL_AND_PIECEWISE cuda graph mode by default (required
+        to get good performance for mamba layers in V1).
 
-        assert config.rotary_emb_scale_base is None
-        assert not config.rotary_emb_interleaved
+        Args:
+            vllm_config: vLLM Config
+        """
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
 
-        config.layer_norm_eps = config.layer_norm_epsilon
-        config.intermediate_size = config.n_inner
+        if cache_config.enable_prefix_caching:
+            if cache_config.mamba_cache_mode == "none":
+                cache_config.mamba_cache_mode = (
+                    "all" if model_config.supports_mamba_prefix_caching else "align"
+                )
+                logger.warning(
+                    "Mamba cache mode is set to '%s' for %s by default "
+                    "when prefix caching is enabled",
+                    cache_config.mamba_cache_mode,
+                    model_config.architecture,
+                )
+            if (
+                cache_config.mamba_cache_mode == "all"
+                and not model_config.supports_mamba_prefix_caching
+            ):
+                cache_config.mamba_cache_mode = "align"
+                logger.warning(
+                    "Hybrid or mamba-based model detected without support "
+                    "for prefix caching with Mamba cache 'all' mode: "
+                    "falling back to 'align' mode."
+                )
+            if cache_config.mamba_cache_mode == "align":
+                assert vllm_config.scheduler_config.enable_chunked_prefill, (
+                    "Chunked prefill is required for mamba cache mode 'align'."
+                )
+            logger.info(
+                "Warning: Prefix caching in Mamba cache '%s' "
+                "mode is currently enabled. "
+                "Its support for Mamba layers is experimental. "
+                "Please report any issues you may observe.",
+                cache_config.mamba_cache_mode,
+            )
+            # By default, mamba block size will be set to max_model_len (see
+            # below). When enabling prefix caching, we align mamba block size
+            # to the block size as the basic granularity for prefix caching.
+            if cache_config.mamba_block_size is None:
+                cache_config.mamba_block_size = cache_config.block_size
+        else:
+            if cache_config.mamba_cache_mode != "none":
+                cache_config.mamba_cache_mode = "none"
+                logger.warning(
+                    "Mamba cache mode is set to 'none' when prefix caching is disabled"
+                )
+            if cache_config.mamba_block_size is None:
+                cache_config.mamba_block_size = model_config.max_model_len
+
+
+class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        """Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
+        (or not explicitly set), to the value specified in the HF config, or to
+        float16 if not specified.
+        """
+        cache_config = vllm_config.cache_config
+        if cache_config.mamba_ssm_cache_dtype == "auto":
+            hf_config = vllm_config.model_config.hf_config
+            mamba_ssm_cache_dtype = getattr(
+                hf_config, "mamba_ssm_cache_dtype", "float16"
+            )
+            logger.info(
+                "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
+                mamba_ssm_cache_dtype,
+            )
+            cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype
+
+
+class NemotronHNanoVLV2Config(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        mm_config = model_config.multimodal_config
+        if mm_config is not None:
+            video_kwargs = mm_config.media_io_kwargs.setdefault("video", {})
+            video_kwargs.setdefault("video_backend", "nemotron_vl")
+
+
+class NomicBertModelConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
+
+        assert config.__class__.__name__ == "NomicBertConfig"
+        assert config.activation_function in ["swiglu", "gelu"]
+        config.position_embedding_type = getattr(
+            config, "position_embedding_type", "rope"
+        )
+
+        if config.activation_function == "swiglu":
+            config.hidden_act = "silu"
+        else:
+            config.hidden_act = config.activation_function
+
+        assert config.mlp_fc1_bias == config.mlp_fc2_bias == config.qkv_proj_bias
+        config.bias = config.qkv_proj_bias
+
+        assert config.rotary_emb_scale_base is None
+        assert not config.rotary_emb_interleaved
+
+        config.layer_norm_eps = config.layer_norm_epsilon
+        config.intermediate_size = config.n_inner
         config.hidden_size = config.n_embd
         config.num_hidden_layers = config.n_layer
         model_config.model_arch_config.hidden_size = config.hidden_size
@@ -299,338 +611,6 @@ class Qwen3VLForSequenceClassificationConfig(Qwen3ForSequenceClassificationConfi
     pass
 
 
-class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
-    @staticmethod
-    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
-        config = model_config.hf_config
-        config.num_labels = 1
-        pooler_config = model_config.pooler_config
-        if pooler_config.logit_bias is None:
-            pooler_config.logit_bias = 2.65
-
-
-class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
-    @staticmethod
-    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
-        config = model_config.hf_config
-
-        assert config.__class__.__name__ == "GteConfig"
-        assert config.hidden_act == "gelu"
-
-        config.hidden_act = "geglu"
-
-        head_dim = config.hidden_size // config.num_attention_heads
-        rotary_dim = getattr(config, "rotary_emb_dim", head_dim)
-        config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim
-        config.rotary_kwargs = {
-            "head_size": head_dim,
-            "max_position": config.max_position_embeddings,
-            "rope_parameters": config.rope_parameters,
-        }
-
-
-class Ernie4_5_VLMoeForConditionalGenerationConfig(VerifyAndUpdateConfig):
-    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        # Ernie4.5-VL conditionally executes text/vision MoE branches, so
-        # fast_moe_cold_start can silently produce incorrect execution order.
-        vllm_config.compilation_config.fast_moe_cold_start = False
-
-
-class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
-    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        structured_outputs_config = vllm_config.structured_outputs_config
-        if structured_outputs_config.reasoning_parser == "":
-            structured_outputs_config.reasoning_parser = "openai_gptoss"
-
-        # Increase the max capture size from 512 to 1024 for performance.
-        # NOTE(woosuk): This will increase the number of CUDA graphs
-        # from 67 to 83.
-        compilation_config = vllm_config.compilation_config
-        # Only override when the user has not set either of
-        # cudagraph_capture_sizes or max_cudagraph_capture_size.
-        if (
-            compilation_config.cudagraph_capture_sizes is None
-            and compilation_config.max_cudagraph_capture_size is None
-        ):
-            compilation_config.max_cudagraph_capture_size = 1024
-            logger.info(
-                "Overriding max cuda graph capture size to %d for performance.", 1024
-            )
-
-
-class MambaModelConfig(VerifyAndUpdateConfig):
-    @classmethod
-    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
-        """
-        Enable FULL_AND_PIECEWISE cuda graph mode by default (required
-        to get good performance for mamba layers in V1).
-
-        Args:
-            vllm_config: vLLM Config
-        """
-        model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
-
-        if cache_config.enable_prefix_caching:
-            if cache_config.mamba_cache_mode == "none":
-                cache_config.mamba_cache_mode = (
-                    "all" if model_config.supports_mamba_prefix_caching else "align"
-                )
-                logger.warning(
-                    "Mamba cache mode is set to '%s' for %s by default "
-                    "when prefix caching is enabled",
-                    cache_config.mamba_cache_mode,
-                    model_config.architecture,
-                )
-            if (
-                cache_config.mamba_cache_mode == "all"
-                and not model_config.supports_mamba_prefix_caching
-            ):
-                cache_config.mamba_cache_mode = "align"
-                logger.warning(
-                    "Hybrid or mamba-based model detected without support "
-                    "for prefix caching with Mamba cache 'all' mode: "
-                    "falling back to 'align' mode."
-                )
-            if cache_config.mamba_cache_mode == "align":
-                assert vllm_config.scheduler_config.enable_chunked_prefill, (
-                    "Chunked prefill is required for mamba cache mode 'align'."
-                )
-            logger.info(
-                "Warning: Prefix caching in Mamba cache '%s' "
-                "mode is currently enabled. "
-                "Its support for Mamba layers is experimental. "
-                "Please report any issues you may observe.",
-                cache_config.mamba_cache_mode,
-            )
-            # By default, mamba block size will be set to max_model_len (see
-            # below). When enabling prefix caching, we align mamba block size
-            # to the block size as the basic granularity for prefix caching.
-            if cache_config.mamba_block_size is None:
-                cache_config.mamba_block_size = cache_config.block_size
-        else:
-            if cache_config.mamba_cache_mode != "none":
-                cache_config.mamba_cache_mode = "none"
-                logger.warning(
-                    "Mamba cache mode is set to 'none' when prefix caching is disabled"
-                )
-            if cache_config.mamba_block_size is None:
-                cache_config.mamba_block_size = model_config.max_model_len
-
-
-class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
-    @classmethod
-    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
-        """
-        Ensure that page size of attention layers is greater than or
-        equal to the mamba layers. If not, automatically set the attention
-        block size to ensure that it is. If the attention page size is
-        strictly greater than the mamba page size, we pad the mamba page size
-        to make them equal.
-
-        Args:
-            vllm_config: vLLM Config
-        """
-        # Save the user input before it gets modified by MambaModelConfig
-        mamba_block_size = vllm_config.cache_config.mamba_block_size
-        # Enable FULL_AND_PIECEWISE by default
-        MambaModelConfig.verify_and_update_config(vllm_config)
-
-        attention_config = vllm_config.attention_config
-        cache_config = vllm_config.cache_config
-        model_config = vllm_config.model_config
-        parallel_config = vllm_config.parallel_config
-
-        if cache_config.cache_dtype == "auto":
-            kv_cache_dtype = model_config.dtype
-        else:
-            kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
-
-        # get attention page size (for 1 token)
-        # Attention backend constraints:
-        # - FlashAttention (FA) requires block size to be multiple of 16
-        # - MLA (Multi-head Latent Attention) requires larger alignment:
-        #   * CUTLASS_MLA backend: kernel_block_size 128 alignment
-        #   * Other MLA backends: kernel_block_size 64 alignment
-        if model_config.use_mla:
-            use_cutlass_mla = (
-                attention_config.backend == AttentionBackendEnum.CUTLASS_MLA
-            )
-            kernel_block_alignment_size = 128 if use_cutlass_mla else 64
-            attn_page_size_1_token = MLAAttentionSpec(
-                block_size=1,
-                num_kv_heads=model_config.get_num_kv_heads(parallel_config),
-                head_size=model_config.get_head_size(),
-                dtype=kv_cache_dtype,
-            ).page_size_bytes
-        else:
-            kernel_block_alignment_size = 16
-            if (
-                current_platform.is_device_capability_family(100)
-                and model_config.get_head_size() == 256
-                and (
-                    attention_config.backend is None
-                    or attention_config.backend == AttentionBackendEnum.FLASHINFER
-                )
-            ):
-                # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that`
-                # head size 256 and block size 16 is not supported on blackwell.
-                kernel_block_alignment_size = 32
-            attn_page_size_1_token = FullAttentionSpec(
-                block_size=1,
-                num_kv_heads=model_config.get_num_kv_heads(parallel_config),
-                head_size=model_config.get_head_size(),
-                dtype=kv_cache_dtype,
-            ).page_size_bytes
-
-        model_cls, _ = ModelRegistry.resolve_model_cls(
-            model_config.architecture,
-            model_config=model_config,
-        )
-
-        # get mamba page size
-        mamba_page_size = MambaSpec(
-            shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),
-            dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),
-            block_size=-1,  # block_size doesn't matter for mamba page size
-        ).page_size_bytes
-
-        # Model may be marked as is_hybrid
-        #  but mamba is skipped via config,
-        #  return directly
-        if mamba_page_size == 0:
-            return
-
-        if cache_config.mamba_cache_mode == "all":
-            # With prefix caching, select attention block size to
-            # optimize for mamba kernel performance
-
-            # Mamba2 SSD kernel uses a chunk_size, e.g. 256
-            # Align the block to the kernel: use lowest multiple of chunk_size
-            # of attention tokens that would fit mamba_page_size:
-            # e.g. for mamba page size = 788kB
-            #          attn_1_token = 2kB -> fits ~394 tokens
-            #      then round up to a multiple of 256 -> 512 tokens
-            # End result:
-            #  attn_block_size = 512
-            #  mamba_block_size = 512 (aligned to a multiple of chunk_size)
-            # TODO(tdoublep): this constraint can be relaxed fairly
-            # easily by changing the way we layout chunks in the
-            # mamba2 kernels.
-
-            base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size()
-            attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)
-            chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
-            attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
-            cache_config.mamba_block_size = attn_block_size
-        else:
-            # Without prefix caching, select minimum valid attention block size
-            # to minimize mamba state padding
-
-            # Calculate minimum attention block size that satisfies both:
-            # 1. Backend alignment requirements (kernel_block_alignment_size)
-            # 2. Mamba page size compatibility (attn_page_size >= mamba_page_size)
-            attn_block_size = kernel_block_alignment_size * cdiv(
-                mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
-            )
-
-        # override attention block size if either (a) the
-        # user has not set it or (b) the user has set it
-        # too small.
-        if cache_config.block_size is None or cache_config.block_size < attn_block_size:
-            cache_config.block_size = attn_block_size
-            logger.info(
-                "Setting attention block size to %d tokens "
-                "to ensure that attention page size is >= mamba page size.",
-                attn_block_size,
-            )
-
-        # By default, mamba block size will be set to max_model_len.
-        # When enabling prefix caching and using align mamba cache
-        # mode, we align mamba block size to the block size as the
-        # basic granularity for prefix caching.
-        if cache_config.mamba_cache_mode == "align":
-            cache_config.mamba_block_size = cache_config.block_size
-
-        # compute new attention page size
-        attn_page_size = cache_config.block_size * attn_page_size_1_token
-
-        assert attn_page_size >= mamba_page_size
-
-        if attn_page_size == mamba_page_size:
-            # don't need to pad mamba page size
-            return
-
-        # pad mamba page size to exactly match attention
-        if (
-            cache_config.mamba_page_size_padded is None
-            or cache_config.mamba_page_size_padded != attn_page_size
-        ):
-            cache_config.mamba_page_size_padded = attn_page_size
-            mamba_padding_pct = (
-                100 * (attn_page_size - mamba_page_size) / mamba_page_size
-            )
-            logger.info(
-                "Padding mamba page size by %.2f%% to ensure "
-                "that mamba page size and attention page size are "
-                "exactly equal.",
-                mamba_padding_pct,
-            )
-
-
-class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
-    @classmethod
-    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
-        """
-        Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
-        """
-        hf_config = vllm_config.model_config.hf_config
-
-        # Mirror the check in vllm/model_executor/models/deepseek_v2.py
-        is_v32 = hasattr(hf_config, "index_topk")
-        assert is_v32
-
-        # For DeepSeekV3.2, a custom fp8 format is used when fp8 kv-cache is enabled.
-        cache_config = vllm_config.cache_config
-        if cache_config.cache_dtype.startswith("fp8"):
-            cache_config.cache_dtype = "fp8_ds_mla"
-            logger.info("Using custom fp8 kv-cache format for DeepSeekV3.2")
-        if cache_config.cache_dtype == "bfloat16":
-            cache_config.cache_dtype = "auto"
-            logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
-
-
-class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
-    @staticmethod
-    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        """Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
-        (or not explicitly set), to the value specified in the HF config, or to
-        float16 if not specified.
-        """
-        cache_config = vllm_config.cache_config
-        if cache_config.mamba_ssm_cache_dtype == "auto":
-            hf_config = vllm_config.model_config.hf_config
-            mamba_ssm_cache_dtype = getattr(
-                hf_config, "mamba_ssm_cache_dtype", "float16"
-            )
-            logger.info(
-                "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
-                mamba_ssm_cache_dtype,
-            )
-            cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype
-
-
-class NemotronHNanoVLV2Config(VerifyAndUpdateConfig):
-    @staticmethod
-    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
-        mm_config = model_config.multimodal_config
-        if mm_config is not None:
-            video_kwargs = mm_config.media_io_kwargs.setdefault("video", {})
-            video_kwargs.setdefault("video_backend", "nemotron_vl")
-
-
 class Qwen3_5ForConditionalGenerationConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
@@ -658,6 +638,26 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
             )
 
 
+class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        config = model_config.hf_config
+
+        assert config.__class__.__name__ == "GteConfig"
+        assert config.hidden_act == "gelu"
+
+        config.hidden_act = "geglu"
+
+        head_dim = config.hidden_size // config.num_attention_heads
+        rotary_dim = getattr(config, "rotary_emb_dim", head_dim)
+        config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim
+        config.rotary_kwargs = {
+            "head_size": head_dim,
+            "max_position": config.max_position_embeddings,
+            "rope_parameters": config.rope_parameters,
+        }
+
+
 class VoyageQwen3BidirectionalEmbedModelConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_model_config(model_config: "ModelConfig") -> None:
@@ -666,33 +666,33 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
 
 
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
+    "ColBERTJinaRobertaModel": JinaRobertaModelConfig,
+    "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
+    "Ernie4_5_VLMoeForConditionalGeneration": Ernie4_5_VLMoeForConditionalGenerationConfig,  # noqa: E501
+    "FalconMambaForCausalLM": MambaModelConfig,
+    "Gemma3TextModel": Gemma3TextModelConfig,
+    "GptOssForCausalLM": GptOssForCausalLMConfig,
     "GteModel": SnowflakeGteNewModelConfig,
-    "GteNewModel": GteNewModelConfig,
     "GteNewForSequenceClassification": GteNewModelConfig,
-    "Gemma3TextModel": Gemma3TextModelConfig,
-    "NemotronH_Nano_VL_V2": NemotronHNanoVLV2Config,
+    "GteNewModel": GteNewModelConfig,
+    "JambaForSequenceClassification": JambaForSequenceClassificationConfig,
+    "JinaVLForRanking": JinaVLForSequenceClassificationConfig,
     "LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig,
     "LlamaBidirectionalModel": LlamaBidirectionalConfig,
-    "LlamaNemotronVLModel": LlamaNemotronVLConfig,
     "LlamaNemotronVLForSequenceClassification": LlamaNemotronVLConfig,
+    "LlamaNemotronVLModel": LlamaNemotronVLConfig,
+    "Mamba2ForCausalLM": MambaModelConfig,
+    "MambaForCausalLM": MambaModelConfig,
+    "NemotronHForCausalLM": NemotronHForCausalLMConfig,
+    "NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig,
+    "NemotronH_Nano_VL_V2": NemotronHNanoVLV2Config,
     "NomicBertModel": NomicBertModelConfig,
     "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
     "Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
     "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
     "Qwen3VLForSequenceClassification": Qwen3VLForSequenceClassificationConfig,
-    "Ernie4_5_VLMoeForConditionalGeneration": Ernie4_5_VLMoeForConditionalGenerationConfig,  # noqa: E501
-    "XLMRobertaModel": JinaRobertaModelConfig,
-    "ColBERTJinaRobertaModel": JinaRobertaModelConfig,
-    "JinaVLForRanking": JinaVLForSequenceClassificationConfig,
-    "JambaForSequenceClassification": JambaForSequenceClassificationConfig,
-    "GptOssForCausalLM": GptOssForCausalLMConfig,
-    "MambaForCausalLM": MambaModelConfig,
-    "Mamba2ForCausalLM": MambaModelConfig,
-    "FalconMambaForCausalLM": MambaModelConfig,
-    "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
-    "NemotronHForCausalLM": NemotronHForCausalLMConfig,
-    "NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig,
     "Qwen3_5ForConditionalGeneration": Qwen3_5ForConditionalGenerationConfig,
     "Qwen3_5MoeForConditionalGeneration": Qwen3_5ForConditionalGenerationConfig,
     "VoyageQwen3BidirectionalEmbedModel": VoyageQwen3BidirectionalEmbedModelConfig,
+    "XLMRobertaModel": JinaRobertaModelConfig,
 }