Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
c488653
fully working mixed requests version
shaharmor98 Dec 21, 2025
d7ee37c
add working real checkpoint code
shaharmor98 Dec 21, 2025
02c1b02
CUDA graphs work
roikoren755 Nov 26, 2025
c84fec2
Prefix caching works
roikoren755 Nov 27, 2025
8250af1
Fix triton kernel to support varlen, and update call site
roikoren755 Nov 27, 2025
a807460
Full CUDA graphs
roikoren755 Nov 30, 2025
c62c4ff
Prefix caching + refactored code
shaharmor98 Dec 21, 2025
6c08722
added updated load weights
shaharmor98 Dec 21, 2025
3d6a23b
Fix CUDA graphs compat
roikoren755 Dec 9, 2025
4db7e20
Working speculative code with multiple MTP layers
shaharmor98 Dec 21, 2025
d27da73
running mtp for num_speculative > 2
shaharmor98 Dec 21, 2025
2c4f1c7
remove redundant ids handling, add eagle3 support flag
shaharmor98 Dec 21, 2025
5aabfd5
temporarily disable update_block_table
shaharmor98 Dec 21, 2025
f3918a1
remove redundant code, support cuda graph for target excluding drafter
shaharmor98 Dec 23, 2025
c35cf11
move faulty assertion
shaharmor98 Dec 23, 2025
965a716
remove full cg support for mamba
shaharmor98 Dec 30, 2025
43728d8
commenting eagle changes
shaharmor98 Dec 30, 2025
a72852c
fix block size used in EAGLE slot mapping
benchislett Dec 30, 2025
d0f85ad
remove eagle multi layer support
shaharmor98 Jan 1, 2026
4e3ff41
remove multi layer spec step idx from eagle
shaharmor98 Jan 1, 2026
fe2ff13
update code to a single MTP layer, remove speculative enforce eager, …
shaharmor98 Jan 1, 2026
fd28da9
change mtp layer count
shaharmor98 Jan 4, 2026
45fbd1b
final cleanup
shaharmor98 Jan 8, 2026
3b211ea
polished implementation
benchislett Jan 30, 2026
5a6131d
tweaks for perf
benchislett Jan 30, 2026
f213300
tweak
benchislett Jan 30, 2026
f461834
tweaks for non-spec and spec compat
benchislett Jan 31, 2026
6a27a0f
Merge branch 'main' into bchislett/nemotron-h-mtp-old-rebased
benchislett Jan 31, 2026
f56420e
simple patches for rebase
benchislett Jan 31, 2026
cc9b29f
patch
benchislett Feb 2, 2026
896e01f
remove unused files
benchislett Feb 2, 2026
b96184d
update mamba backend with specdec support
benchislett Feb 3, 2026
6d59c03
refactor mamba attn cudagraph logic
benchislett Feb 3, 2026
8819eb7
Merge branch 'main' into bchislett/mamba-nemotron-mtp
benchislett Feb 3, 2026
b75e7ac
cleanup
benchislett Feb 3, 2026
bca337b
cleanup
benchislett Feb 3, 2026
7da121d
revert unneeded refactor
benchislett Feb 3, 2026
5db7692
rename prefill_state_indices_tensor and decode_state_indices_tensor
benchislett Feb 6, 2026
b1e927e
rewrite todo comment for known issue
benchislett Feb 6, 2026
03ef64d
change max_query_len overwrite to assert
benchislett Feb 6, 2026
8b7c03f
update cuda graph padding boundaries
benchislett Feb 6, 2026
c67fccb
avoid slicing state indices tensor for 'all' prefix cache mode
benchislett Feb 6, 2026
0739591
remove validation mode leftover from debugging
benchislett Feb 6, 2026
0a5f1a3
revert no-longer-needed diff in layer.py
benchislett Feb 6, 2026
063f54b
update layer.py from main to fix conflict
benchislett Feb 6, 2026
d66babd
Merge branch 'main' into bchislett/mamba-nemotron-mtp
benchislett Feb 6, 2026
f7fef88
adjust other mamba-base backends to the new state_indices_tensor repr…
shaharmor98 Feb 8, 2026
e71fb7a
merge main
shaharmor98 Feb 11, 2026
b642f2e
Merge remote-tracking branch 'origin/main' into bchislett/mamba-nemot…
shaharmor98 Feb 15, 2026
a02ac98
[Bugfix] Fix assertion error in _dummy_run for MTP speculative decoding
LucasWilkinson Feb 12, 2026
ce598de
Merge remote-tracking branch 'origin/main' into bchislett/mamba-nemot…
shaharmor98 Feb 16, 2026
79b6e40
fix wrong max num tokens init in GPUModelRunner
shaharmor98 Feb 16, 2026
4a9274f
revert assertion relaxation
shaharmor98 Feb 16, 2026
77d5cc1
fix cr comments
shaharmor98 Feb 17, 2026
dc29214
Merge commit 'dc5fa77a4eb6680339cb77abe713fb22d7795560' into bchislet…
benchislett Feb 19, 2026
6b50bd0
fix comment
benchislett Feb 19, 2026
1c45407
simplify prefix caching slicing
benchislett Feb 19, 2026
05efc54
adjust decode threshold based on specdec existence in the request
shaharmor98 Feb 19, 2026
10cb13e
fix frivolous assert
benchislett Feb 20, 2026
516c7b5
Merge branch 'main' into bchislett/mamba-nemotron-mtp
benchislett Feb 20, 2026
4797d1d
Merge branch 'main' into bchislett/mamba-nemotron-mtp
benchislett Feb 23, 2026
f77866f
use new vllmconfig getter for num_spec
benchislett Feb 23, 2026
810caf1
use new vllmconfig getter for num_spec
benchislett Feb 23, 2026
d511f19
remove no-longer-necessary bug workaround
benchislett Feb 23, 2026
623c1f7
add placeholder model for NemotronH MTP
benchislett Feb 23, 2026
1a688ba
fix prefix caching for non-MTP case
benchislett Feb 23, 2026
685fe39
Merge branch 'main' into bchislett/mamba-nemotron-mtp
benchislett Feb 23, 2026
1a28f23
Merge branch 'main' into bchislett/mamba-nemotron-mtp
benchislett Feb 24, 2026
cb56163
update mamba block table test config
benchislett Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions tests/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1200,6 +1200,11 @@ def check_available_online(
},
is_available_online=False,
),
"NemotronHMTPModel": _HfExamplesInfo(
"nvidia/Nemotron-Super-Placeholder",
speculative_model="nvidia/Nemotron-Super-Placeholder",
is_available_online=False,
),
}

_TRANSFORMERS_BACKEND_MODELS = {
Expand Down
8 changes: 7 additions & 1 deletion tests/v1/attention/test_mamba_update_block_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def _make_vllm_config(block_size, max_model_len, max_num_seqs):
cudagraph_mode=CUDAGraphMode.FULL,
max_cudagraph_capture_size=None,
),
speculative_config=None,
num_speculative_tokens=0,
parallel_config=SimpleNamespace(decode_context_parallel_size=1),
scheduler_config=SimpleNamespace(max_num_seqs=max_num_seqs),
model_config=SimpleNamespace(max_model_len=max_model_len),
)
Expand Down Expand Up @@ -92,7 +95,10 @@ def test_update_block_table_copies_block_idx_to_persistent_buffers():
has_initial_states_p=None,
query_start_loc_p=None,
num_computed_tokens_p=None,
state_indices_tensor=builder_a.state_indices_tensor[:num_reqs],
state_indices_tensor_p=None,
query_start_loc_d=None,
num_accepted_tokens=None,
state_indices_tensor_d=builder_a.state_indices_tensor_d[:num_reqs],
block_idx_last_scheduled_token=(
builder_a.block_idx_last_scheduled_token[:num_reqs]
),
Expand Down
19 changes: 17 additions & 2 deletions vllm/config/speculative.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"glm4_moe_lite_mtp",
"glm_ocr_mtp",
"ernie_mtp",
"nemotron_h_mtp",
"exaone_moe_mtp",
"qwen3_next_mtp",
"qwen3_5_mtp",
Expand Down Expand Up @@ -255,6 +256,19 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
{"n_predict": n_predict, "architectures": ["ErnieMTPModel"]}
)

if (
hf_config.model_type == "nemotron_h"
and hasattr(hf_config, "num_nextn_predict_layers")
and hf_config.num_nextn_predict_layers > 0
):
# Check if this is an MTP variant
hf_config.model_type = "nemotron_h_mtp"
if hf_config.model_type == "nemotron_h_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
hf_config.update(
{"n_predict": n_predict, "architectures": ["NemotronHMTPModel"]}
)

if hf_config.model_type == "qwen3_next":
hf_config.model_type = "qwen3_next_mtp"
if hf_config.model_type == "qwen3_next_mtp":
Expand Down Expand Up @@ -325,7 +339,7 @@ def __post_init__(self):
if self.target_model_config is None:
raise ValueError("target_model_config must be present for mtp")
if self.target_model_config.hf_text_config.model_type == "deepseek_v32":
# FIXME(luccafong): cudgraph with v32 MTP is not supported,
# FIXME(luccafong): cudagraph with v32 MTP is not supported,
# remove this when the issue is fixed.
self.enforce_eager = True
# use the draft model from the same model:
Expand Down Expand Up @@ -427,7 +441,7 @@ def __post_init__(self):
self.method = "mtp"
if self.num_speculative_tokens > 1:
logger.warning(
"Enabling num_speculative_tokens > 1 will run"
"Enabling num_speculative_tokens > 1 will run "
"multiple times of forward on same MTP layer"
",which may result in lower acceptance rate"
)
Expand Down Expand Up @@ -712,6 +726,7 @@ def _verify_args(self) -> Self:
"hunyuan_vl",
"hunyuan_v1_dense",
"afmoe",
"nemotron_h",
]
if (
self.method == "eagle3"
Expand Down
9 changes: 9 additions & 0 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,15 @@ def compute_hash(self) -> str:
]
return hash_str

@property
def num_speculative_tokens(self) -> int:
    """Number of speculative tokens configured, or 0 when speculative
    decoding is disabled (no speculative_config, or no token count set)."""
    spec_config = self.speculative_config
    if spec_config is None or spec_config.num_speculative_tokens is None:
        return 0
    return spec_config.num_speculative_tokens

@property
def needs_dp_coordinator(self) -> bool:
"""
Expand Down
8 changes: 0 additions & 8 deletions vllm/model_executor/layers/mamba/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,6 @@ def get_state_dtype(self) -> tuple[torch.dtype, ...]:
pass

def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None:
if (
vllm_config.speculative_config is not None
and vllm_config.model_config.hf_config.model_type
not in ["qwen3_next", "qwen3_5", "qwen3_5_moe"]
):
raise NotImplementedError(
"Mamba with speculative decoding is not supported yet."
)
mamba_block_size = vllm_config.cache_config.mamba_block_size
page_size_padded = vllm_config.cache_config.mamba_page_size_padded
return MambaSpec(
Expand Down
20 changes: 2 additions & 18 deletions vllm/model_executor/layers/mamba/mamba_mixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,8 @@ def forward_impl(self, hidden_states: torch.Tensor, output: torch.Tensor):
attn_metadata = attn_metadata[self.prefix]
assert isinstance(attn_metadata, Mamba1AttentionMetadata)
query_start_loc_p = attn_metadata.query_start_loc_p
state_indices_tensor = attn_metadata.state_indices_tensor
state_indices_tensor_p = attn_metadata.state_indices_tensor_p
state_indices_tensor_d = attn_metadata.state_indices_tensor_d
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
conv_state = self_kv_cache[0].transpose(-1, -2)
ssm_state = self_kv_cache[1]
Expand Down Expand Up @@ -295,17 +296,13 @@ def forward_impl(self, hidden_states: torch.Tensor, output: torch.Tensor):
prefill_decode_split = split_batch_to_prefill_and_decode(
hidden_states_BC,
gate,
state_indices_tensor,
num_prefill_tokens,
num_prefills,
num_decode_tokens,
)
hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p
hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d
gate_p = prefill_decode_split.gate_p
gate_d = prefill_decode_split.gate_d
state_indices_tensor_p = prefill_decode_split.state_indices_tensor_p
state_indices_tensor_d = prefill_decode_split.state_indices_tensor_d

if is_mamba_cache_all:
block_idx_last_computed_token_d, block_idx_last_computed_token_p = (
Expand Down Expand Up @@ -477,16 +474,12 @@ class PrefillDecodeSplit(NamedTuple):
hidden_states_BC_d: torch.Tensor
gate_p: torch.Tensor
gate_d: torch.Tensor
state_indices_tensor_p: torch.Tensor
state_indices_tensor_d: torch.Tensor


def split_batch_to_prefill_and_decode(
hidden_states_BC: torch.Tensor,
gate: torch.Tensor,
state_indices_tensor: torch.Tensor,
num_prefill_tokens: int,
num_prefills: int,
num_decode_tokens: int,
) -> PrefillDecodeSplit:
num_actual_tokens = num_prefill_tokens + num_decode_tokens
Expand All @@ -501,20 +494,11 @@ def split_batch_to_prefill_and_decode(
gate[..., :num_actual_tokens], [num_decode_tokens, num_prefill_tokens], dim=-1
)

# num_decode_tokens accounts for CUDA graph padding when applicable
state_indices_tensor_d, state_indices_tensor_p = torch.split(
state_indices_tensor[: num_decode_tokens + num_prefills],
[num_decode_tokens, num_prefills],
dim=0,
)

return PrefillDecodeSplit(
hidden_states_BC_p=hidden_states_BC_p,
hidden_states_BC_d=hidden_states_BC_d,
gate_p=gate_p,
gate_d=gate_d,
state_indices_tensor_p=state_indices_tensor_p,
state_indices_tensor_d=state_indices_tensor_d,
)


Expand Down
46 changes: 27 additions & 19 deletions vllm/model_executor/layers/mamba/mamba_mixer2.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,8 @@ def __init__(
dim=-1,
)

compilation_config = get_current_vllm_config().compilation_config
vllm_config = get_current_vllm_config()
compilation_config = vllm_config.compilation_config
if prefix in compilation_config.static_forward_context:
raise ValueError(f"Duplicate layer name: {prefix}")
compilation_config.static_forward_context[prefix] = self
Expand All @@ -488,6 +489,8 @@ def __init__(
self.cache_config = cache_config
self.prefix = prefix

self.num_spec = vllm_config.num_speculative_tokens

# Pre-compute sizes for forward pass
self.tped_intermediate_size = self.intermediate_size // self.tp_size
self.tped_conv_size = self.conv_dim // self.tp_size
Expand Down Expand Up @@ -576,14 +579,19 @@ def conv_ssm_forward(
# conv_state = (..., dim, width-1) yet contiguous along 'dim'
conv_state = self_kv_cache[0].transpose(-1, -2)
ssm_state = self_kv_cache[1]
state_indices_tensor = attn_metadata.state_indices_tensor
has_initial_states_p = attn_metadata.has_initial_states_p
prep_initial_states = attn_metadata.prep_initial_states
chunk_size = attn_metadata.chunk_size
seq_idx_p = attn_metadata.seq_idx_p
query_start_loc_p = attn_metadata.query_start_loc_p
cu_chunk_seqlen_p = attn_metadata.cu_chunk_seqlen_p
last_chunk_indices_p = attn_metadata.last_chunk_indices_p
state_indices_tensor_p = attn_metadata.state_indices_tensor_p
state_indices_tensor_d = attn_metadata.state_indices_tensor_d
num_accepted_tokens = attn_metadata.num_accepted_tokens
query_start_loc_d = attn_metadata.query_start_loc_d
num_decodes = attn_metadata.num_decodes
num_decode_tokens = attn_metadata.num_decode_tokens

if attn_metadata is None:
# profile run
Expand All @@ -593,29 +601,21 @@ def conv_ssm_forward(
hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C)
return hidden_states

num_prefills = attn_metadata.num_prefills # request count
num_decodes = attn_metadata.num_decode_tokens # token count (=request)
num_prefill_tokens = attn_metadata.num_prefill_tokens # token count
num_prefills = attn_metadata.num_prefills
num_prefill_tokens = attn_metadata.num_prefill_tokens
has_prefill = num_prefills > 0
has_decode = num_decodes > 0
num_actual_tokens = num_prefill_tokens + num_decodes
num_actual_tokens = num_prefill_tokens + num_decode_tokens

# Separate prefill and decode by splitting varlen input
# Split along token dimension
hidden_states_B_C_d, hidden_states_B_C_p = torch.split(
hidden_states_B_C[:num_actual_tokens],
[num_decodes, num_prefill_tokens],
[num_decode_tokens, num_prefill_tokens],
dim=0,
)
dt_d, dt_p = torch.split(
dt[:num_actual_tokens],
[num_decodes, num_prefill_tokens],
dim=0,
)
# Split along batch dimension
state_indices_tensor_d, state_indices_tensor_p = torch.split(
state_indices_tensor[:num_actual_tokens],
[num_decodes, num_prefills],
[num_decode_tokens, num_prefill_tokens],
dim=0,
)

Expand All @@ -642,16 +642,16 @@ def conv_ssm_forward(
)
num_computed_tokens_p = attn_metadata.num_computed_tokens_p
else:
block_idx_last_computed_token_d = None
block_idx_last_computed_token_p = None
block_idx_last_scheduled_token_d = None
block_idx_last_scheduled_token_p = None
block_idx_first_scheduled_token_p = None
block_idx_last_scheduled_token_d = None
block_idx_last_computed_token_d = None
num_computed_tokens_p = None

preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split(
output[:num_actual_tokens],
[num_decodes, num_prefill_tokens],
[num_decode_tokens, num_prefill_tokens],
dim=0,
)

Expand Down Expand Up @@ -709,6 +709,7 @@ def conv_ssm_forward(
)

# NOTE: final output is an in-place update of out tensor
assert preallocated_ssm_out_p is not None
varlen_states = mamba_chunk_scan_combined_varlen(
hidden_states_p.view(
num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim
Expand Down Expand Up @@ -840,6 +841,9 @@ def conv_ssm_forward(
conv_state_indices=state_indices_tensor_d,
block_idx_last_scheduled_token=block_idx_last_scheduled_token_d,
initial_state_idx=block_idx_last_computed_token_d,
num_accepted_tokens=num_accepted_tokens,
query_start_loc=query_start_loc_d,
max_query_len=state_indices_tensor_d.size(-1),
)

hidden_states_d, B_d, C_d = self.split_hidden_states_B_C_fn(
Expand All @@ -862,6 +866,7 @@ def conv_ssm_forward(
-1, self.num_heads // self.tp_size, self.head_dim
)

assert preallocated_ssm_out_d is not None
# - the hidden is reshaped into (bs, num_heads, head_dim)
# - mamba_cache_params.ssm_state's slots will be selected
# using state_indices_tensor_d
Expand All @@ -879,7 +884,9 @@ def conv_ssm_forward(
dt_softplus=True,
state_batch_indices=state_indices_tensor_d_input,
dst_state_batch_indices=state_indices_tensor_d_output,
out=preallocated_ssm_out_d.view(num_decodes, -1, self.head_dim),
out=preallocated_ssm_out_d.view(num_decode_tokens, -1, self.head_dim),
num_accepted_tokens=num_accepted_tokens,
cu_seqlens=query_start_loc_d,
is_blackwell=self.is_blackwell,
)

Expand All @@ -901,6 +908,7 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]:
head_dim=self.head_dim,
state_size=self.ssm_state_size,
conv_kernel=self.conv_kernel_size,
num_spec=self.num_spec,
)

@property
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/mamba/mamba_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def mamba2_state_shape(
head_dim: int,
state_size: int,
conv_kernel: int,
num_spec: int = 0,
) -> tuple[tuple[int, int], tuple[int, int, int]]:
# if n_groups is not divisible by world_size, need to extend the shards
# to ensure all groups needed by a head is sharded along with it
Expand All @@ -141,7 +142,7 @@ def mamba2_state_shape(
conv_dim = intermediate_size + 2 * n_groups * state_size

# contiguous along 'dim' axis
conv_state_shape = (conv_kernel - 1, divide(conv_dim, tp_world_size))
conv_state_shape = (conv_kernel - 1 + num_spec, divide(conv_dim, tp_world_size))
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shaharmor98 Could you explain the motivation for this change? I'm not sure of its purpose

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I can recall, in GDN’s original PR they modified the causal_conv1d_update so that the underlying Triton kernel caches the state for every speculated token, and then during verification restores the correct cache according to the accepted token count.
One of the things they had to change, and so did we, was to account for the num_spec tokens as part of the conv_state shape declaration.
I made this change quite a long time ago, but IIRC it caused the outputs to be garbage when that value wasn’t added.
Reference: https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/mamba_utils.py#L185-L188


# These are not TP-ed as they depend on A, dt_bias, D
# - they are typically small
Expand Down
4 changes: 3 additions & 1 deletion vllm/model_executor/layers/mamba/ops/causal_conv1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -1155,7 +1155,9 @@ def causal_conv1d_update(
if conv_state_indices is None:
assert conv_state.size(0) >= batch
else:
assert (batch,) == conv_state_indices.shape
assert batch == conv_state_indices.shape[0], (
f"ERROR: conv_state_indices should have shape ({batch},*) but got {conv_state_indices.shape}"
)

assert num_cache_lines >= batch
assert weight.stride(1) == 1 # Need this
Expand Down
10 changes: 2 additions & 8 deletions vllm/model_executor/layers/mamba/short_conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def forward_cuda(
assert isinstance(attn_metadata, ShortConvAttentionMetadata)
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
conv_state = self_kv_cache[0].transpose(-1, -2)
state_indices_tensor = attn_metadata.state_indices_tensor
state_indices_tensor_p = attn_metadata.state_indices_tensor_p
state_indices_tensor_d = attn_metadata.state_indices_tensor_d
has_initial_states_p = attn_metadata.has_initial_states_p
query_start_loc_p = attn_metadata.query_start_loc_p

Expand Down Expand Up @@ -163,13 +164,6 @@ def forward_cuda(
[num_decodes, num_prefill_tokens],
dim=0,
)
# Split along batch dimension
state_indices_tensor_d, state_indices_tensor_p = torch.split(
state_indices_tensor,
[num_decodes, num_prefills],
dim=0,
)

conv_output_list = []

if has_prefill:
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/mamba2.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ def get_mamba_state_shape_from_config(
head_dim=hf_config.head_dim,
state_size=hf_config.state_size,
conv_kernel=hf_config.conv_kernel,
num_spec=vllm_config.num_speculative_tokens,
)

@classmethod
Expand Down
Loading