# limitations under the License.

import re
-from typing import Optional
+from typing import Dict, Optional

import torch
from torch import nn
-from torch.nn import functional as F
from transformers import AutoConfig, PretrainedConfig

from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
    BaseWeightMapper
from tensorrt_llm._torch.modules.mamba.mamba2_metadata import Mamba2Metadata
+from tensorrt_llm._torch.utils import ActivationType, relu2

from ..attention_backend import AttentionMetadata
from ..model_config import ModelConfig
from ..modules.attention import Attention
from ..modules.decoder_layer import DecoderLayer
from ..modules.embedding import Embedding
+from ..modules.fused_moe import MoEWeightLoadingMode, create_moe
+from ..modules.linear import Linear
from ..modules.mamba.mamba2_mixer import Mamba2Mixer
from ..modules.mlp import MLP
+from ..modules.multi_stream_utils import maybe_execute_in_parallel
from ..modules.rms_norm import RMSNorm
+from ..utils import AuxStreamType, EventType
from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
                             register_auto_model)


-def split(x: torch.Tensor,
-          tp_size: int,
-          idx: int,
-          dim: int = 0) -> torch.Tensor:
-    assert x.shape[dim] % tp_size == 0
-    split_size = x.shape[dim] // tp_size
-    if tp_size == 1:
-        return x
-    return torch.split(x, split_size, dim=dim)[idx]
-
-
-def relu2(x: torch.Tensor) -> torch.Tensor:
-    return torch.square(F.relu(x))
-
-
class NemotronHConfig(PretrainedConfig):
    model_type = "nemotron_h"

@@ -120,6 +109,163 @@ def forward(
            attn_metadata=attn_metadata)


+class NemotronHMOE(nn.Module):
+
+    def __init__(
+        self,
+        model_config: ModelConfig[PretrainedConfig],
+        layer_idx: int,
+        aux_stream_dict: Dict[AuxStreamType, torch.cuda.Stream],
+    ):
+        super().__init__()
+
+        # Import here to avoid circular dependency.
+        from .modeling_deepseekv3 import DeepseekV3Gate
+
+        self.activation_type = ActivationType.Relu2
+        self.reduce_results = True
+
+        config = model_config.pretrained_config
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size
+        self.layer_idx = layer_idx
+        self.moe_intermediate_size = config.moe_intermediate_size[0] \
+            if isinstance(config.moe_intermediate_size, list) \
+            else config.moe_intermediate_size
+        self.use_latent_moe: bool = getattr(config, "moe_latent_size",
+                                            None) is not None
+        self.moe_hidden_size: int = config.moe_latent_size \
+            if self.use_latent_moe else config.hidden_size
+        self.mlp_bias = config.mlp_bias if hasattr(config,
+                                                   'mlp_bias') else False
+        self.moe_n_group = config.n_group
+        self.num_experts = config.n_routed_experts
+        self.hidden_size = config.hidden_size
+        self.num_shared_experts = config.n_shared_experts
+        self.top_k = config.num_experts_per_tok
+        self.enable_attention_dp = model_config.mapping.enable_attention_dp
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+        # Setup shared expert MLP.
+        if config.n_shared_experts is None or config.n_shared_experts == 0:
+            self.shared_experts = None
+        else:
+            shared_expert_intermediate_size = (
+                config.moe_shared_expert_intermediate_size *
+                config.n_shared_experts)
+            self.shared_experts = MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=shared_expert_intermediate_size,
+                bias=self.mlp_bias,
+                activation=relu2,
+                dtype=config.torch_dtype,
+                config=model_config,
+                layer_idx=self.layer_idx,
+            )
+        # Setup MoE gate.
+        self.gate = DeepseekV3Gate(
+            self.hidden_size,
+            self.num_experts,
+            top_k=self.top_k,
+            n_group=self.moe_n_group,
+            topk_group=config.topk_group,
+            routed_scaling_factor=self.routed_scaling_factor,
+            dtype=config.torch_dtype,
+            fuse_routing_kernel=True,
+            apply_routing=False,
+            moe_backend=model_config.moe_backend)
+
+        # Setup MoE experts.
+        self.experts = create_moe(
+            routing_method=self.gate.routing_method,
+            num_experts=self.num_experts,
+            hidden_size=self.moe_hidden_size,
+            intermediate_size=self.moe_intermediate_size,
+            aux_stream_dict=aux_stream_dict,
+            dtype=config.torch_dtype,
+            reduce_results=self.reduce_results,
+            model_config=model_config,
+            layer_idx=self.layer_idx,
+            weight_loading_mode=MoEWeightLoadingMode.VANILLA,
+            bias=self.mlp_bias,
+            activation_type=self.activation_type,
+            # Default values
+            override_quant_config=None,
+            apply_router_weight_on_input=False,
+            swiglu_alpha=None,
+            swiglu_beta=None,
+            swiglu_limit=None,
+        )
+
+        # Setup latent projection layers.
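+        # When moe_latent_size is set, tokens are projected down into the
+        # smaller latent space before the routed experts and projected back
+        # to hidden_size afterwards (see forward()).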
+        if self.use_latent_moe:
+            self.fc1_latent_proj = Linear(
+                in_features=self.hidden_size,
+                out_features=self.moe_hidden_size,
+                bias=self.mlp_bias,
+                dtype=config.torch_dtype,
+            )
+            self.fc2_latent_proj = Linear(
+                in_features=self.moe_hidden_size,
+                out_features=self.hidden_size,
+                bias=self.mlp_bias,
+                dtype=config.torch_dtype,
+            )
+        else:
+            self.fc1_latent_proj = None
+            self.fc2_latent_proj = None
+
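+        # Stream and events used to overlap the shared-expert MLP with the
+        # routed experts in forward().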
+        self.aux_stream_shared = aux_stream_dict[AuxStreamType.MoeShared]
+        self.event_dict = {
+            key: torch.cuda.Event()
+            for key in [EventType.Main, EventType.MoeShared]
+        }
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        **kwargs,
+    ) -> torch.Tensor:
+        assert hidden_states.shape[-1] == self.hidden_dim
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_dim)
+
+        def _compute_shared_output():
+            if self.shared_experts is not None:
+                shared_expert_output = self.shared_experts(hidden_states)
+            else:
+                shared_expert_output = 0
+            return shared_expert_output
+
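+        # Routed path: gate -> optional latent down-projection -> routed
+        # experts -> optional projection back to hidden_size.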
+        def _compute_routed_output():
+            router_logits = self.gate(hidden_states)
+
+            routed_hidden_states = hidden_states
+            if self.use_latent_moe:
+                routed_hidden_states = self.fc1_latent_proj(
+                    routed_hidden_states)
+
+            all_rank_num_tokens = attn_metadata.all_rank_num_tokens
+            final_hidden_states = self.experts(
+                routed_hidden_states,
+                router_logits,
+                all_rank_num_tokens=all_rank_num_tokens,
+                use_dp_padding=False)
+
+            if self.use_latent_moe:
+                final_hidden_states = self.fc2_latent_proj(final_hidden_states)
+
+            return final_hidden_states
+
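+        # Run the routed and shared expert paths on separate CUDA streams
+        # when possible, synchronizing via the recorded events.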
+        routed_output, shared_output = maybe_execute_in_parallel(
+            _compute_routed_output, _compute_shared_output,
+            self.event_dict[EventType.Main],
+            self.event_dict[EventType.MoeShared], self.aux_stream_shared)
+
+        final_hidden_states = shared_output + routed_output
+
+        return final_hidden_states.view(orig_shape)
+
+
class NemotronHLayer(DecoderLayer):

    def __init__(
@@ -130,6 +276,7 @@ def __init__(
        # - -> MLPLayer
        # * -> TransformerLayer
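+        # E -> NemotronHMOE (MoE layer)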
        layer_type: str,
+        aux_stream_dict: Dict[AuxStreamType, torch.cuda.Stream],
    ):
        super().__init__()

@@ -160,6 +307,10 @@ def __init__(
            self.mixer = MLPLayer(model_config, layer_idx)
        elif layer_type == "*":
            self.mixer = TransformerLayer(model_config, layer_idx)
+        elif layer_type == "E":
+            self.mixer = NemotronHMOE(model_config,
+                                      layer_idx=layer_idx,
+                                      aux_stream_dict=aux_stream_dict)
        else:
            raise ValueError(f"{layer_type} is not supported")

@@ -186,6 +337,18 @@ def __init__(self, model_config: ModelConfig[NemotronHConfig]):
        super().__init__(model_config)
        config = self.model_config.pretrained_config

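+        # Auxiliary CUDA streams shared across layers to overlap MoE work.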
+        aux_stream_list = [torch.cuda.Stream() for _ in range(3)]
+        self.aux_stream_dict = {
+            # TODO: add attention stream.
+            # AuxStreamType.Attention: aux_stream_list[0],
+            AuxStreamType.MoeShared: aux_stream_list[0],
+            AuxStreamType.MoeChunkingOverlap: aux_stream_list[1],
+            AuxStreamType.MoeBalancer: aux_stream_list[2],
+        }
+
        # calculate embeddings
        self.embed_tokens = Embedding(
            config.vocab_size,
@@ -196,7 +359,11 @@ def __init__(self, model_config: ModelConfig[NemotronHConfig]):
        # create layers
        layers = []
        for layer_idx, layer_type in enumerate(config.hybrid_override_pattern):
-            layers.append(NemotronHLayer(model_config, layer_idx, layer_type))
+            layers.append(
+                NemotronHLayer(model_config,
+                               layer_idx,
+                               layer_type,
+                               aux_stream_dict=self.aux_stream_dict))
        self.layers = nn.ModuleList(layers)

        # final norm
@@ -251,6 +418,15 @@ def __init__(
        self,
        model_config: ModelConfig[NemotronHConfig],
    ):
+        # rms_norm_eps might be named differently in the config.
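+        # Normalize the name so downstream modules can always read rms_norm_eps.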
+        if hasattr(model_config.pretrained_config, "rms_norm_eps"):
+            rms_epsilon = model_config.pretrained_config.rms_norm_eps
+        elif hasattr(model_config.pretrained_config, "layer_norm_epsilon"):
+            rms_epsilon = model_config.pretrained_config.layer_norm_epsilon
+        else:
+            raise ValueError("layer_norm_epsilon or rms_norm_eps is not set")
+        model_config.pretrained_config.rms_norm_eps = rms_epsilon
+
        if not model_config.mapping.tp_size in [1, 2, 4, 8]:
            raise ValueError("TP has to be either 1, 2, 4 or 8")
