axolotl-ai-cloud · NanoCode012 · Mar 30, 2026 · Mar 31, 2026 · Apr 1, 2026 · Apr 2, 2026
diff --git a/src/axolotl/integrations/kernels/constants.py b/src/axolotl/integrations/kernels/constants.py
@@ -41,6 +41,10 @@
     "glm4v_moe": "Glm4vMoeTextMoE",
     # sigmoid -> topk routing (no group selection)
     "minimax_m2": "MiniMaxM2SparseMoeBlock",
+    # Non-GLU MoE (no gate_proj, experts have up_proj + down_proj only)
+    "nemotron_h": "NemotronHMoE",
+    # Models below need custom routing (not yet implemented):
+    # "deepseek_v2": "DeepseekV2Moe",  # softmax->topk, group_limited_greedy, different attr names (num_group)
     # softmax->topk, e_score_correction_bias between softmax and topk
     "ernie4_5_moe": "Ernie4_5_MoeSparseMoeBlock",
     # softmax->topk, group_limited_greedy, different attr names (num_group)

diff --git a/src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py b/src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py
@@ -196,12 +196,14 @@ def _unwrap_experts_lora(experts_module):
     if num_experts is None:
         # Fallback: infer from parameter shape
         gup = getattr(base_experts, "gate_up_proj", None)
+        if gup is None:
+            gup = getattr(base_experts, "up_proj", None)
         if gup is not None:
             num_experts = gup.shape[0]
 
-    # Extract gate_up_proj LoRA (needs A<->B swap due to transposition)
+    # Extract gate_up_proj (or up_proj for non-GLU) LoRA
     gup_lora = None
-    gup_wrapper = wrappers.get("gate_up_proj")
+    gup_wrapper = wrappers.get("gate_up_proj") or wrappers.get("up_proj")
     if gup_wrapper is not None:
         lora_A, lora_B, scaling = get_lora_params_from_wrapper(gup_wrapper)
         if lora_A is not None:
@@ -489,6 +491,21 @@ def forward(self: nn.Module, layer_input: torch.Tensor):
         # ====================================================================
         experts, gup_lora, down_lora = _unwrap_experts_lora(self.experts)
 
+        # ====================================================================
+        # Detect GLU vs non-GLU expert architecture
+        # ====================================================================
+        # GLU models (Qwen, Mixtral, etc.): gate_up_proj [E, 2*I, H]
+        # Non-GLU models (Nemotron-H, etc.): up_proj [E, I, H]
+        has_glu = hasattr(experts, "gate_up_proj")
+        up_proj_name = "gate_up_proj" if has_glu else "up_proj"
+
+        # ====================================================================
+        # Optional latent projection before experts (e.g. Nemotron-H)
+        # ====================================================================
+        fc1_latent = getattr(self, "fc1_latent_proj", None)
+        if fc1_latent is not None:
+            hidden_states_flat = fc1_latent(hidden_states_flat)
+
         # ====================================================================
         # Selective expert weight dequantization
         # ====================================================================
@@ -498,7 +515,7 @@ def forward(self: nn.Module, layer_input: torch.Tensor):
         use_selective = (
             getattr(self, "_use_selective_dequant", False)
             and hasattr(experts, "parametrizations")
-            and "gate_up_proj" in experts.parametrizations
+            and up_proj_name in experts.parametrizations
         )
 
         if use_selective:
@@ -517,11 +534,11 @@ def forward(self: nn.Module, layer_input: torch.Tensor):
                 num_experts,
             )
             # Dequantize only active experts' weights
-            gate_up_W = selective_expert_weights(
+            up_W = selective_expert_weights(
                 experts,
-                "gate_up_proj",
+                up_proj_name,
                 active_experts,
-            ).transpose(2, 1)  # [num_active, hidden, 2*inter]
+            ).transpose(2, 1)
 
             # Remap LoRA weights to match compact expert indices
             if gup_lora is not None:
@@ -538,18 +555,18 @@ def forward(self: nn.Module, layer_input: torch.Tensor):
             sei_gup = remapped_expert_idxs
             eo_gup = compact_offsets
         else:
-            gate_up_W = experts.gate_up_proj.transpose(2, 1)  # [E, hidden, 2*inter]
+            up_W = getattr(experts, up_proj_name).transpose(2, 1)
             sei_gup = sorted_expert_idxs
             eo_gup = expert_offsets
 
         # ====================================================================
-        # Gate + Up projection
+        # Up projection (GLU: gate+up fused, non-GLU: up only)
         # ====================================================================
         if gup_lora is not None:
             gup_A, gup_B, gup_scaling = gup_lora
-            gup = parallel_linear_lora(
+            up_out = parallel_linear_lora(
                 hidden_states_flat,
-                gate_up_W,
+                up_W,
                 top_k,
                 sei_gup,
                 sorted_scattered_idxs,
@@ -563,9 +580,9 @@ def forward(self: nn.Module, layer_input: torch.Tensor):
                 use_fused_gather=True,
             )
         else:
-            gup = parallel_linear(
+            up_out = parallel_linear(
                 hidden_states_flat,
-                gate_up_W,
+                up_W,
                 top_k,
                 sei_gup,
                 sorted_scattered_idxs,
@@ -574,8 +591,18 @@ def forward(self: nn.Module, layer_input: torch.Tensor):
                 grouped_out=True,
             )
 
-        gates, h = gup.chunk(2, dim=-1)
-        h = experts.act_fn(gates) * h
+        # GLU: split into gate and up, apply act_fn(gate) * up
+        # Non-GLU: apply act_fn directly
+        if has_glu:
+            gates, h = up_out.chunk(2, dim=-1)
+            h = experts.act_fn(gates) * h
+        else:
+            h = experts.act_fn(up_out)
+
+        # Some activations (e.g. relu2) upcast to fp32 internally.
+        # Cast back to weight dtype for the down projection Triton kernel.
+        if h.dtype != experts.down_proj.dtype:
+            h = h.to(experts.down_proj.dtype)
 
         # ====================================================================
         # Down projection
@@ -635,6 +662,13 @@ def forward(self: nn.Module, layer_input: torch.Tensor):
                 gates=routing_weights,
             )
 
+        # ====================================================================
+        # Optional latent projection after experts (e.g. Nemotron-H)
+        # ====================================================================
+        fc2_latent = getattr(self, "fc2_latent_proj", None)
+        if fc2_latent is not None:
+            expert_output = fc2_latent(expert_output)
+
         # ====================================================================
         # Combine with shared expert and reshape
         # ====================================================================

diff --git a/src/axolotl/integrations/kernels/sonicmoe/patch.py b/src/axolotl/integrations/kernels/sonicmoe/patch.py
@@ -39,6 +39,8 @@ def patch_sonicmoe(model_type: str, torch_compile: bool = False):
         torch_compile: If True, wrap routing functions with torch.compile
             for kernel fusion (fuses softmax+topk+renorm into fewer launches).
     """
+    from sonicmoe.enums import is_glu
+
     from .routing import get_model_moe_config
     from .weight_converter import register_sonicmoe_weight_converter
 
@@ -49,7 +51,11 @@ def patch_sonicmoe(model_type: str, torch_compile: bool = False):
 
     for moe_cls in resolve_moe_block_classes(model_type):
         _patch_forward(moe_cls, routing_fn, activation, router_attr)
-    register_sonicmoe_weight_converter(model_type)
+
+    # Weight interleaving only applies to GLU models (gate_up_proj).
+    # Non-GLU models have a plain up_proj that needs no conversion.
+    if is_glu(activation):
+        register_sonicmoe_weight_converter(model_type)
 
 
 def _try_compile_routing(routing_fn):
@@ -98,43 +104,60 @@ def _patch_forward(moe_cls, routing_fn, activation, router_attr):
 
 def _make_general_forward(moe_cls, routing_fn, activation):
     """Create forward using routing_fn + moe_general_routing_inputs."""
+    from sonicmoe.enums import is_glu
+
+    glu_activation = is_glu(activation)
 
     def sonicmoe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         from sonicmoe import moe_general_routing_inputs
 
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states_flat = hidden_states.view(-1, hidden_dim)
 
-        # Shared expert (computed early, matching original model ordering)
+        # Shared expert
         shared_expert_output = _compute_shared_expert(self, hidden_states_flat)
 
         # Routing
         router_scores, token_indices, expert_indices, _router_logits = routing_fn(
             hidden_states_flat, self
         )
 
-        # Permute weights to SonicMoE layout:
-        #   gate_up: [E, 2*I, H] -> [2*I, H, E]
-        #   down:    [E, H, I]   -> [H, I, E]
-        gate_up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
+        # Optional latent projection before experts (e.g. Nemotron-H)
+        expert_input = hidden_states_flat
+        fc1_latent = getattr(self, "fc1_latent_proj", None)
+        if fc1_latent is not None:
+            expert_input = fc1_latent(expert_input)
+
+        # Permute weights to SonicMoE layout.
+        # GLU models: gate_up_proj [E, 2*I, H] -> [2*I, H, E]
+        # Non-GLU:    up_proj      [E, I, H]   -> [I, H, E]
+        if glu_activation:
+            up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
+        else:
+            up_weight = self.experts.up_proj.permute(1, 2, 0)
         down_weight = self.experts.down_proj.permute(1, 2, 0)
-        E = gate_up_weight.shape[-1]
+        E = up_weight.shape[-1]
 
         output, _ = moe_general_routing_inputs(
-            hidden_states_flat,
+            expert_input,
             router_scores,
             token_indices,
             expert_indices,
-            gate_up_weight,
-            None,  # b1 (no gate/up bias)
+            up_weight,
+            None,  # b1 (no bias)
             down_weight,
-            None,  # b2 (no down bias)
+            None,  # b2 (no bias)
             E,
             torch.cuda.current_stream().cuda_stream,
             activation,
             False,  # is_inference_mode
         )
 
+        # Optional latent projection after experts (e.g. Nemotron-H)
+        fc2_latent = getattr(self, "fc2_latent_proj", None)
+        if fc2_latent is not None:
+            output = fc2_latent(output)
+
         # Add shared expert contribution if present
         if shared_expert_output is not None:
             if hasattr(self, "shared_expert_gate"):
@@ -151,37 +174,54 @@ def sonicmoe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 def _make_fused_forward(moe_cls, activation, router_attr):
     """Create forward using moe_TC_softmax_topk_layer (topk -> softmax)."""
+    from sonicmoe.enums import is_glu
+
+    glu_activation = is_glu(activation)
 
     def sonicmoe_fused_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         from sonicmoe import moe_TC_softmax_topk_layer
 
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states_flat = hidden_states.view(-1, hidden_dim)
 
-        # Shared expert (computed early, matching original model ordering)
+        # Shared expert
         shared_expert_output = _compute_shared_expert(self, hidden_states_flat)
 
         router = getattr(self, router_attr)
 
-        # Permute weights to SonicMoE layout:
-        #   gate_up: [E, 2*I, H] -> [2*I, H, E]
-        #   down:    [E, H, I]   -> [H, I, E]
-        gate_up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
+        # Optional latent projection before experts (e.g. Nemotron-H)
+        expert_input = hidden_states_flat
+        fc1_latent = getattr(self, "fc1_latent_proj", None)
+        if fc1_latent is not None:
+            expert_input = fc1_latent(expert_input)
+
+        # Permute weights to SonicMoE layout.
+        # GLU models: gate_up_proj [E, 2*I, H] -> [2*I, H, E]
+        # Non-GLU:    up_proj      [E, I, H]   -> [I, H, E]
+        if glu_activation:
+            up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
+        else:
+            up_weight = self.experts.up_proj.permute(1, 2, 0)
         down_weight = self.experts.down_proj.permute(1, 2, 0)
 
         output, _router_logits, _expert_freq = moe_TC_softmax_topk_layer(
-            hidden_states_flat,
+            expert_input,
             router.weight,
-            gate_up_weight,
-            None,  # b1 (no gate/up bias)
+            up_weight,
+            None,  # b1 (no bias)
             down_weight,
-            None,  # b2 (no down bias)
+            None,  # b2 (no bias)
             router.top_k,
             torch.cuda.current_stream().cuda_stream,
             activation,
             False,  # is_inference_mode
         )
 
+        # Optional latent projection after experts (e.g. Nemotron-H)
+        fc2_latent = getattr(self, "fc2_latent_proj", None)
+        if fc2_latent is not None:
+            output = fc2_latent(output)
+
         # Add shared expert contribution if present
         if shared_expert_output is not None:
             if hasattr(self, "shared_expert_gate"):

diff --git a/src/axolotl/integrations/kernels/sonicmoe/routing.py b/src/axolotl/integrations/kernels/sonicmoe/routing.py
@@ -59,6 +59,13 @@ def get_model_moe_config(model_type: str):
         "minimax_m2",
     ):
         return sigmoid_topk_routing, ActivationType.SWIGLU, "gate"
+    # Non-GLU MoE (no gate_proj, experts use up_proj + down_proj only)
+    elif model_type in ("nemotron_h",):
+        return sigmoid_topk_routing, ActivationType.RELU_SQ, "gate"
-    # Non-GLU MoE (no gate_proj, experts use up_proj + down_proj only)
-    elif model_type in ("nemotron_h",):
-        return sigmoid_topk_routing, ActivationType.RELU_SQ, "gate"
+def sigmoid_topk_routing(
+    hidden_states: torch.Tensor, moe_block
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    gate = moe_block.gate
+    T, H = hidden_states.shape
+    K = getattr(moe_block, "top_k", getattr(gate, "top_k", None))
+    if K is None:
+        raise AttributeError(
+            f"sigmoid_topk_routing requires top_k on moe_block or gate, "
+            f"but neither has it"
+        )
+    E = getattr(
+        moe_block,
+        "n_routed_experts",
+        getattr(gate, "n_routed_experts", gate.weight.shape[0]),
+    )
+    n_group = getattr(moe_block, "n_group", getattr(gate, "n_group", 1))
+    
+    # ... (remainder of the function is elided here; the change is to the topk_group lookup in the n_group > 1 branch below) ...
+    
+    if n_group > 1:
+        # ... 
+        topk_group = getattr(
+            moe_block, "topk_group", getattr(gate, "topk_group", n_group)
+        )
+        group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1]
-    # Non-GLU MoE (no gate_proj, experts use up_proj + down_proj only)
-    elif model_type in ("nemotron_h",):
-        return sigmoid_topk_routing, ActivationType.RELU_SQ, "gate"
+def sigmoid_topk_routing(
+    hidden_states: torch.Tensor, moe_block
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    gate = moe_block.gate
+    T, H = hidden_states.shape
+    K = getattr(moe_block, "top_k", getattr(gate, "top_k", None))
+    if K is None:
+        raise AttributeError(
+            f"sigmoid_topk_routing requires top_k on moe_block or gate, "
+            f"but neither has it"
+        )
+    E = getattr(
+        moe_block,
+        "n_routed_experts",
+        getattr(gate, "n_routed_experts", gate.weight.shape[0]),
+    )
+    n_group = getattr(moe_block, "n_group", getattr(gate, "n_group", 1))
+    
+    # ... (remainder of the function is elided here; the change is to the topk_group lookup in the n_group > 1 branch below) ...
+    
+    if n_group > 1:
+        # ... 
+        topk_group = getattr(
+            moe_block, "topk_group", getattr(gate, "topk_group", n_group)
+        )
+        group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1]
+    # elif model_type in ("deepseek_v2",):
+    #     # Softmax->topk with group_limited_greedy. Different attr names: num_group
+    #     # (not n_group), gate is nn.Linear (not a router class).
+    #     return ..., ActivationType.SWIGLU, "gate"
     elif model_type in ("ernie4_5_moe",):
         return softmax_bias_topk_routing, ActivationType.SWIGLU, "gate"
     elif model_type in ("hunyuan_v1_moe",):