axolotl-ai-cloud · winglian · Mar 30, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 21, 2026
diff --git a/examples/nemotron-h/120b-a12b-qlora.yaml b/examples/nemotron-h/120b-a12b-qlora.yaml
@@ -0,0 +1,74 @@
+base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+
+# LoRA kernel patches are incompatible with this architecture — see README.
+lora_mlp_kernel: false
+lora_qkv_kernel: false
+lora_o_kernel: false
+
+chat_template: tokenizer_default
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+val_set_size: 0.0
+output_dir: ./outputs/out
+dataset_prepared_path: last_run_prepared
+
+sequence_len: 4096
+sample_packing: true
+
+use_cut_cross_entropy: true
+
+load_in_4bit: true
+quantize_moe_experts: true
+adapter: qlora
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.0
+lora_target_modules:
+  # Attention projection layers (only ~12 of the 88 layers are attention layers)
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  # To also train MoE expert weights, add them via lora_target_parameters
+  # (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
+  #   lora_target_parameters:
+  #     - up_proj
+  #     - down_proj
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_4bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 2
+saves_per_epoch: 1
+weight_decay: 0.0
+
+special_tokens:
diff --git a/examples/nemotron-h/README.md b/examples/nemotron-h/README.md
@@ -0,0 +1,48 @@
+# Nemotron-H (nvidia/NVIDIA-Nemotron-3-*)
+
+Hybrid Mamba2 / Attention / MoE architecture (`model_type: nemotron_h`).
+
+| Model | Total params | Active params | Layers |
+|---|---|---|---|
+| NVIDIA-Nemotron-3-Super-120B-A12B-BF16 | 120B | ~12B | 88 |
+| NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 | 30B | ~3B | — |
+
+## Requirements
+
+```bash
+pip install mamba-ssm causal-conv1d   # fast Mamba2 CUDA kernels
+```
+
+## Architecture notes
+
+- Each layer is one of three block types: **Mamba2** (selective SSM), **Attention** (sparse), or **MoE** (mixture-of-experts).
+- Only ~12 out of 88 blocks are attention layers (120B variant).
+- MLP activation is `relu2` via `mlp_hidden_act` (not the usual `hidden_act`).
+
+## LoRA kernel patches
+
+All three LoRA Triton kernel patches must be disabled:
+
+```yaml
+lora_qkv_kernel: false   # attention lives in NemotronHBlock.mixer, not layer.self_attn
+lora_o_kernel: false     # same reason
+lora_mlp_kernel: false   # relu2 (mlp_hidden_act) is not supported by lora_mlp_kernel
+```
+
+## MoE expert weights
+
+NemotronH experts store `up_proj` and `down_proj` as 3D `nn.Parameter` tensors
+(shape `[num_experts, out_dim, in_dim]`), **not** `nn.Linear` modules — there is no
+`gate_proj`. To fine-tune them alongside attention, use `lora_target_parameters`
+instead of `lora_target_modules`:
+
+```yaml
+lora_target_parameters:
+  - up_proj
+  - down_proj
+```
+
+## Limitations
+
+- **MoE Triton kernels**: `lora_mlp_kernel` is not supported for NemotronH's MoE expert layers. The expert weights are 3D `nn.Parameter` tensors (not `nn.Linear`), which the Triton kernel does not support. Keep `lora_mlp_kernel: false`.
+- **Gradient checkpointing**: Only supported when `sample_packing: true`. Without sample packing the upstream model marks `supports_gradient_checkpointing = False`.
diff --git a/examples/nemotron-h/nano-30b-a3b-qlora.yaml b/examples/nemotron-h/nano-30b-a3b-qlora.yaml
@@ -0,0 +1,74 @@
+# See examples/nemotron-h/README.md for architecture notes and requirements.
+base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+
+# LoRA kernel patches are incompatible with this architecture — see README.
+lora_mlp_kernel: false
+lora_qkv_kernel: false
+lora_o_kernel: false
+
+chat_template: tokenizer_default
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+val_set_size: 0.0
+output_dir: ./outputs/out
+dataset_prepared_path: last_run_prepared
+
+sequence_len: 4096
+sample_packing: true
+
+use_cut_cross_entropy: true
+
+load_in_4bit: true
+quantize_moe_experts: true
+adapter: qlora
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.0
+lora_target_modules:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  # To also train MoE expert weights, add them via lora_target_parameters
+  # (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
+  #   lora_target_parameters:
+  #     - up_proj
+  #     - down_proj
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_4bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 4
+saves_per_epoch: 1
+weight_decay: 0.0
+
+special_tokens:
diff --git a/src/axolotl/common/architectures.py b/src/axolotl/common/architectures.py
@@ -23,4 +23,5 @@
     "glm4_moe": "Glm4MoeDecoderLayer",
     "glm4_moe_lite": "Glm4MoeLiteDecoderLayer",
     "glm_moe_dsa": "GlmMoeDsaDecoderLayer",
+    "nemotron_h": "NemotronHMoE",
 }
diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py
@@ -590,9 +590,11 @@ def _set_quantization_config(self):
                 "bnb_4bit_quant_type": "nf4",
                 "bnb_4bit_quant_storage": torch.bfloat16,
             }
-            if self.cfg.model_config_type in ["jamba", "qwen2_moe"] and not (
-                self.cfg.deepspeed or self.is_fsdp_enabled
-            ):
+            if self.cfg.model_config_type in [
+                "jamba",
+                "qwen2_moe",
+                "nemotron_h",
+            ] and not (self.cfg.deepspeed or self.is_fsdp_enabled):
                 # for some reason, this causes the loss to be off by an order of magnitude
                 # but deepspeed needs this still in bfloat16
                 bnb_config["bnb_4bit_quant_storage"] = torch.float32

diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py
@@ -142,6 +142,12 @@ def _apply_transformers_patches(self):
 
     def apply_post_model_build_patches(self, model: PreTrainedModel):
         """Apply patches right after model build, before post-load setup."""
+        if self.cfg.model_config_type == "nemotron_h":
+            # Must run after model build because NemotronHForCausalLM.__init__
+            # calls register_nemotron_h_conversion_mapping() with overwrite=True,
+            # which would clobber any earlier fix.
+            self._fix_nemotron_h_conversion_mapping()
+
         self._finalize_moe_expert_quantization(model)
 
     def apply_post_model_load_patches(self, model: PreTrainedModel):
@@ -291,6 +297,66 @@ def _apply_model_specific_patches(self):
 
             patch_kimi_model()
 
+        if self.cfg.model_config_type == "nemotron_h":
+            if self.cfg.sample_packing:
+                from transformers.models.nemotron_h.modeling_nemotron_h import (
+                    NemotronHPreTrainedModel,
+                )
+
+                from axolotl.monkeypatch.models.nemotron_h.modeling import (
+                    patch_nemotron_h_modeling_packing,
+                )
+
+                patch_nemotron_h_modeling_packing()
+                # supports_gradient_checkpointing is only enabled after
+                # patch_nemotron_h_modeling_packing() installs the GC-compatible
+                # NemotronHBlock.forward. Without the patch, upstream marks this
+                # False because the original block forward is not GC-safe.
+                NemotronHPreTrainedModel.supports_gradient_checkpointing = True
+
+    @staticmethod
+    def _fix_nemotron_h_conversion_mapping() -> None:
+        """Drop the embedding→embeddings WeightRenaming from the nemotron_h mapping.
+
+        The nvidia Hub model registers:
+            WeightRenaming("embedding.weight", "embeddings.weight")
+        to handle a legacy checkpoint variant. Its reverse (applied on save)
+        converts ``embeddings`` back to ``embedding``, which silently renames
+        ``backbone.embeddings.weight`` → ``backbone.embedding.weight`` when
+        merging LoRA adapters back into the base model. No-op when the
+        conversion-mapping API or a registered nemotron_h mapping is missing.
+        """
+        try:
+            from transformers.conversion_mapping import (
+                WeightRenaming,
+                get_checkpoint_conversion_mapping,
+                register_checkpoint_conversion_mapping,
+            )
+        except ImportError:  # transformers build lacks the conversion_mapping API
+            return
+
+        mapping = get_checkpoint_conversion_mapping("nemotron_h")
+        if mapping is None:  # nothing registered for nemotron_h
+            return
+
+        filtered = [
+            entry
+            for entry in mapping
+            if not (
+                isinstance(entry, WeightRenaming)
+                and entry.source_patterns == ["embedding.weight"]
+                and entry.target_patterns == ["embeddings.weight"]
+            )
+        ]
+        if len(filtered) != len(mapping):  # an entry was actually removed
+            register_checkpoint_conversion_mapping(
+                "nemotron_h", filtered, overwrite=True
+            )
+            LOG.info(
+                "Removed embedding→embeddings WeightRenaming from nemotron_h "
+                "checkpoint conversion mapping"
+            )
+
     def _apply_fp8_patches(self):
         """Apply patches for FP8 support."""
         if self.cfg.fp8:

diff --git a/src/axolotl/loaders/utils.py b/src/axolotl/loaders/utils.py
@@ -234,4 +234,6 @@ def get_linear_embedding_layers(model_type: str) -> list[str]:
         return ["embed_in", "embed_out"]
     if model_type == "falcon":
         return ["word_embeddings", "lm_head"]
+    if model_type == "nemotron_h":
+        return ["embeddings", "lm_head"]
     return ["embed_tokens", "lm_head"]
diff --git a/src/axolotl/monkeypatch/lora_kernels.py b/src/axolotl/monkeypatch/lora_kernels.py
@@ -394,15 +394,15 @@ def apply_lora_kernel_patches(
         activation = text_config.hidden_act
     elif hasattr(text_config, "hidden_activation"):
         activation = text_config.hidden_activation
+    elif hasattr(text_config, "mlp_hidden_act"):
+        # Hybrid models (e.g. nemotron_h) use mlp_hidden_act instead of hidden_act
+        activation = text_config.mlp_hidden_act
 
     # map activation to supported activation
-    if "gelu" in activation:
+    if activation and "gelu" in activation:
         # gemma3 uses gelu_pytorch_tanh
         activation = "gelu"
 
-    if activation not in SUPPORTED_ACTIVATIONS:
-        raise NotImplementedError(f"Activation {activation} is not supported")
-
     layers = get_layers(model)
 
     # Patch each layer
@@ -444,6 +444,15 @@ def apply_lora_kernel_patches(
                     )
         for gate_proj, up_proj, down_proj, mlp in find_mlp_in_layer(layer):
             if cfg.lora_mlp_kernel:
+                # Check is inside lora_mlp_kernel guard so models with an
+                # unsupported activation (e.g. nemotron_h uses relu2) can set
+                # lora_mlp_kernel: false without hitting an error here.
+                if activation not in SUPPORTED_ACTIVATIONS:
+                    raise NotImplementedError(
+                        f"Activation {activation!r} is not supported by lora_mlp_kernel. "
+                        f"Set `lora_mlp_kernel: false` in your config or use a model with "
+                        f"a supported activation ({SUPPORTED_ACTIVATIONS})."
+                    )
                 # MLP patching
                 can_patch_mlp = all(
                     hasattr(proj, "lora_A") for proj in (gate_proj, up_proj, down_proj)

diff --git a/src/axolotl/monkeypatch/models/nemotron_h/__init__.py b/src/axolotl/monkeypatch/models/nemotron_h/__init__.py