Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions examples/nemotron-h/120b-a12b-qlora.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16

plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
- axolotl.integrations.liger.LigerPlugin
Comment thread
winglian marked this conversation as resolved.

liger_layer_norm: true
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_rms_norm_gated: true

# LoRA kernel patches are incompatible with this architecture — see README.
lora_mlp_kernel: false
lora_qkv_kernel: false
Expand All @@ -22,25 +32,23 @@ dataset_prepared_path: last_run_prepared
sequence_len: 4096
sample_packing: true

use_cut_cross_entropy: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.0
lora_target_modules:
# Attention projection layers (only ~12 of the model's 88 layers are attention layers)
- q_proj
- k_proj
- v_proj
- o_proj
# To also train MoE expert weights, add them via lora_target_parameters
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
# lora_target_parameters:
# - up_proj
# - down_proj

# To also train the MoE expert weights, list them under lora_target_parameters
# (they are 3D nn.Parameter tensors rather than nn.Linear modules, and the experts have no gate_proj):
# lora_target_parameters:
# - up_proj
# - down_proj

wandb_project:
wandb_entity:
Expand Down
23 changes: 16 additions & 7 deletions examples/nemotron-h/nano-30b-a3b-qlora.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# See examples/nemotron-h/README.md for architecture notes and requirements.
base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16

plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
- axolotl.integrations.liger.LigerPlugin
Comment thread
winglian marked this conversation as resolved.

liger_layer_norm: true
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_rms_norm_gated: true

# LoRA kernel patches are incompatible with this architecture — see README.
lora_mlp_kernel: false
lora_qkv_kernel: false
Expand All @@ -23,8 +33,6 @@ dataset_prepared_path: last_run_prepared
sequence_len: 4096
sample_packing: true

use_cut_cross_entropy: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
Expand All @@ -36,11 +44,12 @@ lora_target_modules:
- k_proj
- v_proj
- o_proj
# To also train MoE expert weights, add them via lora_target_parameters
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
# lora_target_parameters:
# - up_proj
# - down_proj

# To also train the MoE expert weights, list them under lora_target_parameters
# (they are 3D nn.Parameter tensors rather than nn.Linear modules, and the experts have no gate_proj):
# lora_target_parameters:
# - up_proj
# - down_proj

wandb_project:
wandb_entity:
Expand Down
Loading