Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/Automodel-workspace/Automodel
Submodule Automodel updated 477 files
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# SFT recipe for openai/gpt-oss-20b on 1 node x 8 GPUs (FSDP=8, EP=8) using the
# NeMo Automodel DTensor worker. Values below override the shared defaults file.
# NOTE(review): nesting restored from a flattened paste — hierarchy inferred from
# the matching AutomodelKwargs/AutomodelBackendConfig TypedDicts; confirm against
# the sibling recipes next to sft.yaml.
defaults: ../../sft.yaml
cluster:
  gpus_per_node: 8
policy:
  model_name: openai/gpt-oss-20b
  train_global_batch_size: 128
  train_micro_batch_size: 8
  max_total_sequence_length: 512
  # Dequantize the (MXFP4) base checkpoint before training.
  dequantize_base_checkpoint: true
  dtensor_cfg:
    expert_parallel_size: 8
    automodel_kwargs:
      backend:
        # Hydra-instantiated backend config for the MoE model.
        _target_: nemo_automodel.components.moe.utils.BackendConfig
        attn: flex
        linear: te
        rms_norm: te
        enable_deepep: true
        fake_balanced_gate: false
        enable_hf_state_dict_adapter: true
checkpointing:
  checkpoint_dir: results/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel
29 changes: 29 additions & 0 deletions nemo_rl/models/policy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,34 @@ class LoRAConfig(TypedDict):
use_triton: NotRequired[bool]


class AutomodelBackendConfig(TypedDict):
    """Backend selection for NeMo Automodel MoE models.

    Mirrors ``nemo_automodel.components.moe.utils.BackendConfig`` and is
    instantiated via Hydra using the ``_target_`` class path. All keys other
    than ``_target_`` are optional and fall back to the target class defaults.
    """

    # Hydra target class path (e.g., "nemo_automodel.components.moe.utils.BackendConfig")
    _target_: str
    # Attention implementation: "te" (Transformer Engine), "flex" (FlexAttention), etc.
    attn: NotRequired[str]
    # Linear layer implementation: "te" (Transformer Engine), etc.
    linear: NotRequired[str]
    # RMSNorm implementation: "te" (Transformer Engine), etc.
    rms_norm: NotRequired[str]
    # Enable DeepEP (Deep Expert Parallelism) for MoE models
    enable_deepep: NotRequired[bool]
    # Use fake balanced gate for testing/debugging MoE
    fake_balanced_gate: NotRequired[bool]
    # Enable HuggingFace state dict adapter for checkpoint loading
    enable_hf_state_dict_adapter: NotRequired[bool]
    # Enable FSDP-specific optimizations
    enable_fsdp_optimizations: NotRequired[bool]
    # Precision for the MoE gate computation (e.g., "float64", "float32")
    gate_precision: NotRequired[str]


class AutomodelKwargs(TypedDict):
    """Extra keyword arguments forwarded to NeMo Automodel model construction.

    Both keys are optional; when omitted the Automodel defaults apply.
    """

    # Whether to use Liger kernel optimizations (default: false)
    use_liger_kernel: NotRequired[bool]
    # Backend configuration for MoE models
    backend: NotRequired[AutomodelBackendConfig]


class DTensorConfigDisabled(TypedDict):
    """DTensor config variant meaning DTensor is turned off.

    ``enabled`` is constrained to the literal ``False`` so type checkers can
    discriminate this variant from the full ``DTensorConfig``.
    """

    enabled: Literal[False]

Expand All @@ -50,6 +78,7 @@ class DTensorConfig(TypedDict):
custom_parallel_plan: str | None
clear_cache_every_n_steps: NotRequired[int | None]
lora_cfg: NotRequired[LoRAConfig | LoRAConfigDisabled]
automodel_kwargs: NotRequired[AutomodelKwargs]


class SequencePackingConfigDisabled(TypedDict):
Expand Down
6 changes: 6 additions & 0 deletions nemo_rl/models/policy/lm_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ def __init__(
use_v2 = config.get("dtensor_cfg", {}).get("_v2", False)
if use_v2:
worker_builder_cls = "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2"

if "TORCH_CUDA_ARCH_LIST" not in os.environ:
warnings.warn(
"TORCH_CUDA_ARCH_LIST is not set. This is needed if using DeepEP in DTensorPolicyWorker V2. This variable is set in our container, but "
"if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0'"
)
else:
assert (
config["dtensor_cfg"].get("lora_cfg", {}).get("enabled", False)
Expand Down
2 changes: 1 addition & 1 deletion nemo_rl/models/policy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

# Try to import nemo_automodel classes, fallback to None if not available
try:
from nemo_automodel.components._transformers.auto_model import (
from nemo_automodel._transformers.auto_model import (
NeMoAutoModelForCausalLM,
NeMoAutoModelForImageTextToText,
NeMoAutoModelForTextToWaveform,
Expand Down
Empty file.
Loading
Loading