Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/Automodel-workspace/Automodel
Submodule Automodel updated 477 files
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# SFT recipe for openai/gpt-oss-20b on 1 node x 8 GPUs (FSDP=8, EP=8) using the
# NeMo Automodel DTensor worker. Values below override the shared defaults file.
# NOTE(review): nesting restored from a flattened paste — hierarchy inferred from
# the matching AutomodelKwargs/AutomodelBackendConfig TypedDicts; confirm against
# the sibling recipes next to sft.yaml.
defaults: ../../sft.yaml
cluster:
  gpus_per_node: 8
policy:
  model_name: openai/gpt-oss-20b
  train_global_batch_size: 128
  train_micro_batch_size: 8
  max_total_sequence_length: 512
  # Dequantize the (MXFP4) base checkpoint before training.
  dequantize_base_checkpoint: true
  dtensor_cfg:
    expert_parallel_size: 8
    automodel_kwargs:
      backend:
        # Hydra-instantiated backend config for the MoE model.
        _target_: nemo_automodel.components.moe.utils.BackendConfig
        attn: flex
        linear: te
        rms_norm: te
        enable_deepep: true
        fake_balanced_gate: false
        enable_hf_state_dict_adapter: true
checkpointing:
  checkpoint_dir: results/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel
29 changes: 29 additions & 0 deletions nemo_rl/models/policy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,34 @@ class LoRAConfig(TypedDict):
use_triton: NotRequired[bool]


class AutomodelBackendConfig(TypedDict):
    """Backend selection for NeMo Automodel MoE models.

    Mirrors ``nemo_automodel.components.moe.utils.BackendConfig`` and is
    instantiated via Hydra using the ``_target_`` class path. All keys other
    than ``_target_`` are optional and fall back to the target class defaults.
    """

    # Hydra target class path (e.g., "nemo_automodel.components.moe.utils.BackendConfig")
    _target_: str
    # Attention implementation: "te" (Transformer Engine), "flex" (FlexAttention), etc.
    attn: NotRequired[str]
    # Linear layer implementation: "te" (Transformer Engine), etc.
    linear: NotRequired[str]
    # RMSNorm implementation: "te" (Transformer Engine), etc.
    rms_norm: NotRequired[str]
    # Enable DeepEP (Deep Expert Parallelism) for MoE models
    enable_deepep: NotRequired[bool]
    # Use fake balanced gate for testing/debugging MoE
    fake_balanced_gate: NotRequired[bool]
    # Enable HuggingFace state dict adapter for checkpoint loading
    enable_hf_state_dict_adapter: NotRequired[bool]
    # Enable FSDP-specific optimizations
    enable_fsdp_optimizations: NotRequired[bool]
    # Precision for the MoE gate computation (e.g., "float64", "float32")
    gate_precision: NotRequired[str]


class AutomodelKwargs(TypedDict):
    """Extra keyword arguments forwarded to NeMo Automodel model construction.

    Both keys are optional; when omitted the Automodel defaults apply.
    """

    # Whether to use Liger kernel optimizations (default: false)
    use_liger_kernel: NotRequired[bool]
    # Backend configuration for MoE models
    backend: NotRequired[AutomodelBackendConfig]


class DTensorConfigDisabled(TypedDict):
    """DTensor config variant meaning DTensor is turned off.

    ``enabled`` is constrained to the literal ``False`` so type checkers can
    discriminate this variant from the full ``DTensorConfig``.
    """

    enabled: Literal[False]

Expand All @@ -50,6 +78,7 @@ class DTensorConfig(TypedDict):
custom_parallel_plan: str | None
clear_cache_every_n_steps: NotRequired[int | None]
lora_cfg: NotRequired[LoRAConfig | LoRAConfigDisabled]
automodel_kwargs: NotRequired[AutomodelKwargs]


class SequencePackingConfigDisabled(TypedDict):
Expand Down
6 changes: 6 additions & 0 deletions nemo_rl/models/policy/lm_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ def __init__(
use_v2 = config.get("dtensor_cfg", {}).get("_v2", False)
if use_v2:
worker_builder_cls = "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2"

if "TORCH_CUDA_ARCH_LIST" not in os.environ:
warnings.warn(
"TORCH_CUDA_ARCH_LIST is not set. This is needed if using DeepEP in DTensorPolicyWorker V2. This variable is set in our container, but "
"if you are running a custom container or baremetal, you may need to set this variable manually. Example: export TORCH_CUDA_ARCH_LIST='9.0 10.0'"
)
else:
assert (
config["dtensor_cfg"].get("lora_cfg", {}).get("enabled", False)
Expand Down
2 changes: 1 addition & 1 deletion nemo_rl/models/policy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

# Try to import nemo_automodel classes, fallback to None if not available
try:
from nemo_automodel.components._transformers.auto_model import (
from nemo_automodel._transformers.auto_model import (
NeMoAutoModelForCausalLM,
NeMoAutoModelForImageTextToText,
NeMoAutoModelForTextToWaveform,
Expand Down
Empty file.
Loading
Loading