From 88a26c7ea8e82f995c77e374ed2fd14d03904205 Mon Sep 17 00:00:00 2001
From: root <root@pool0-00629.cm.cluster>
Date: Tue, 3 Feb 2026 15:59:02 -0800
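Subject: [PATCH 1/8] Add Megatron-Bridge recipe-based model config

Add nemo_rl/models/megatron/recipe_config.py, which maps HF model names
to Megatron-Bridge pretrain recipes (Llama 3.x and Qwen3), and make
setup_model_config() start from the matching recipe when one exists,
falling back to the checkpoint's run_config.yaml otherwise. The
ConfigContainer is now updated in place by _update_megatron_config()
instead of being rebuilt by _create_megatron_config().

NOTE: the policy worker __init__ in megatron_policy_worker.py currently
dumps the final ConfigContainer to a hard-coded YAML path on rank 0 and
then calls sys.exit(0). This is debugging code only and has to be
removed before the series can be merged.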

---
 nemo_rl/models/megatron/__init__.py           |  10 +
 nemo_rl/models/megatron/recipe_config.py      | 141 +++++++++++
 nemo_rl/models/megatron/setup.py              | 219 +++++++++++-------
 .../policy/workers/megatron_policy_worker.py  |  13 ++
 4 files changed, 305 insertions(+), 78 deletions(-)
 create mode 100644 nemo_rl/models/megatron/recipe_config.py

diff --git a/nemo_rl/models/megatron/__init__.py b/nemo_rl/models/megatron/__init__.py
index 4fc25d0d3c..f7ce1ab003 100644
--- a/nemo_rl/models/megatron/__init__.py
+++ b/nemo_rl/models/megatron/__init__.py
@@ -11,3 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from nemo_rl.models.megatron.recipe_config import (
+    get_available_recipes,
+    get_recipe_function,
+)
+
+__all__ = [
+    "get_available_recipes",
+    "get_recipe_function",
+]
diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py
new file mode 100644
index 0000000000..891129c503
--- /dev/null
+++ b/nemo_rl/models/megatron/recipe_config.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Recipe-based configuration for NeMo-RL Megatron integration.
+
+This module provides a clean integration with Megatron-Bridge recipes,
+allowing NeMo-RL to use pre-configured training recipes as a base and
+layer RL-specific settings on top.
+
+Example usage:
+    from nemo_rl.models.megatron.recipe_config import create_config_from_recipe
+    
+    megatron_cfg = create_config_from_recipe(
+        hf_model_name="meta-llama/Llama-3.1-8B-Instruct",
+        policy_config=config,
+        pretrained_path="/path/to/checkpoint",
+        weights_path=None,
+    )
+
+Internal flag for testing:
+    # To use pure recipe settings with minimal RL overrides (for testing):
+    megatron_cfg = create_config_from_recipe(
+        ...,
+        _apply_full_overrides=False,  # Internal flag - keeps recipe's optimizer/scheduler
+    )
+"""
+
+import warnings
+from typing import Any, Callable, Optional
+
+import torch
+from megatron.bridge import AutoBridge
+from megatron.bridge.training.config import (
+    CheckpointConfig,
+    ConfigContainer,
+    DistributedDataParallelConfig,
+    LoggerConfig,
+    OptimizerConfig,
+    SchedulerConfig,
+    TokenizerConfig,
+    TrainingConfig,
+)
+
+from nemo_rl.models.policy import PolicyConfig
+
+
+# =============================================================================
+# RECIPE DISCOVERY
+# =============================================================================
+
+def _import_llama_recipes():
+    """Import Llama recipes from Megatron-Bridge."""
+    try:
+        from megatron.bridge.recipes.llama.llama3 import (
+            llama31_8b_pretrain_config,
+            llama31_70b_pretrain_config,
+            llama31_405b_pretrain_config,
+            llama3_8b_pretrain_config,
+            llama3_70b_pretrain_config,
+            llama32_1b_pretrain_config,
+            llama32_3b_pretrain_config,
+        )
+        return {
+            "llama-3.2-1b": llama32_1b_pretrain_config,
+            "llama-3.2-3b": llama32_3b_pretrain_config,
+            "llama-3-8b": llama3_8b_pretrain_config,
+            "llama-3.1-8b": llama31_8b_pretrain_config,
+            "meta-llama-3-8b": llama3_8b_pretrain_config,
+            "meta-llama-3.1-8b": llama31_8b_pretrain_config,
+            "llama-3-70b": llama3_70b_pretrain_config,
+            "llama-3.1-70b": llama31_70b_pretrain_config,
+            "llama-3.1-405b": llama31_405b_pretrain_config,
+        }
+    except ImportError:
+        return {}
+
+
+def _import_qwen_recipes():
+    """Import Qwen recipes from Megatron-Bridge."""
+    try:
+        from megatron.bridge.recipes.qwen.qwen3 import (
+            qwen3_600m_pretrain_config,
+            qwen3_1p7b_pretrain_config,
+            qwen3_4b_pretrain_config,
+            qwen3_8b_pretrain_config,
+        )
+        return {
+            "qwen3-0.6b": qwen3_600m_pretrain_config,
+            "qwen3-1.7b": qwen3_1p7b_pretrain_config,
+            "qwen3-4b": qwen3_4b_pretrain_config,
+            "qwen3-8b": qwen3_8b_pretrain_config,
+        }
+    except ImportError:
+        return {}
+
+
+def get_recipe_function(hf_model_name: str) -> Optional[Callable[..., ConfigContainer]]:
+    """
+    Get the appropriate Megatron-Bridge recipe function for a model.
+    
+    Args:
+        hf_model_name: HuggingFace model name or path
+        
+    Returns:
+        Recipe function or None if no matching recipe found
+    """
+    model_lower = hf_model_name.lower().replace("/", "-").replace("_", "-")
+    
+    # Load recipes lazily
+    all_recipes = {}
+    all_recipes.update(_import_llama_recipes())
+    all_recipes.update(_import_qwen_recipes())
+    
+    # Try match
+    for pattern, recipe_fn in all_recipes.items():
+        if pattern in model_lower:
+            return recipe_fn
+    
+    return None
+
+
+def get_available_recipes() -> list[str]:
+    """Return a list of available recipe patterns."""
+    all_recipes = {}
+    all_recipes.update(_import_llama_recipes())
+    all_recipes.update(_import_qwen_recipes())
+    return list(all_recipes.keys())
+
+
diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 24bfdb0605..14dcbabcb3 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -31,6 +31,7 @@
     CheckpointConfig,
     ConfigContainer,
     DistributedDataParallelConfig,
+    DistributedInitConfig,
     LoggerConfig,
     OptimizerConfig,
     SchedulerConfig,
@@ -68,6 +69,9 @@
 from nemo_rl.distributed.named_sharding import NamedSharding
 from nemo_rl.models.megatron.community_import import import_model_from_hf_name
 from nemo_rl.models.megatron.config import ModelAndOptimizerState, RuntimeConfig
+from nemo_rl.models.megatron.recipe_config import (
+    get_recipe_function,
+)
 from nemo_rl.models.policy import PolicyConfig
 from nemo_rl.models.policy.utils import (
     configure_dynamo_cache,
@@ -225,7 +229,13 @@ def validate_and_set_config(
         )
 
     megatron_cfg, model_cfg = setup_model_config(
-        config, rank, dtype, hf_model_name, pretrained_path, weights_path
+        config=config,
+        rank=rank,
+        dtype=dtype,
+        hf_model_name=hf_model_name,
+        pretrained_path=pretrained_path,
+        weights_path=weights_path,
+        use_recipe=True,
     )
 
     final_padded_vocab_size = calculate_padded_vocab_size(
@@ -262,7 +272,6 @@ def validate_model_paths(config: PolicyConfig) -> tuple[str, str, bool]:
 
     return hf_model_name, pretrained_path, pt_checkpoint_exists
 
-
 def setup_model_config(
     config: PolicyConfig,
     rank,
@@ -270,40 +279,53 @@ def setup_model_config(
     hf_model_name: str,
     pretrained_path: str,
     weights_path: Optional[str] = None,
+    use_recipe: bool = True,
 ) -> tuple[ConfigContainer, Any]:
-    """Handle all the model configuration logic."""
-    # Load pretrained run config
-    pretrained_run_config = os.path.join(
-        pretrained_path, "iter_0000000/run_config.yaml"
-    )
-
-    if not os.path.exists(pretrained_run_config):
-        raise FileNotFoundError(
-            f"Pretrained run config not found at {pretrained_run_config} on rank={rank}. "
-            "This usually means that the one-time HF->mcore conversion on rank=0 saved to a directory "
-            "not being mounted on this node. Please check"
+    """Setup model configuration."""
+    model_cfg = None
+    use_recipe_for_model = use_recipe and get_recipe_function(hf_model_name) is not None
+
+    if use_recipe_for_model:
+        # Use Megatron-Bridge golden recipes
+        print(f"[INFO] Using Megatron-Bridge recipe-based config for {hf_model_name}")
+        recipe_fn = get_recipe_function(hf_model_name)
+        if recipe_fn is None:
+            raise ValueError(f"No recipe found for {hf_model_name}")
+
+        megatron_cfg = recipe_fn()
+        model_cfg = megatron_cfg.model
+    else:
+        # Load pretrained run config
+        pretrained_run_config = os.path.join(
+            pretrained_path, "iter_0000000/run_config.yaml"
         )
 
-    try:
-        cfg_from_pretrained = ConfigContainer.from_yaml(
-            pretrained_run_config, mode=InstantiationMode.STRICT
-        )
-    except Exception as e:
-        # Add helpful context as a note to the exception
-        e.add_note(
-            f"\n{'=' * 80}\n"
-            f"NOTE: A common cause of this error is when the HF->mcore converted checkpoint is\n"
-            f"created with an older version of megatron-bridge.\n"
-            f"If this checkpoint is old or was generated by a different code version,\n"
-            f"try deleting it and rerunning the code.\n"
-            f"The checkpoint will be automatically regenerated with the current version.\n\n"
-            f"Checkpoint location: {pretrained_path}\n"
-            f"{'=' * 80}"
-        )
-        raise
+        if not os.path.exists(pretrained_run_config):
+            raise FileNotFoundError(
+                f"Pretrained run config not found at {pretrained_run_config} on rank={rank}. "
+                "This usually means that the one-time HF->mcore conversion on rank=0 saved to a directory "
+                "not being mounted on this node. Please check"
+            )
+
+        try:
+            megatron_cfg = ConfigContainer.from_yaml(
+                pretrained_run_config, mode=InstantiationMode.STRICT
+            )
+        except Exception as e:
+            # Add helpful context as a note to the exception
+            e.add_note(
+                f"\n{'=' * 80}\n"
+                f"NOTE: A common cause of this error is when the HF->mcore converted checkpoint is\n"
+                f"created with an older version of megatron-bridge.\n"
+                f"If this checkpoint is old or was generated by a different code version,\n"
+                f"try deleting it and rerunning the code.\n"
+                f"The checkpoint will be automatically regenerated with the current version.\n\n"
+                f"Checkpoint location: {pretrained_path}\n"
+                f"{'=' * 80}"
+            )
+            raise
 
-    model_cfg = cfg_from_pretrained.model
-    cfg_from_pretrained.logger = LoggerConfig()
+        model_cfg = megatron_cfg.model
 
     # Apply parallelism settings
     _apply_parallelism_config(model_cfg, config)
@@ -333,10 +355,8 @@ def setup_model_config(
     # Validate training configuration
     _validate_training_config(config, model_cfg)
 
-    # Create final megatron config
-    megatron_cfg = _create_megatron_config(
-        model_cfg, checkpoint_config, config, hf_model_name, dtype
-    )
+    # Update megatron config with checkpoint, optimizer, scheduler, etc.
+    _update_megatron_config(megatron_cfg, checkpoint_config, config, hf_model_name)
 
     _validate_dtype_config(dtype, megatron_cfg.model, megatron_cfg.optimizer)
 
@@ -570,51 +590,94 @@ def _validate_dtype_config(
         )
 
 
-def _create_megatron_config(
-    model_cfg: Any,
+def _update_dataclass_fields(target: Any, updates: dict) -> None:
+    """Update a dataclass with values from a dictionary.
+
+    Only sets fields that are present in the updates dict. Fields not in
+    the dict retain their original values.
+
+    Args:
+        target: A dataclass instance to update
+        updates: Dictionary of field names to new values
+    """
+    for key, value in updates.items():
+        if hasattr(target, key):
+            setattr(target, key, value)
+
+
+def _update_megatron_config(
+    megatron_cfg: ConfigContainer,
     checkpoint_config: CheckpointConfig,
     config: PolicyConfig,
     hf_model_name: str,
-    dtype: torch.dtype,
-) -> ConfigContainer:
-    """Create the final Megatron configuration container."""
-    return ConfigContainer(
-        model=model_cfg,
-        checkpoint=checkpoint_config,
-        logger=LoggerConfig(logging_level=0),
-        train=TrainingConfig(
-            micro_batch_size=1,  # ignored
-            global_batch_size=config["train_global_batch_size"],  # ignored
-            train_iters=config["megatron_cfg"]["train_iters"],
-        ),
-        optimizer=OptimizerConfig(**config["megatron_cfg"]["optimizer"]),
-        ddp=DistributedDataParallelConfig(
-            check_for_nan_in_grad=True,
-            grad_reduce_in_fp32=config["megatron_cfg"][
-                "distributed_data_parallel_config"
-            ]["grad_reduce_in_fp32"],
-            overlap_grad_reduce=config["megatron_cfg"][
-                "distributed_data_parallel_config"
-            ]["overlap_grad_reduce"],
-            overlap_param_gather=config["megatron_cfg"][
-                "distributed_data_parallel_config"
-            ]["overlap_param_gather"],
-            # we need to set average_in_collective=False with calculate_per_token_loss=T
-            # otherwise, mcore throws an assertion error.
-            average_in_collective=False,  # Required with calculate_per_token_loss=True
-            use_distributed_optimizer=config["megatron_cfg"]["optimizer"][
-                "use_distributed_optimizer"
-            ],
-            data_parallel_sharding_strategy=config["megatron_cfg"][
-                "distributed_data_parallel_config"
-            ]["data_parallel_sharding_strategy"],
-        ),
-        scheduler=SchedulerConfig(**config["megatron_cfg"]["scheduler"]),
-        dataset=None,
-        tokenizer=TokenizerConfig(
-            tokenizer_type="HuggingFaceTokenizer",
-            tokenizer_model=hf_model_name,
-        ),
+) -> None:
+    """Update the existing ConfigContainer with checkpoint, optimizer, scheduler, and other settings.
+
+    This modifies megatron_cfg in-place. For sub-configs (optimizer, ddp, scheduler, etc.),
+    only fields explicitly provided in the NeMo-RL config are updated; other fields retain
+    their original values from the recipe or checkpoint.
+    """
+    megatron_cfg_dict = config.get("megatron_cfg", {})
+
+    # Ensure dist config is initialized (required for validate())
+    if megatron_cfg.dist is None:
+        megatron_cfg.dist = DistributedInitConfig()
+
+    # Always replace checkpoint config (NeMo-RL manages checkpoints)
+    megatron_cfg.checkpoint = checkpoint_config
+
+    # Always set logger
+    megatron_cfg.logger = LoggerConfig(logging_level=0)
+
+    # Update training config - these are NeMo-RL specific
+    if megatron_cfg.train is None:
+        megatron_cfg.train = TrainingConfig()
+    megatron_cfg.train.micro_batch_size = 1  # ignored by NeMo-RL
+    megatron_cfg.train.global_batch_size = config.get("train_global_batch_size", 1) # ignored by NeMo-RL
+    if "train_iters" in megatron_cfg_dict:
+        megatron_cfg.train.train_iters = megatron_cfg_dict["train_iters"]
+
+    # Update optimizer config - merge with existing
+    optimizer_overrides = megatron_cfg_dict.get("optimizer", {})
+    if optimizer_overrides:
+        if megatron_cfg.optimizer is None:
+            megatron_cfg.optimizer = OptimizerConfig(**optimizer_overrides)
+        else:
+            _update_dataclass_fields(megatron_cfg.optimizer, optimizer_overrides)
+
+    # Update DDP config - merge with existing
+    ddp_overrides = megatron_cfg_dict.get("distributed_data_parallel_config", {})
+    if megatron_cfg.ddp is None:
+        megatron_cfg.ddp = DistributedDataParallelConfig()
+
+    # Apply explicit DDP overrides from config
+    if ddp_overrides:
+        _update_dataclass_fields(megatron_cfg.ddp, ddp_overrides)
+
+    # NeMo-RL required DDP settings (always set)
+    megatron_cfg.ddp.check_for_nan_in_grad = True
+    # Required with calculate_per_token_loss=True, otherwise mcore throws assertion error
+    megatron_cfg.ddp.average_in_collective = False
+
+    # Sync use_distributed_optimizer between optimizer and ddp
+    if megatron_cfg.optimizer is not None:
+        megatron_cfg.ddp.use_distributed_optimizer = megatron_cfg.optimizer.use_distributed_optimizer
+
+    # Update scheduler config - merge with existing
+    scheduler_overrides = megatron_cfg_dict.get("scheduler", {})
+    if scheduler_overrides:
+        if megatron_cfg.scheduler is None:
+            megatron_cfg.scheduler = SchedulerConfig(**scheduler_overrides)
+        else:
+            _update_dataclass_fields(megatron_cfg.scheduler, scheduler_overrides)
+
+    # NeMo-RL handles data separately
+    megatron_cfg.dataset = None
+
+    # Update tokenizer config - always set for HuggingFace tokenizer
+    megatron_cfg.tokenizer = TokenizerConfig(
+        tokenizer_type="HuggingFaceTokenizer",
+        tokenizer_model=hf_model_name,
     )
 
 
diff --git a/nemo_rl/models/policy/workers/megatron_policy_worker.py b/nemo_rl/models/policy/workers/megatron_policy_worker.py
index 48ba0623e2..83d541f2ea 100644
--- a/nemo_rl/models/policy/workers/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/workers/megatron_policy_worker.py
@@ -278,6 +278,19 @@ def __init__(
             self.model,
             self.optimizer,
         )
+        print("HELLO")
+        # Dump ConfigContainer to YAML for inspection (only on rank 0)
+        if self.rank == 0:
+            config_dump_path = "/lustre/fsw/portfolios/coreai/users/sfawzy/final_megatron_config_6.yaml"
+            try:
+                self.megatron_cfg.to_yaml(config_dump_path)
+                print(f"[DEBUG] Saved final ConfigContainer to: {config_dump_path}")
+            except Exception as e:
+                print(f"[WARNING] Failed to save ConfigContainer to YAML: {e}")
+            # Exit early after dumping config for inspection
+            import sys
+            print("[DEBUG] Exiting after ConfigContainer dump")
+            sys.exit(0)
 
         # vars used for refit
         ## will be initialized in prepare_refit_info

From ddd4151fe1d2616f5516f73db7f71be8457d74ef Mon Sep 17 00:00:00 2001
From: Sherif Fawzy <sfawzy@nvidia.com>
Date: Fri, 6 Feb 2026 08:02:21 -0800
Subject: [PATCH 2/8] Add megatron_recipe key to Megatron example configs

---
 9239646-attach.sh                             |  25 +++
 9239676-attach.sh                             |  25 +++
 9240549-attach.sh                             |  25 +++
 9261863-attach.sh                             |  25 +++
 .../configs/distillation_math_megatron.yaml   |   2 +
 examples/configs/grpo_math_1B_megatron.yaml   |   1 +
 .../configs/grpo_math_70B_megatron_fp8.yaml   |  13 +-
 examples/configs/grpo_math_8B_megatron.yaml   |   1 +
 .../configs/grpo_math_8B_megatron_fp8.yaml    |   2 +-
 ....7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml |   6 -
 ...8b-instruct-4n4g-megatrontp1pp2-quick.yaml |   4 -
 ...llama3.1-8b-instruct-4n8g-megatron.v2.yaml |   1 +
 ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml |   1 +
 .../grpo-dapomath17k-dsv3-32n4g-megatron.yaml |   7 -
 .../llm/grpo-dapomath17k-dsv3-megatron.yaml   |   1 +
 .../llm/grpo-gptoss-20b-8n4g-megatron.yaml    |   3 -
 ...nstruct-1n8g-megatron-fp8-rollouts.v3.yaml |   1 +
 ...3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml |   1 +
 ...po-llama3.2-1b-instruct-1n8g-megatron.yaml |   1 +
 ...-1b-instruct-1n8g-megatron_generation.yaml |   1 +
 .../grpo-moonlight-16ba3b-4n4g-megatron.yaml  |   6 -
 .../llm/grpo-nano-v2-12b-1n4g-megatron.yaml   |   3 -
 ...rpo-qwen2.5-7b-instruct-4n4g-megatron.yaml |   2 -
 .../llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml  |   4 -
 ...en3-8b-base-1n8g-fp8-kvcache-megatron.yaml |   1 +
 .../performance/grpo-deepseek-v3-32n4g.yaml   |   5 -
 .../grpo-deepseek-v3-64n4g-async-1off.yaml    |   5 -
 .../grpo-deepseek-v3-64n8g-async-1off.yaml    |   4 -
 ...grpo-deepseek-v3-64n8g-fp8-async-1off.yaml |   9 -
 ...-llama3.1-8b-instruct-2n4g-async-1off.yaml |   4 -
 ...-llama3.1-8b-instruct-2n8g-async-1off.yaml |   2 -
 ...ma3.1-8b-instruct-2n8g-fp8-async-1off.yaml |   8 -
 .../performance/grpo-qwen3-235b-16n4g.yaml    |   4 -
 .../grpo-qwen3-235b-32n4g-async-1off.yaml     |   4 -
 .../grpo-qwen3-235b-32n8g-async-1off.yaml     |   7 -
 .../grpo-qwen3-30ba3b-24n8g-async-8off.yaml   |   5 -
 .../grpo-qwen3-30ba3b-4n8g-async-1off.yaml    |   5 -
 .../grpo-qwen3-30ba3b-8n4g-async-1off.yaml    |   5 -
 .../grpo-qwen3-32b-8n4g-async-1off.yaml       |   4 -
 .../grpo-qwen3-32b-8n8g-async-1off.yaml       |   4 -
 ...lama3.1-70b-8n4g-tp2pp2-long-megatron.yaml |   2 -
 ...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml |   1 +
 .../sft-llama3.1-8b-1n8g-megatron-lora.yaml   |   1 +
 ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml |   1 +
 .../llm/sft-llama3.1-8b-1n8g-megatron.yaml    |   1 +
 .../llm/sft-qwen2.5-math7b-2n4g-megatron.yaml |   4 -
 ...3b-instruct-clevr-1n8g-megatrontp2.v1.yaml |  12 +-
 .../sft_openmathinstruct2_megatron.yaml       |   1 +
 examples/configs/vlm_grpo_3B.yaml             |  73 --------
 examples/configs/vlm_grpo_3B_megatron.yaml    |  59 -------
 nemo_rl/models/megatron/__init__.py           |   6 +-
 nemo_rl/models/megatron/recipe_config.py      | 156 ++++++------------
 nemo_rl/models/megatron/setup.py              |  20 +--
 nemo_rl/models/policy/__init__.py             |   5 +
 54 files changed, 181 insertions(+), 398 deletions(-)
 create mode 100755 9239646-attach.sh
 create mode 100755 9239676-attach.sh
 create mode 100755 9240549-attach.sh
 create mode 100755 9261863-attach.sh

diff --git a/9239646-attach.sh b/9239646-attach.sh
new file mode 100755
index 0000000000..8e318a4731
--- /dev/null
+++ b/9239646-attach.sh
@@ -0,0 +1,25 @@
+# No args launches on the head node (node 0)
+# Args 1-N launch on worker nodes (nodes 1 through N-1)
+# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
+WORKER_NUM=${1:-}
+if [[ -z "$WORKER_NUM" ]]; then
+  # Empty means we are on the head node
+  if [[ -n "${COMMAND:-}" ]]; then
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00753" --jobid 9239646 bash -c "$COMMAND"
+  else
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00753" --jobid 9239646 --pty bash
+  fi
+else
+  # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
+  # and use nodes_array[1] through nodes_array[N-1]
+  if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then
+    echo "Error: WORKER_NUM must be between 1 and 0"
+    exit 1
+  fi
+  nodes_array=(pool0-00753)
+  if [[ -n "${COMMAND:-}" ]]; then
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239646 bash -c "$COMMAND"
+  else
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239646 --pty bash
+  fi
+fi
diff --git a/9239676-attach.sh b/9239676-attach.sh
new file mode 100755
index 0000000000..68f99e49a6
--- /dev/null
+++ b/9239676-attach.sh
@@ -0,0 +1,25 @@
+# No args launches on the head node (node 0)
+# Args 1-N launch on worker nodes (nodes 1 through N-1)
+# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
+WORKER_NUM=${1:-}
+if [[ -z "$WORKER_NUM" ]]; then
+  # Empty means we are on the head node
+  if [[ -n "${COMMAND:-}" ]]; then
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01821" --jobid 9239676 bash -c "$COMMAND"
+  else
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01821" --jobid 9239676 --pty bash
+  fi
+else
+  # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
+  # and use nodes_array[1] through nodes_array[N-1]
+  if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then
+    echo "Error: WORKER_NUM must be between 1 and 0"
+    exit 1
+  fi
+  nodes_array=(pool0-01821)
+  if [[ -n "${COMMAND:-}" ]]; then
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239676 bash -c "$COMMAND"
+  else
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239676 --pty bash
+  fi
+fi
diff --git a/9240549-attach.sh b/9240549-attach.sh
new file mode 100755
index 0000000000..429b0deb56
--- /dev/null
+++ b/9240549-attach.sh
@@ -0,0 +1,25 @@
+# No args launches on the head node (node 0)
+# Args 1-N launch on worker nodes (nodes 1 through N-1)
+# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
+WORKER_NUM=${1:-}
+if [[ -z "$WORKER_NUM" ]]; then
+  # Empty means we are on the head node
+  if [[ -n "${COMMAND:-}" ]]; then
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01736" --jobid 9240549 bash -c "$COMMAND"
+  else
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01736" --jobid 9240549 --pty bash
+  fi
+else
+  # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
+  # and use nodes_array[1] through nodes_array[N-1]
+  if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then
+    echo "Error: WORKER_NUM must be between 1 and 0"
+    exit 1
+  fi
+  nodes_array=(pool0-01736)
+  if [[ -n "${COMMAND:-}" ]]; then
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9240549 bash -c "$COMMAND"
+  else
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9240549 --pty bash
+  fi
+fi
diff --git a/9261863-attach.sh b/9261863-attach.sh
new file mode 100755
index 0000000000..d1d6280cb3
--- /dev/null
+++ b/9261863-attach.sh
@@ -0,0 +1,25 @@
+# No args launches on the head node (node 0)
+# Args 1-N launch on worker nodes (nodes 1 through N-1)
+# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
+WORKER_NUM=${1:-}
+if [[ -z "$WORKER_NUM" ]]; then
+  # Empty means we are on the head node
+  if [[ -n "${COMMAND:-}" ]]; then
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00629" --jobid 9261863 bash -c "$COMMAND"
+  else
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00629" --jobid 9261863 --pty bash
+  fi
+else
+  # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
+  # and use nodes_array[1] through nodes_array[N-1]
+  if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then
+    echo "Error: WORKER_NUM must be between 1 and 0"
+    exit 1
+  fi
+  nodes_array=(pool0-00629)
+  if [[ -n "${COMMAND:-}" ]]; then
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9261863 bash -c "$COMMAND"
+  else
+    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9261863 --pty bash
+  fi
+fi
diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml
index ae2fbcd3e1..c3d1dd901a 100644
--- a/examples/configs/distillation_math_megatron.yaml
+++ b/examples/configs/distillation_math_megatron.yaml
@@ -37,6 +37,7 @@ policy: &POLICY_BASE
 
     megatron_cfg: &MEGATRON_BASE
         enabled: true
+        megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config
         empty_unused_memory_level: 0
         activation_checkpointing: false
         converter_type: "Qwen3ForCausalLM"
@@ -142,6 +143,7 @@ teacher:
     model_name: "Qwen/Qwen3-4B"
     megatron_cfg:
         <<: *MEGATRON_BASE
+        megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config
         context_parallel_size: 2
         tensor_model_parallel_size: 2
         pipeline_model_parallel_size: 2
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index a9368481ae..a4d7592f80 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -78,6 +78,7 @@ policy:
 
   megatron_cfg:
     enabled: true
+    megatron_recipe: null  # Set to a fully qualified recipe path to use a Megatron-Bridge recipe, e.g. megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
     converter_type: "Qwen2ForCausalLM"
diff --git a/examples/configs/grpo_math_70B_megatron_fp8.yaml b/examples/configs/grpo_math_70B_megatron_fp8.yaml
index df239cd8ff..322aa6c0f2 100644
--- a/examples/configs/grpo_math_70B_megatron_fp8.yaml
+++ b/examples/configs/grpo_math_70B_megatron_fp8.yaml
@@ -8,15 +8,4 @@ policy:
   generation:
     vllm_cfg:
       precision: "fp8"
-      use_deep_gemm: true
-  megatron_cfg:
-    pipeline_model_parallel_size: 8
-    fp8_cfg:
-      enabled: true
-      fp8: "e4m3"
-      fp8_recipe: "blockwise"
-      fp8_param: false
-    optimizer:
-      use_precision_aware_optimizer: false
-    env_vars:
-      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
\ No newline at end of file
+      use_deep_gemm: true
\ No newline at end of file
diff --git a/examples/configs/grpo_math_8B_megatron.yaml b/examples/configs/grpo_math_8B_megatron.yaml
index 977ab394b5..8d2ddfa90a 100644
--- a/examples/configs/grpo_math_8B_megatron.yaml
+++ b/examples/configs/grpo_math_8B_megatron.yaml
@@ -30,6 +30,7 @@ policy:
 
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     converter_type: "LlamaForCausalLM"
     tensor_model_parallel_size: 1
diff --git a/examples/configs/grpo_math_8B_megatron_fp8.yaml b/examples/configs/grpo_math_8B_megatron_fp8.yaml
index ba6ee6e5c8..9548979c1c 100644
--- a/examples/configs/grpo_math_8B_megatron_fp8.yaml
+++ b/examples/configs/grpo_math_8B_megatron_fp8.yaml
@@ -19,4 +19,4 @@ policy:
     optimizer:
       use_precision_aware_optimizer: false
     env_vars:
-      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
\ No newline at end of file
+      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml
index 95c9e85573..31100ce7b9 100644
--- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml
+++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml
@@ -1,10 +1,4 @@
 defaults: ./distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
-policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 1
-teacher:
-  megatron_cfg:
-    tensor_model_parallel_size: 2
 checkpointing:
   checkpoint_dir: checkpoints/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack
 logger:
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml
index 8324173dfc..1f75679f39 100644
--- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml
+++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml
@@ -1,8 +1,4 @@
 defaults: ./dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
-policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 1
-    sequence_parallel: false
 logger:
   wandb:
     name: dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml
index 8df4bc3fb0..a19a094bf5 100644
--- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml
+++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml
@@ -20,6 +20,7 @@ policy:
   optimizer: null
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     tensor_model_parallel_size: 4
 logger:
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
index 8b3a43ea28..83fac04256 100644
--- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
+++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
@@ -22,6 +22,7 @@ policy:
   optimizer: null
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     pipeline_model_parallel_size: 2
 logger:
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml
index fb4a4bc880..627a00574e 100644
--- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml
@@ -1,12 +1,5 @@
 defaults: ./grpo-dapomath17k-dsv3-megatron.yaml
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 4
-    expert_model_parallel_size: 16
-    pipeline_model_parallel_size: 4
-    context_parallel_size: 2
-    num_layers_in_first_pipeline_stage: 15
-    num_layers_in_last_pipeline_stage: 14
   make_sequence_length_divisible_by: 4
   generation:
     vllm_cfg:
diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml
index 8d19757d54..0378daaa41 100644
--- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml
@@ -19,6 +19,7 @@ policy:
   optimizer: null
   megatron_cfg:
     enabled: true
+    megatron_recipe: null  # Can be set to megatron.bridge.recipes.deepseek.deepseek_v3.deepseek_v3_pretrain_config
     activation_checkpointing: true
     tensor_model_parallel_size: 8
     expert_model_parallel_size: 32
diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml
index c9719f381f..ef033bc67f 100644
--- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml
@@ -1,8 +1,5 @@
 defaults: ./grpo-gptoss-20b-8n8g-megatron.yaml
 policy:
-  megatron_cfg:
-    expert_model_parallel_size: 4
-    tensor_model_parallel_size: 2
   generation:
     vllm_cfg:
       tensor_parallel_size: 1
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml
index dcd791eee6..f2c4af29c5 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml
@@ -18,6 +18,7 @@ policy:
     enabled: false
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     converter_type: LlamaForCausalLM
     pipeline_model_parallel_size: 2
     activation_checkpointing: true
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml
index 6411c6fb49..624ced10c5 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml
@@ -19,6 +19,7 @@ policy:
     enabled: false
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     converter_type: LlamaForCausalLM
     pipeline_model_parallel_size: 2
     activation_checkpointing: true
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
index 333a06d980..a3235a7b41 100755
--- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
@@ -12,6 +12,7 @@ policy:
   optimizer: null
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config
     scheduler:
       lr_warmup_iters: 50
   dtensor_cfg:
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml
index bb641388d8..728a711b48 100644
--- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml
@@ -12,6 +12,7 @@ policy:
   optimizer: null
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config
     scheduler:
       lr_warmup_iters: 50
   dtensor_cfg:
diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml
index 97d6ffede7..4459adc9dd 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml
@@ -1,12 +1,6 @@
 defaults: ./grpo-moonlight-16ba3b-4n8g-megatron.yaml
 checkpointing:
   checkpoint_dir: results/grpo-moonlight-16ba3b-4n4g-megatron
-policy:
-  megatron_cfg:
-    expert_model_parallel_size: 2
-    pipeline_model_parallel_size: 2
-    num_layers_in_first_pipeline_stage: 14
-    num_layers_in_last_pipeline_stage: 13
 logger:
   wandb:
     name: grpo-moonlight-16ba3b-4n4g-megatron
diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml
index da8301a19b..ba30e6490e 100644
--- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml
@@ -1,9 +1,6 @@
 defaults: ./grpo-nano-v2-12b-1n8g-megatron.yaml
 checkpointing:
   checkpoint_dir: results/grpo-nano-v2-12b-1n4g-megatron
-policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 4
 logger:
   log_dir: logs/grpo-nano-v2-12b-1n4g-megatron
   wandb:
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml
index b21c9dd51f..4029f002e8 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml
@@ -1,7 +1,5 @@
 defaults: ./grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 1
   make_sequence_length_divisible_by: 2
   generation:
     vllm_cfg:
diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml
index 79fbda389d..f26cbf49b2 100644
--- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml
@@ -1,9 +1,5 @@
 defaults: ./grpo-qwen3-30ba3b-8n8g-megatron.yaml
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 2
-    pipeline_model_parallel_size: 2
-    expert_model_parallel_size: 2
   make_sequence_length_divisible_by: 2
   generation:
     vllm_cfg:
diff --git a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
index 69ff4a4229..d155afac75 100644
--- a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
@@ -17,6 +17,7 @@ policy:
   scheduler: null
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_8b_pretrain_config
     converter_type: Qwen3ForCausalLM
     tensor_model_parallel_size: 4
     optimizer:
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
index 04fc067d6e..9b157ea779 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
@@ -4,11 +4,6 @@ checkpointing:
 policy:
   sequence_packing:
     enabled: false
-  megatron_cfg:
-    pipeline_model_parallel_size: 8
-    expert_model_parallel_size: 16
-    num_layers_in_first_pipeline_stage: 7
-    num_layers_in_last_pipeline_stage: 6
   generation:
     vllm_cfg:
       tensor_parallel_size: 32
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
index bf9a30a5d3..ff3e68d7da 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
@@ -4,11 +4,6 @@ checkpointing:
 policy:
   sequence_packing:
     enabled: false
-  megatron_cfg:
-    pipeline_model_parallel_size: 8
-    expert_model_parallel_size: 16
-    num_layers_in_first_pipeline_stage: 7
-    num_layers_in_last_pipeline_stage: 6
   generation:
     colocated:
       resources:
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml
index 595654a3a3..0260eb39e6 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml
@@ -10,10 +10,6 @@ checkpointing:
   checkpoint_dir: results/grpo-deepseek-v3-64n8g-async-1off
 policy:
   logprob_batch_size: 2
-  megatron_cfg:
-    tensor_model_parallel_size: 1
-    pipeline_model_parallel_size: 16
-    expert_model_parallel_size: 16
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
index 7f6b5ae86b..b8f8ece97d 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
@@ -2,15 +2,6 @@ defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml
 checkpointing:
   checkpoint_dir: results/grpo-deepseek-v3-64n8g-fp8-async-1off
 policy:
-  megatron_cfg:
-    fp8_cfg:
-      enabled: true
-      fp8: "e4m3"
-      fp8_recipe: "blockwise"
-      fp8_param: false
-    moe_router_dtype: fp32
-    env_vars:
-      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
   generation:
     vllm_cfg:
       tensor_parallel_size: 16
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
index d906eda2b4..ba06869c7e 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
@@ -9,10 +9,6 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g-async-1off
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    sequence_parallel: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
index c0263f68fb..b6d7ed441d 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
@@ -9,8 +9,6 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off
 policy:
-  megatron_cfg:
-    pipeline_model_parallel_size: 1
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
index b32786f7d7..1ab2bafaf4 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
@@ -2,14 +2,6 @@ defaults: ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
 checkpointing:
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off
 policy:
-  megatron_cfg:
-    fp8_cfg:
-      enabled: true
-      fp8: "e4m3"
-      fp8_recipe: "blockwise"
-      fp8_param: false
-    env_vars:
-      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
   generation:
     vllm_cfg:
       precision: "fp8"
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
index 1640deda09..d542091951 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
@@ -2,10 +2,6 @@ defaults: ./grpo-qwen3-235b-16n8g.yaml
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-235b-16n4g
 policy:
-  megatron_cfg:
-    pipeline_model_parallel_size: 4
-    num_layers_in_first_pipeline_stage: 23
-    num_layers_in_last_pipeline_stage: 23
   generation:
     vllm_cfg:
       tensor_parallel_size: 8
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
index f55b383686..13d8501363 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
@@ -2,10 +2,6 @@ defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off
 policy:
-  megatron_cfg:
-    pipeline_model_parallel_size: 4
-    num_layers_in_first_pipeline_stage: 23
-    num_layers_in_last_pipeline_stage: 23
   generation:
     colocated:
       resources:
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml
index cf4f5a6f98..4545aa364a 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml
@@ -9,13 +9,6 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-235b-32n8g-async-1off
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 4
-    sequence_parallel: true
-    context_parallel_size: 1
-    pipeline_model_parallel_size: 8
-    expert_model_parallel_size: 16
-    defer_fp32_logits: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml
index 11d917fc8b..b4d5409a61 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml
@@ -9,11 +9,6 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-30ba3b-24n8g-async-8off
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    expert_model_parallel_size: 8
-    sequence_parallel: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml
index 4cc5981460..797244576f 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml
@@ -9,11 +9,6 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g-async-1off
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 1
-    pipeline_model_parallel_size: 2
-    expert_model_parallel_size: 8
-    sequence_parallel: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
index a9837c87f2..6da999f169 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
@@ -9,11 +9,6 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-30ba3b-8n4g-async-1off
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    expert_model_parallel_size: 16
-    sequence_parallel: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
index 4f8a0a03bb..f50db7fde8 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
@@ -9,10 +9,6 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-32b-8n4g-async-1off
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 2
-    pipeline_model_parallel_size: 1
-    sequence_parallel: true
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml
index 9f20f34f40..a54a8e7747 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml
@@ -9,10 +9,6 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-32b-8n8g-async-1off
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 4
-    pipeline_model_parallel_size: 4
-    sequence_parallel: true
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml
index 77c175fadf..798d5e0617 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml
@@ -1,7 +1,5 @@
 defaults: ./sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
 policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 2
   make_sequence_length_divisible_by: 2
 checkpointing:
   checkpoint_dir: results/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron
diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
index bb43955812..33434d14e0 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
@@ -19,6 +19,7 @@ policy:
     enabled: false
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config
     tensor_model_parallel_size: 4
     pipeline_model_parallel_size: 2
     freeze_moe_router: true
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml
index b2b76c0afd..6c2b1117f6 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml
@@ -16,6 +16,7 @@ policy:
     enabled: false
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     peft:
       enabled: true
       dim: 128
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml
index aa62330e3e..257624dd9f 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml
@@ -21,6 +21,7 @@ policy:
   optimizer: null
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     tensor_model_parallel_size: 2
     pipeline_model_parallel_size: 2
     optimizer:
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml
index 7e9452dff7..100a87be03 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml
@@ -19,6 +19,7 @@ policy:
   optimizer: null
   megatron_cfg:
     enabled: true
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     tensor_model_parallel_size: 2
     pipeline_model_parallel_size: 2
     optimizer:
diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml
index aad3f5c8e0..903db6113b 100644
--- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml
@@ -1,8 +1,4 @@
 defaults: ./sft-qwen2.5-math7b-2n8g-megatron.yaml
-policy:
-  megatron_cfg:
-    tensor_model_parallel_size: 2
-    context_parallel_size: 1
 logger:
   wandb:
     name: sft-qwen2.5-math7b-2n4g-megatron
diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
index d81a58980e..d301812f91 100644
--- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
+++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
@@ -9,18 +9,8 @@ policy:
     enabled: false
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
-  megatron_cfg:
-    enabled: true
-    optimizer:
-      lr: 5.0e-07
-      min_lr: 5.0e-08
-    scheduler:
-      lr_warmup_iters: 50
-      lr_warmup_init: 5.0e-08
-    distributed_data_parallel_config:
-      overlap_grad_reduce: false
 logger:
   wandb:
     name: vlm-grpo-3b-megatron
 cluster:
-  gpus_per_node: 8
\ No newline at end of file
+  gpus_per_node: 8
diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml
index faca12e0ae..9df98169c8 100644
--- a/examples/configs/sft_openmathinstruct2_megatron.yaml
+++ b/examples/configs/sft_openmathinstruct2_megatron.yaml
@@ -33,6 +33,7 @@ policy:
     enabled: false
 
   megatron_cfg:
+    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     activation_checkpointing: false
     context_parallel_size: 1
     distributed_data_parallel_config:
diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml
index f9612007a4..c28b11add8 100644
--- a/examples/configs/vlm_grpo_3B.yaml
+++ b/examples/configs/vlm_grpo_3B.yaml
@@ -80,79 +80,6 @@ policy:
     context_parallel_size: 1
     custom_parallel_plan: null
 
-  megatron_cfg:
-    enabled: false
-    empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
-    activation_checkpointing: false
-    converter_type: "Qwen2ForCausalLM"
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
-    bias_activation_fusion: True
-    defer_fp32_logits: False
-    moe_per_layer_logging: False
-    moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
-    moe_shared_expert_overlap: false
-
-    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
-      # optimizer cpu offload
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
-      lr_warmup_iters: 13
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
-
   # dynamic_batching improves performance by ensuring logprob and training microbatches
   # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length
   # responses are sorted by sequence length and bucketed into microbatches with a total
diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml
index b32cd7df04..0c62a03954 100644
--- a/examples/configs/vlm_grpo_3B_megatron.yaml
+++ b/examples/configs/vlm_grpo_3B_megatron.yaml
@@ -123,65 +123,6 @@ policy:
       resources:
         gpus_per_node: null
         num_nodes: null
-  megatron_cfg:
-    enabled: true
-    empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
-    activation_checkpointing: false
-    converter_type: Qwen2ForCausalLM
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: fp64
-    moe_router_load_balancing_type: none
-    moe_router_bias_update_rate: 0.0
-    moe_permute_fusion: false
-    apply_rope_fusion: true
-    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
-    bias_activation_fusion: True
-    defer_fp32_logits: False
-    moe_per_layer_logging: False
-    moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
-    moe_shared_expert_overlap: false
-    optimizer:
-      optimizer: adam
-      lr: 2.0e-07
-      min_lr: 2.0e-07
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: float32
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1.0e-08
-      sgd_momentum: 0.9
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-      clip_grad: ${policy.max_grad_norm}
-      # optimizer cpu offload
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: constant
-      lr_decay_style: constant
-      lr_decay_iters: 1000
-      lr_warmup_iters: 50
-      lr_warmup_init: 2.0e-08
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: false
-      overlap_param_gather: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: optim_grads_params
 data:
   max_input_seq_length: ${policy.max_total_sequence_length}
   shuffle: true
diff --git a/nemo_rl/models/megatron/__init__.py b/nemo_rl/models/megatron/__init__.py
index f7ce1ab003..790146ecaa 100644
--- a/nemo_rl/models/megatron/__init__.py
+++ b/nemo_rl/models/megatron/__init__.py
@@ -13,11 +13,9 @@
 # limitations under the License.
 
 from nemo_rl.models.megatron.recipe_config import (
-    get_available_recipes,
-    get_recipe_function,
+    load_recipe,
 )
 
 __all__ = [
-    "get_available_recipes",
-    "get_recipe_function",
+    "load_recipe",
 ]
diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py
index 891129c503..ddcb7862e3 100644
--- a/nemo_rl/models/megatron/recipe_config.py
+++ b/nemo_rl/models/megatron/recipe_config.py
@@ -19,123 +19,63 @@
 allowing NeMo-RL to use pre-configured training recipes as a base and
 layer RL-specific settings on top.
 
-Example usage:
-    from nemo_rl.models.megatron.recipe_config import create_config_from_recipe
-    
-    megatron_cfg = create_config_from_recipe(
-        hf_model_name="meta-llama/Llama-3.1-8B-Instruct",
-        policy_config=config,
-        pretrained_path="/path/to/checkpoint",
-        weights_path=None,
-    )
-
-Internal flag for testing:
-    # To use pure recipe settings with minimal RL overrides (for testing):
-    megatron_cfg = create_config_from_recipe(
-        ...,
-        _apply_full_overrides=False,  # Internal flag - keeps recipe's optimizer/scheduler
-    )
-"""
-
-import warnings
-from typing import Any, Callable, Optional
+Recipes are specified via their fully qualified Python import path in the
+YAML config under ``policy.megatron_cfg.megatron_recipe``. For example:
 
-import torch
-from megatron.bridge import AutoBridge
-from megatron.bridge.training.config import (
-    CheckpointConfig,
-    ConfigContainer,
-    DistributedDataParallelConfig,
-    LoggerConfig,
-    OptimizerConfig,
-    SchedulerConfig,
-    TokenizerConfig,
-    TrainingConfig,
-)
-
-from nemo_rl.models.policy import PolicyConfig
+    policy:
+      megatron_cfg:
+        megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
+        ...
 
+The import path is resolved at runtime using ``load_recipe()``.
+"""
 
-# =============================================================================
-# RECIPE DISCOVERY
-# =============================================================================
+import importlib
 
-def _import_llama_recipes():
-    """Import Llama recipes from Megatron-Bridge."""
-    try:
-        from megatron.bridge.recipes.llama.llama3 import (
-            llama31_8b_pretrain_config,
-            llama31_70b_pretrain_config,
-            llama31_405b_pretrain_config,
-            llama3_8b_pretrain_config,
-            llama3_70b_pretrain_config,
-            llama32_1b_pretrain_config,
-            llama32_3b_pretrain_config,
-        )
-        return {
-            "llama-3.2-1b": llama32_1b_pretrain_config,
-            "llama-3.2-3b": llama32_3b_pretrain_config,
-            "llama-3-8b": llama3_8b_pretrain_config,
-            "llama-3.1-8b": llama31_8b_pretrain_config,
-            "meta-llama-3-8b": llama3_8b_pretrain_config,
-            "meta-llama-3.1-8b": llama31_8b_pretrain_config,
-            "llama-3-70b": llama3_70b_pretrain_config,
-            "llama-3.1-70b": llama31_70b_pretrain_config,
-            "llama-3.1-405b": llama31_405b_pretrain_config,
-        }
-    except ImportError:
-        return {}
-
-
-def _import_qwen_recipes():
-    """Import Qwen recipes from Megatron-Bridge."""
-    try:
-        from megatron.bridge.recipes.qwen.qwen3 import (
-            qwen3_600m_pretrain_config,
-            qwen3_1p7b_pretrain_config,
-            qwen3_4b_pretrain_config,
-            qwen3_8b_pretrain_config,
-        )
-        return {
-            "qwen3-0.6b": qwen3_600m_pretrain_config,
-            "qwen3-1.7b": qwen3_1p7b_pretrain_config,
-            "qwen3-4b": qwen3_4b_pretrain_config,
-            "qwen3-8b": qwen3_8b_pretrain_config,
-        }
-    except ImportError:
-        return {}
+from megatron.bridge.training.config import ConfigContainer
 
 
-def get_recipe_function(hf_model_name: str) -> Optional[Callable[..., ConfigContainer]]:
+def load_recipe(recipe_path: str) -> ConfigContainer:
     """
-    Get the appropriate Megatron-Bridge recipe function for a model.
-    
+    Dynamically import and call a Megatron-Bridge recipe function.
+
     Args:
-        hf_model_name: HuggingFace model name or path
-        
+        recipe_path: Fully qualified Python import path to the recipe function.
+            For example: ``megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config``
+
     Returns:
-        Recipe function or None if no matching recipe found
+        The ConfigContainer returned by calling the recipe function with no arguments.
+
+    Raises:
+        ValueError: If the path is malformed, the module cannot be imported, or the name is missing.
+        TypeError: If the resolved object is not callable.
     """
-    model_lower = hf_model_name.lower().replace("/", "-").replace("_", "-")
-    
-    # Load recipes lazily
-    all_recipes = {}
-    all_recipes.update(_import_llama_recipes())
-    all_recipes.update(_import_qwen_recipes())
-    
-    # Try match
-    for pattern, recipe_fn in all_recipes.items():
-        if pattern in model_lower:
-            return recipe_fn
-    
-    return None
-
-
-def get_available_recipes() -> list[str]:
-    """Return a list of available recipe patterns."""
-    all_recipes = {}
-    all_recipes.update(_import_llama_recipes())
-    all_recipes.update(_import_qwen_recipes())
-    return list(all_recipes.keys())
+    module_path, _, func_name = recipe_path.rpartition(".")
+    if not module_path or not func_name:
+        raise ValueError(
+            f"Invalid recipe path '{recipe_path}'. "
+            "Expected a fully qualified Python path like "
+            "'megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config'"
+        )
 
+    try:
+        module = importlib.import_module(module_path)
+    except ImportError as e:
+        raise ValueError(
+            f"Could not import module '{module_path}' from recipe path '{recipe_path}': {e}"
+        ) from e
+
+    recipe_fn = getattr(module, func_name, None)
+    if recipe_fn is None:
+        raise ValueError(
+            f"Module '{module_path}' has no attribute '{func_name}'. "
+            f"Check that the recipe function name is correct in '{recipe_path}'."
+        )
+
+    if not callable(recipe_fn):
+        raise TypeError(
+            f"'{recipe_path}' resolved to a non-callable object of type {type(recipe_fn).__name__}. "
+            "Expected a recipe function."
+        )
 
+    return recipe_fn()
diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 14dcbabcb3..cfe49b7ad6 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -69,9 +69,7 @@
 from nemo_rl.distributed.named_sharding import NamedSharding
 from nemo_rl.models.megatron.community_import import import_model_from_hf_name
 from nemo_rl.models.megatron.config import ModelAndOptimizerState, RuntimeConfig
-from nemo_rl.models.megatron.recipe_config import (
-    get_recipe_function,
-)
+from nemo_rl.models.megatron.recipe_config import load_recipe
 from nemo_rl.models.policy import PolicyConfig
 from nemo_rl.models.policy.utils import (
     configure_dynamo_cache,
@@ -235,7 +233,6 @@ def validate_and_set_config(
         hf_model_name=hf_model_name,
         pretrained_path=pretrained_path,
         weights_path=weights_path,
-        use_recipe=True,
     )
 
     final_padded_vocab_size = calculate_padded_vocab_size(
@@ -279,20 +276,15 @@ def setup_model_config(
     hf_model_name: str,
     pretrained_path: str,
     weights_path: Optional[str] = None,
-    use_recipe: bool = True,
 ) -> tuple[ConfigContainer, Any]:
     """Setup model configuration."""
     model_cfg = None
-    use_recipe_for_model = use_recipe and get_recipe_function(hf_model_name) is not None
-
-    if use_recipe_for_model:
-        # Use Megatron-Bridge golden recipes
-        print(f"[INFO] Using Megatron-Bridge recipe-based config for {hf_model_name}")
-        recipe_fn = get_recipe_function(hf_model_name)
-        if recipe_fn is None:
-            raise ValueError(f"No recipe found for {hf_model_name}")
+    megatron_recipe = config["megatron_cfg"].get("megatron_recipe")
 
-        megatron_cfg = recipe_fn()
+    if megatron_recipe:
+        # Use Megatron-Bridge recipe specified in config
+        print(f"[INFO] Using Megatron-Bridge recipe: {megatron_recipe}")
+        megatron_cfg = load_recipe(megatron_recipe)
         model_cfg = megatron_cfg.model
     else:
         # Load pretrained run config
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index 363399cbca..fa8d1d4501 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -158,6 +158,11 @@ class MegatronConfigDisabled(TypedDict):
 
 class MegatronConfig(TypedDict):
     enabled: Literal[True]
+    # Fully qualified Python import path to a Megatron-Bridge recipe function.
+    # When set, the recipe is loaded at runtime to provide the base model configuration.
+    # When null/unset, configuration is loaded from the checkpoint's run_config.yaml.
+    # Example: "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config"
+    megatron_recipe: NotRequired[str | None]
     env_vars: NotRequired[dict[str, str] | None]
     # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation.
     # Setting to 0 is faster, but you are more likely to run out of GPU memory. In SFT/DPO, the default is 0.

From 824fbdb6c8033eae8835543aa889e17f6ca1ce01 Mon Sep 17 00:00:00 2001
From: Sherif Fawzy <sfawzy@nvidia.com>
Date: Fri, 6 Feb 2026 09:44:35 -0800
Subject: [PATCH 3/8] Move megatron_recipe to policy level; prune unused megatron_cfg blocks and attach scripts

---
 9239646-attach.sh                             | 25 ------
 9239676-attach.sh                             | 25 ------
 9240549-attach.sh                             | 25 ------
 9261863-attach.sh                             | 25 ------
 examples/configs/distillation_math.yaml       | 71 ---------------
 .../configs/distillation_math_megatron.yaml   |  5 +-
 examples/configs/dpo.yaml                     | 71 ---------------
 examples/configs/grpo_math_1B.yaml            | 75 ----------------
 examples/configs/grpo_math_1B_megatron.yaml   | 69 +--------------
 examples/configs/grpo_math_70B_megatron.yaml  |  2 +
 examples/configs/grpo_math_8B_megatron.yaml   |  3 +-
 .../grpo_math_qwen30ba3b_megatron.yaml        |  2 +
 ....7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml |  1 +
 ...llama3.1-8b-instruct-4n8g-megatron.v2.yaml |  2 +-
 ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml |  2 +-
 .../llm/grpo-dapomath17k-dsv3-megatron.yaml   |  2 +-
 .../llm/grpo-gptoss-20b-8n8g-megatron.yaml    |  1 +
 ...nstruct-1n8g-megatron-fp8-rollouts.v3.yaml |  2 +-
 ...3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml |  2 +-
 ...po-llama3.2-1b-instruct-1n8g-megatron.yaml |  2 +-
 ...-1b-instruct-1n8g-megatron_generation.yaml |  2 +-
 ...po-math-qwen3-30ba3b-megatron-tp4-32k.yaml |  1 +
 .../grpo-moonlight-16ba3b-4n8g-megatron.yaml  |  1 +
 .../llm/grpo-nano-v2-12b-1n8g-megatron.yaml   |  1 +
 ...rpo-qwen2.5-7b-instruct-4n8g-megatron.yaml |  1 +
 .../llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml  |  1 +
 ...en3-8b-base-1n8g-fp8-kvcache-megatron.yaml |  2 +-
 ...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml |  2 +-
 .../sft-llama3.1-8b-1n8g-megatron-lora.yaml   |  2 +-
 ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml |  2 +-
 .../llm/sft-llama3.1-8b-1n8g-megatron.yaml    |  2 +-
 .../llm/sft-qwen2.5-math7b-2n8g-megatron.yaml |  1 +
 ...3b-instruct-clevr-1n8g-megatrontp2.v1.yaml |  2 +-
 examples/configs/rm.yaml                      | 56 ------------
 examples/configs/sft.yaml                     | 87 -------------------
 examples/configs/sft_openmathinstruct2.yaml   |  3 -
 .../sft_openmathinstruct2_megatron.yaml       |  3 +-
 examples/configs/vlm_grpo_3B_megatron.yaml    |  1 -
 nemo_rl/models/megatron/recipe_config.py      |  4 +-
 nemo_rl/models/megatron/setup.py              | 38 ++++----
 nemo_rl/models/policy/__init__.py             | 10 +--
 41 files changed, 62 insertions(+), 572 deletions(-)
 delete mode 100755 9239646-attach.sh
 delete mode 100755 9239676-attach.sh
 delete mode 100755 9240549-attach.sh
 delete mode 100755 9261863-attach.sh

diff --git a/9239646-attach.sh b/9239646-attach.sh
deleted file mode 100755
index 8e318a4731..0000000000
--- a/9239646-attach.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-# No args launches on the head node (node 0)
-# Args 1-N launch on worker nodes (nodes 1 through N-1)
-# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
-WORKER_NUM=${1:-}
-if [[ -z "$WORKER_NUM" ]]; then
-  # Empty means we are on the head node
-  if [[ -n "${COMMAND:-}" ]]; then
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00753" --jobid 9239646 bash -c "$COMMAND"
-  else
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00753" --jobid 9239646 --pty bash
-  fi
-else
-  # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
-  # and use nodes_array[1] through nodes_array[N-1]
-  if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then
-    echo "Error: WORKER_NUM must be between 1 and 0"
-    exit 1
-  fi
-  nodes_array=(pool0-00753)
-  if [[ -n "${COMMAND:-}" ]]; then
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239646 bash -c "$COMMAND"
-  else
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239646 --pty bash
-  fi
-fi
diff --git a/9239676-attach.sh b/9239676-attach.sh
deleted file mode 100755
index 68f99e49a6..0000000000
--- a/9239676-attach.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-# No args launches on the head node (node 0)
-# Args 1-N launch on worker nodes (nodes 1 through N-1)
-# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
-WORKER_NUM=${1:-}
-if [[ -z "$WORKER_NUM" ]]; then
-  # Empty means we are on the head node
-  if [[ -n "${COMMAND:-}" ]]; then
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01821" --jobid 9239676 bash -c "$COMMAND"
-  else
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01821" --jobid 9239676 --pty bash
-  fi
-else
-  # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
-  # and use nodes_array[1] through nodes_array[N-1]
-  if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then
-    echo "Error: WORKER_NUM must be between 1 and 0"
-    exit 1
-  fi
-  nodes_array=(pool0-01821)
-  if [[ -n "${COMMAND:-}" ]]; then
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239676 bash -c "$COMMAND"
-  else
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239676 --pty bash
-  fi
-fi
diff --git a/9240549-attach.sh b/9240549-attach.sh
deleted file mode 100755
index 429b0deb56..0000000000
--- a/9240549-attach.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-# No args launches on the head node (node 0)
-# Args 1-N launch on worker nodes (nodes 1 through N-1)
-# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
-WORKER_NUM=${1:-}
-if [[ -z "$WORKER_NUM" ]]; then
-  # Empty means we are on the head node
-  if [[ -n "${COMMAND:-}" ]]; then
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01736" --jobid 9240549 bash -c "$COMMAND"
-  else
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01736" --jobid 9240549 --pty bash
-  fi
-else
-  # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
-  # and use nodes_array[1] through nodes_array[N-1]
-  if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then
-    echo "Error: WORKER_NUM must be between 1 and 0"
-    exit 1
-  fi
-  nodes_array=(pool0-01736)
-  if [[ -n "${COMMAND:-}" ]]; then
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9240549 bash -c "$COMMAND"
-  else
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9240549 --pty bash
-  fi
-fi
diff --git a/9261863-attach.sh b/9261863-attach.sh
deleted file mode 100755
index d1d6280cb3..0000000000
--- a/9261863-attach.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-# No args launches on the head node (node 0)
-# Args 1-N launch on worker nodes (nodes 1 through N-1)
-# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
-WORKER_NUM=${1:-}
-if [[ -z "$WORKER_NUM" ]]; then
-  # Empty means we are on the head node
-  if [[ -n "${COMMAND:-}" ]]; then
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00629" --jobid 9261863 bash -c "$COMMAND"
-  else
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00629" --jobid 9261863 --pty bash
-  fi
-else
-  # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
-  # and use nodes_array[1] through nodes_array[N-1]
-  if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then
-    echo "Error: WORKER_NUM must be between 1 and 0"
-    exit 1
-  fi
-  nodes_array=(pool0-00629)
-  if [[ -n "${COMMAND:-}" ]]; then
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9261863 bash -c "$COMMAND"
-  else
-    srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9261863 --pty bash
-  fi
-fi
diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml
index 67ff8a71d2..891976166d 100644
--- a/examples/configs/distillation_math.yaml
+++ b/examples/configs/distillation_math.yaml
@@ -84,77 +84,6 @@ policy: &POLICY_BASE
             foreach: False
             fused: False
 
-    megatron_cfg: &MEGATRON_BASE
-        enabled: false
-        empty_unused_memory_level: 0
-        activation_checkpointing: false
-        converter_type: "Qwen3ForCausalLM"
-        tensor_model_parallel_size: 2
-        expert_tensor_parallel_size: 1
-        expert_model_parallel_size: 1
-        pipeline_model_parallel_size: 2
-        num_layers_in_first_pipeline_stage: null
-        num_layers_in_last_pipeline_stage: null
-        context_parallel_size: 2
-        pipeline_dtype: ${policy.precision}
-        sequence_parallel: false
-        freeze_moe_router: true
-        moe_router_dtype: "fp64"
-        moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-        moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-        moe_permute_fusion: false
-        #gives ~20% training perf speedup with sequence packing 
-        apply_rope_fusion: True
-        bias_activation_fusion: True
-        defer_fp32_logits: False
-        moe_per_layer_logging: False
-        moe_enable_deepep: false
-        moe_token_dispatcher_type: "allgather"
-        moe_shared_expert_overlap: false
-        
-        optimizer:
-            optimizer: "adam"
-            lr: 2.00001e-5
-            min_lr: 2.0e-5
-            weight_decay: 0.01
-            bf16: true
-            fp16: false
-            params_dtype: "float32"
-
-            #adam
-            adam_beta1: 0.9
-            adam_beta2: 0.999
-            adam_eps: 1e-8
-
-            #sgd
-            sgd_momentum: 0.9
-
-            #distributed optimizer
-            use_distributed_optimizer: true
-            use_precision_aware_optimizer: true
-
-            # optimizer cpu offload
-            optimizer_cpu_offload: false
-            optimizer_offload_fraction: 0.0
-
-            clip_grad: ${policy.max_grad_norm}
-
-        scheduler:
-            start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-            end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-            weight_decay_incr_style: "constant"
-            lr_decay_style: "constant"
-            lr_decay_iters: 1000
-            lr_warmup_iters: 10
-            lr_warmup_init: 2.0e-6
-
-        distributed_data_parallel_config:
-            grad_reduce_in_fp32: false
-            overlap_grad_reduce: true
-            overlap_param_gather: true
-            use_custom_fsdp: false
-            data_parallel_sharding_strategy: "optim_grads_params"
-
     scheduler:
         - name: "torch.optim.lr_scheduler.LinearLR"
           kwargs:
diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml
index c3d1dd901a..2865707fbb 100644
--- a/examples/configs/distillation_math_megatron.yaml
+++ b/examples/configs/distillation_math_megatron.yaml
@@ -35,9 +35,10 @@ policy: &POLICY_BASE
 
     make_sequence_length_divisible_by: ${mul:${mul:${.megatron_cfg.tensor_model_parallel_size}, ${.megatron_cfg.context_parallel_size}}, 2}
 
+    megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config
+
     megatron_cfg: &MEGATRON_BASE
         enabled: true
-        megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config
         empty_unused_memory_level: 0
         activation_checkpointing: false
         converter_type: "Qwen3ForCausalLM"
@@ -141,9 +142,9 @@ policy: &POLICY_BASE
 teacher:
     <<: *POLICY_BASE
     model_name: "Qwen/Qwen3-4B"
+    megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config
     megatron_cfg:
         <<: *MEGATRON_BASE
-        megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config
         context_parallel_size: 2
         tensor_model_parallel_size: 2
         pipeline_model_parallel_size: 2
diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml
index f2b57b0bbd..ef21c555d2 100755
--- a/examples/configs/dpo.yaml
+++ b/examples/configs/dpo.yaml
@@ -106,78 +106,7 @@ policy:
         factor: 1.0
         total_iters: 10000000000
     - milestones: [20]
-    
-  ## ignored since enabled=false, but needed for testing purposes
-  megatron_cfg:
-    enabled: false
-    empty_unused_memory_level: 1
-    activation_checkpointing: false
-    tensor_model_parallel_size: 2
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    sequence_parallel: true
-    freeze_moe_router: false
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "aux_loss"
-    moe_router_bias_update_rate: 1e-3
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing 
-    apply_rope_fusion: True
-    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
-    bias_activation_fusion: True
-    defer_fp32_logits: False
-    moe_per_layer_logging: False
-    moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
-    moe_shared_expert_overlap: false
-    
-    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-6 #4.0e-5
-      min_lr: 5.0e-6 #4.0e-5
-      weight_decay: 0.1
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.98
-      adam_eps: 1e-8
 
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
-      # optimizer cpu offload
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_warmup_iters: 1
-      lr_warmup_init: 0.00000001
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      data_parallel_sharding_strategy: "optim_grads_params"
-      use_custom_fsdp: false
-    
 data:
   max_input_seq_length: ${policy.max_total_sequence_length}
   shuffle: true
diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 90269726d7..35dbe01e79 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -105,81 +105,6 @@ policy:
       lora_A_init: "xavier"  # Initialization method for LoRA A matrix: "xavier" or "uniform"
       use_triton: true  # Use Triton-optimized kernels for LoRA (faster but requires flash-attn). Disable when tensor_parallel_size > 1
   
-  megatron_cfg:
-    enabled: false
-    empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
-    activation_checkpointing: false
-    converter_type: "Qwen2ForCausalLM"
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
-    bias_activation_fusion: True
-    defer_fp32_logits: False
-    moe_per_layer_logging: False
-    moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
-    moe_shared_expert_overlap: false
-
-    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
-      # optimizer cpu offload
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
-      lr_warmup_iters: 13
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
-    fp8_cfg: null
-
-    env_vars: null
 
   # See docs/design-docs/sequence-packing-and-dynamic-batching.md 
   # for more details on dynamic batching and sequence packing.
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index a4d7592f80..671e0cbbb1 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -70,76 +70,9 @@ policy:
     sequence_length_round: 64
 
   max_grad_norm: 1.0
-  # makes the training sequence length divisible by the tensor parallel size
-  # this is useful for sequence parallel training
-  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
 
-  optimizer: null # remove default FSDP optimizer
-
-  megatron_cfg:
-    enabled: true
-    megatron_recipe: null  # Set to a fully qualified recipe path to use a Megatron-Bridge recipe, e.g. megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
-    empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
-    activation_checkpointing: false
-    converter_type: "Qwen2ForCausalLM"
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
-    moe_shared_expert_overlap: false
-    #gives ~20% training perf speedup with sequence packing 
-    apply_rope_fusion: True
-    
-    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
 
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
-      lr_warmup_iters: 13
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
+  optimizer: null # remove default FSDP optimizer
 
   generation:
     backend: "vllm"
diff --git a/examples/configs/grpo_math_70B_megatron.yaml b/examples/configs/grpo_math_70B_megatron.yaml
index 4d17fdcea3..c89e4e57b8 100644
--- a/examples/configs/grpo_math_70B_megatron.yaml
+++ b/examples/configs/grpo_math_70B_megatron.yaml
@@ -22,6 +22,8 @@ policy:
 
   scheduler: null # remove default FSDP scheduler
 
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config
+  
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/grpo_math_8B_megatron.yaml b/examples/configs/grpo_math_8B_megatron.yaml
index 8d2ddfa90a..e52b3d2d3e 100644
--- a/examples/configs/grpo_math_8B_megatron.yaml
+++ b/examples/configs/grpo_math_8B_megatron.yaml
@@ -28,9 +28,10 @@ policy:
 
   scheduler: null # remove default FSDP scheduler
 
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
+
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     converter_type: "LlamaForCausalLM"
     tensor_model_parallel_size: 1
diff --git a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml
index 37616e32b0..81d812372e 100644
--- a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml
+++ b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml
@@ -26,6 +26,8 @@ policy:
 
   scheduler: null # remove default FSDP scheduler
 
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config
+  
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
index 6fda3fe24e..6ae8b1ff1a 100644
--- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
+++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
@@ -22,6 +22,7 @@ policy:
     ${.megatron_cfg.context_parallel_size}}, 2}
   megatron_cfg:
     enabled: true
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
 teacher:
   model_name: Qwen/Qwen3-32B
   dtensor_cfg:
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml
index a19a094bf5..44843ac0c1 100644
--- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml
+++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml
@@ -18,9 +18,9 @@ policy:
     enabled: false
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     tensor_model_parallel_size: 4
 logger:
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
index 83fac04256..8e8b2a8a3d 100644
--- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
+++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
@@ -20,9 +20,9 @@ policy:
     enabled: false
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     pipeline_model_parallel_size: 2
 logger:
   wandb_enabled: true
diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml
index 0378daaa41..0523d30ac8 100644
--- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml
@@ -17,9 +17,9 @@ policy:
     enabled: false
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.deepseek.deepseek_v3.deepseek_v3_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: null  # Can be set to megatron.bridge.recipes.deepseek.deepseek_v3.deepseek_v3_pretrain_config
     activation_checkpointing: true
     tensor_model_parallel_size: 8
     expert_model_parallel_size: 32
diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml
index b3dec78e98..58655c471e 100755
--- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml
@@ -8,6 +8,7 @@ policy:
   model_name: openai/gpt-oss-20b
   train_micro_batch_size: 1
   max_total_sequence_length: 4096
+  megatron_recipe: megatron.bridge.recipes.openai.gptoss.gptoss_20b_pretrain_config
   megatron_cfg:
     enabled: true
     expert_model_parallel_size: 8
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml
index f2c4af29c5..8d21260fc6 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml
@@ -16,9 +16,9 @@ policy:
   make_sequence_length_divisible_by: 1
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     converter_type: LlamaForCausalLM
     pipeline_model_parallel_size: 2
     activation_checkpointing: true
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml
index 624ced10c5..4930f552c2 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml
@@ -17,9 +17,9 @@ policy:
   make_sequence_length_divisible_by: 1
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     converter_type: LlamaForCausalLM
     pipeline_model_parallel_size: 2
     activation_checkpointing: true
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
index a3235a7b41..3133e9d3eb 100755
--- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
@@ -10,9 +10,9 @@ policy:
   tokenizer:
     name: meta-llama/Llama-3.2-1B-Instruct
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config
     scheduler:
       lr_warmup_iters: 50
   dtensor_cfg:
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml
index 728a711b48..f89d752e81 100644
--- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml
@@ -10,9 +10,9 @@ policy:
   tokenizer:
     name: meta-llama/Llama-3.2-1B-Instruct
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config
     scheduler:
       lr_warmup_iters: 50
   dtensor_cfg:
diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
index 92fb87c196..071623c60f 100644
--- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
+++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
@@ -20,6 +20,7 @@ policy:
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
   scheduler: null
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config
   megatron_cfg:
     enabled: true
     converter_type: LlamaForCausalLM
diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
index 83ea6128ef..2c20dcf8ba 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
@@ -20,6 +20,7 @@ policy:
     algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.nvidia.moonlight.moonlight_16b_a3b_pretrain_config
   megatron_cfg:
     enabled: true
     expert_model_parallel_size: 4
diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml
index 86690abcc2..31ed4e07b0 100644
--- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml
@@ -8,6 +8,7 @@ policy:
   tokenizer:
     name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.nvidia.nemotron.nemotron_nano_12b_v2_pretrain_config
   megatron_cfg:
     enabled: true
     bias_activation_fusion: false
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
index fd0a48a663..51b0a9c5b6 100755
--- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
@@ -14,6 +14,7 @@ policy:
   max_total_sequence_length: 4096
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_7b_pretrain_config
   megatron_cfg:
     enabled: true
     tensor_model_parallel_size: 2
diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
index 6e0aa5cd81..830a259e98 100755
--- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
@@ -17,6 +17,7 @@ policy:
     enabled: false
     algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config
   megatron_cfg:
     enabled: true
     tensor_model_parallel_size: 4
diff --git a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
index d155afac75..aefc0a09ef 100644
--- a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
@@ -15,9 +15,9 @@ policy:
     enabled: false
   optimizer: null
   scheduler: null
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_8b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_8b_pretrain_config
     converter_type: Qwen3ForCausalLM
     tensor_model_parallel_size: 4
     optimizer:
diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
index 33434d14e0..c638f8a85d 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
@@ -17,9 +17,9 @@ policy:
   max_total_sequence_length: 4096
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config
     tensor_model_parallel_size: 4
     pipeline_model_parallel_size: 2
     freeze_moe_router: true
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml
index 6c2b1117f6..96ccf66d44 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml
@@ -14,9 +14,9 @@ policy:
     chat_template: default
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     peft:
       enabled: true
       dim: 128
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml
index 257624dd9f..43e358acea 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml
@@ -19,9 +19,9 @@ policy:
     enabled: true
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     tensor_model_parallel_size: 2
     pipeline_model_parallel_size: 2
     optimizer:
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml
index 100a87be03..d3ba4e5a28 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml
@@ -17,9 +17,9 @@ policy:
     enabled: false
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     tensor_model_parallel_size: 2
     pipeline_model_parallel_size: 2
     optimizer:
diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml
index d3bdd77bb2..cfe381fd33 100644
--- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml
@@ -9,6 +9,7 @@ policy:
   max_total_sequence_length: 16384
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_math_7b_pretrain_config
   megatron_cfg:
     enabled: true
     tensor_model_parallel_size: 4
diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
index d301812f91..ac8e882875 100644
--- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
+++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
@@ -7,8 +7,8 @@ policy:
     enabled: false
   dynamic_batching:
     enabled: false
-  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_vl_3b_pretrain_config
 logger:
   wandb:
     name: vlm-grpo-3b-megatron
diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml
index 4b0936fec5..49e56d11e8 100644
--- a/examples/configs/rm.yaml
+++ b/examples/configs/rm.yaml
@@ -73,62 +73,6 @@ policy:
       foreach: false
       fused: false
     
-  ## ignored since enabled=false, but needed for testing purposes
-  megatron_cfg:
-    enabled: false
-    empty_unused_memory_level: 1
-    activation_checkpointing: false
-    tensor_model_parallel_size: 2
-    pipeline_model_parallel_size: 2
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    sequence_parallel: false
-
-    optimizer:
-      optimizer: "adam"
-      lr: 2.0e-6
-      min_lr: 1.9999e-6
-      weight_decay: 0.1
-      bf16: false
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.98
-      adam_eps: 1e-5
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
-      # optimizer cpu offload
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
-      lr_warmup_iters: 50
-      lr_warmup_init: 1.9999e-6
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
-    
 data:
   max_input_seq_length: ${policy.max_total_sequence_length}
   shuffle: true
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
index 6d53d7f606..71d8c1cc84 100644
--- a/examples/configs/sft.yaml
+++ b/examples/configs/sft.yaml
@@ -88,93 +88,6 @@ policy:
       foreach: False
       fused: False
     
-  ## ignored since enabled=false, but needed for testing purposes
-  megatron_cfg:
-    enabled: false
-    env_vars: {}
-    empty_unused_memory_level: 1
-    activation_checkpointing: false
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    sequence_parallel: false
-    freeze_moe_router: false
-    moe_router_dtype: null
-    moe_router_load_balancing_type: "aux_loss"
-    moe_router_bias_update_rate: 1e-3
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing 
-    apply_rope_fusion: True
-    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
-    bias_activation_fusion: True
-    defer_fp32_logits: False
-    moe_per_layer_logging: False
-    moe_enable_deepep: false
-    moe_token_dispatcher_type: "allgather"
-    moe_shared_expert_overlap: false
-
-    peft:
-      enabled: false
-      target_modules: []
-      exclude_modules: []
-      dim: 8
-      alpha: 32
-      dropout: 0.0
-      dropout_position: "post"
-      lora_A_init_method: "xavier"
-      lora_B_init_method: "zero"
-      a2a_experimental: false
-      lora_dtype: None
-
-
-    optimizer:
-      optimizer: "adam" # When weight decay is set, it actually uses AdamW 
-      lr: 5.0e-6
-      min_lr: 4.9999e-6
-      weight_decay: 0.1 # When weight decay is set, it actually uses AdamW
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.98
-      adam_eps: 1e-5
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
-      # optimizer cpu offload
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
-      lr_warmup_iters: 50
-      lr_warmup_init: 4.9999e-6
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      data_parallel_sharding_strategy: "optim_grads_params"
-      use_custom_fsdp: false
-
 data:
   max_input_seq_length: ${policy.max_total_sequence_length}
   add_bos: true
diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml
index 63fa6d65e4..00b7bbf8e7 100644
--- a/examples/configs/sft_openmathinstruct2.yaml
+++ b/examples/configs/sft_openmathinstruct2.yaml
@@ -39,9 +39,6 @@ policy:
     context_parallel_size: 1
     custom_parallel_plan: null
 
-  megatron_cfg:
-    enabled: false
-
   dynamic_batching:
     enabled: false
 
diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml
index 9df98169c8..2d137012ef 100644
--- a/examples/configs/sft_openmathinstruct2_megatron.yaml
+++ b/examples/configs/sft_openmathinstruct2_megatron.yaml
@@ -32,8 +32,9 @@ policy:
   dtensor_cfg:
     enabled: false
 
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
+
   megatron_cfg:
-    megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
     activation_checkpointing: false
     context_parallel_size: 1
     distributed_data_parallel_config:
diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml
index 0c62a03954..9b0275ca47 100644
--- a/examples/configs/vlm_grpo_3B_megatron.yaml
+++ b/examples/configs/vlm_grpo_3B_megatron.yaml
@@ -77,7 +77,6 @@ policy:
     train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
     logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
     sequence_length_round: 64
-  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   max_grad_norm: 1.0
   sequence_packing:
     enabled: false
diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py
index ddcb7862e3..21861fc403 100644
--- a/nemo_rl/models/megatron/recipe_config.py
+++ b/nemo_rl/models/megatron/recipe_config.py
@@ -20,11 +20,11 @@
 layer RL-specific settings on top.
 
 Recipes are specified via their fully qualified Python import path in the
-YAML config under ``policy.megatron_cfg.megatron_recipe``. For example:
+YAML config under ``policy.megatron_recipe``. For example:
 
     policy:
+      megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
       megatron_cfg:
-        megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
         ...
 
 The import path is resolved at runtime using ``load_recipe()``.
diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index cfe49b7ad6..956c6e5400 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -279,7 +279,9 @@ def setup_model_config(
 ) -> tuple[ConfigContainer, Any]:
     """Setup model configuration."""
     model_cfg = None
-    megatron_recipe = config["megatron_cfg"].get("megatron_recipe")
+    megatron_recipe = config.get("megatron_recipe") or config.get(
+        "megatron_cfg", {}
+    ).get("megatron_recipe")
 
     if megatron_recipe:
         # Use Megatron-Bridge recipe specified in config
@@ -331,27 +333,33 @@ def setup_model_config(
     # Apply performance settings
     _apply_performance_config(model_cfg, config)
 
-    # Validate optimizer configuration
-    _validate_optimizer_config(config)
 
     # Optional layernorm epsilon
     if "layernorm_epsilon" in config["megatron_cfg"]:
         model_cfg.layernorm_epsilon = config["megatron_cfg"]["layernorm_epsilon"]
 
-    # Validate chunking configuration
-    _validate_chunking_config(config)
-
     # Create checkpoint configs
     checkpoint_config = _create_checkpoint_config(pretrained_path, weights_path)
 
-    # Validate training configuration
-    _validate_training_config(config, model_cfg)
-
     # Update megatron config with checkpoint, optimizer, scheduler, etc.
     _update_megatron_config(megatron_cfg, checkpoint_config, config, hf_model_name)
 
     _validate_dtype_config(dtype, megatron_cfg.model, megatron_cfg.optimizer)
 
+    # Validate chunking configuration
+    _validate_chunking_config(config)
+
+    # Validate optimizer configuration
+    _validate_optimizer_config(megatron_cfg)
+
+    # Validate training configuration
+    _validate_training_config(megatron_cfg, model_cfg)
+
+    if "make_sequence_length_divisible_by" not in config:
+        config["make_sequence_length_divisible_by"] = (
+            model_cfg.tensor_model_parallel_size
+        )
+
     return megatron_cfg, model_cfg
 
 
@@ -481,12 +489,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
             )
 
 
-def _validate_optimizer_config(config: PolicyConfig) -> None:
+def _validate_optimizer_config(megatron_cfg: ConfigContainer) -> None:
     """Validate optimizer configuration."""
-    optimizer_cpu_offload = config["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"]
-    optimizer_offload_fraction = config["megatron_cfg"]["optimizer"][
-        "optimizer_offload_fraction"
-    ]
+    optimizer_cpu_offload = megatron_cfg.optimizer.optimizer_cpu_offload
+    optimizer_offload_fraction = megatron_cfg.optimizer.optimizer_offload_fraction
 
     if optimizer_cpu_offload:
         # Currently, hybrid optimizer (partly on GPU and partly on CPU) is not supported because it conflicts with the way
@@ -524,9 +530,9 @@ def _create_checkpoint_config(
     )
 
 
-def _validate_training_config(config: PolicyConfig, model_cfg: Any) -> None:
+def _validate_training_config(megatron_cfg: ConfigContainer, model_cfg: Any) -> None:
     """Validate training configuration."""
-    assert "train_iters" in config["megatron_cfg"], (
+    assert megatron_cfg.train.train_iters is not None, (
         "train_iters must be set in megatron_cfg. For an example, see "
         "https://github.com/NVIDIA-NeMo/RL/blob/bccbc377705a81a1f4b3c31ad9767bcc15f735a8/nemo_rl/algorithms/sft.py#L175-L179."
     )
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index fa8d1d4501..d83a209f49 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -158,11 +158,6 @@ class MegatronConfigDisabled(TypedDict):
 
 class MegatronConfig(TypedDict):
     enabled: Literal[True]
-    # Fully qualified Python import path to a Megatron-Bridge recipe function.
-    # When set, the recipe is loaded at runtime to provide the base model configuration.
-    # When null/unset, configuration is loaded from the checkpoint's run_config.yaml.
-    # Example: "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config"
-    megatron_recipe: NotRequired[str | None]
     env_vars: NotRequired[dict[str, str] | None]
     # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation.
     # Setting to 0 is faster, but you are more likely to run out of GPU memory. In SFT/DPO, the default is 0.
@@ -261,6 +256,11 @@ class PolicyConfig(TypedDict):
     reward_model_cfg: NotRequired[RewardModelConfig]
     dtensor_cfg: DTensorConfig | DTensorConfigDisabled
     megatron_cfg: NotRequired[MegatronConfig | MegatronConfigDisabled]
+    # Fully qualified Python import path to a Megatron-Bridge recipe function.
+    # When set, the recipe is loaded at runtime to provide the base model configuration.
+    # When null/unset, configuration is loaded from the checkpoint's run_config.yaml.
+    # Example: "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config"
+    megatron_recipe: NotRequired[str | None]
     hf_config_overrides: NotRequired[dict[str, Any]]
     dynamic_batching: DynamicBatchingConfig | DynamicBatchingConfigDisabled
     sequence_packing: NotRequired[SequencePackingConfig | SequencePackingConfigDisabled]

From 35cb86e01a723a2ffc048c917aeb3bcecaac5ca0 Mon Sep 17 00:00:00 2001
From: root <root@pool0-01725.cm.cluster>
Date: Fri, 6 Feb 2026 10:17:12 -0800
Subject: [PATCH 4/8] Read optimizer offload and TP size from the resolved megatron_cfg.

---
 nemo_rl/models/megatron/setup.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 956c6e5400..19963ede7f 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -215,9 +215,7 @@ def validate_and_set_config(
     }
     dtype = dtype_map[config["precision"]]
 
-    # Optimizer configuration
-    optimizer_cpu_offload = config["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"]
-    offload_optimizer_for_logprob = config["offload_optimizer_for_logprob"]
+
 
     # Reward models are not yet supported with Megatron.
     if "reward_model_cfg" in config and config["reward_model_cfg"]["enabled"]:
@@ -234,11 +232,14 @@ def validate_and_set_config(
         pretrained_path=pretrained_path,
         weights_path=weights_path,
     )
+    # Optimizer configuration
+    optimizer_cpu_offload = megatron_cfg.optimizer.optimizer_cpu_offload
+    offload_optimizer_for_logprob = config["offload_optimizer_for_logprob"]
 
     final_padded_vocab_size = calculate_padded_vocab_size(
         megatron_cfg.model.vocab_size,
         megatron_cfg.model.make_vocab_size_divisible_by,
-        config["megatron_cfg"]["tensor_model_parallel_size"],
+        megatron_cfg.model.tensor_model_parallel_size,
     )
 
     return RuntimeConfig(

From 0b88cccc042bec07b5f3435b7fd135669b6e5ddd Mon Sep 17 00:00:00 2001
From: Sherif Fawzy <sfawzy@cw-pdx-cs-001-vscode-02.cm.cluster>
Date: Mon, 9 Feb 2026 11:16:09 -0800
Subject: [PATCH 5/8] Add megatron_recipe and megatron_cfg overrides to recipes/llm/performance configs.

---
 .../.grpo-deepseek-v3-32n4g.yaml.swp            | Bin 12288 -> 0 bytes
 .../.grpo-deepseek-v3-32n8g.yaml.swp            | Bin 12288 -> 0 bytes
 .../llm/performance/dapo-deepseek-v3-64n8g.yaml |   1 +
 .../llm/performance/grpo-deepseek-v3-32n4g.yaml |   5 +++++
 .../llm/performance/grpo-deepseek-v3-32n8g.yaml |   1 +
 .../grpo-deepseek-v3-64n4g-async-1off.yaml      |   5 +++++
 .../grpo-deepseek-v3-64n8g-async-1off.yaml      |   4 ++++
 .../grpo-deepseek-v3-64n8g-fp8-async-1off.yaml  |   9 +++++++++
 ...po-llama3.1-8b-instruct-2n4g-async-1off.yaml |   4 ++++
 .../grpo-llama3.1-8b-instruct-2n4g.yaml         |   1 +
 ...po-llama3.1-8b-instruct-2n8g-async-1off.yaml |   2 ++
 ...lama3.1-8b-instruct-2n8g-fp8-async-1off.yaml |   8 ++++++++
 .../grpo-llama3.1-8b-instruct-2n8g.yaml         |   1 +
 .../llm/performance/grpo-qwen3-235b-16n4g.yaml  |   4 ++++
 .../llm/performance/grpo-qwen3-235b-16n8g.yaml  |   1 +
 .../grpo-qwen3-235b-32n4g-async-1off.yaml       |   4 ++++
 .../grpo-qwen3-235b-32n8g-async-1off.yaml       |   7 +++++++
 .../grpo-qwen3-30ba3b-24n8g-async-8off.yaml     |   5 +++++
 .../llm/performance/grpo-qwen3-30ba3b-4n4g.yaml |   1 +
 .../performance/grpo-qwen3-30ba3b-4n8g-40K.yaml |   1 +
 .../grpo-qwen3-30ba3b-4n8g-async-1off.yaml      |   5 +++++
 .../llm/performance/grpo-qwen3-30ba3b-4n8g.yaml |   1 +
 .../grpo-qwen3-30ba3b-8n4g-async-1off.yaml      |   5 +++++
 .../llm/performance/grpo-qwen3-32b-4n4g.yaml    |   1 +
 .../llm/performance/grpo-qwen3-32b-4n8g.yaml    |   1 +
 .../grpo-qwen3-32b-8n4g-async-1off.yaml         |   4 ++++
 .../grpo-qwen3-32b-8n8g-async-1off.yaml         |   4 ++++
 27 files changed, 85 insertions(+)
 delete mode 100644 examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp
 delete mode 100644 examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp

diff --git a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp b/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp
deleted file mode 100644
index 287b7b097343d270e8223669de52de4e6ab3a829..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288
zcmeI2!EV$r5Qe?nOVJ|krW|26-OU0eJOL6Mi^PS<aVEi9$2PXpc2OlReE=jb+<5|?
zfHQBvBk&ZI$!<cWYFmy7F_wNgw*Sm{zFU+j_Fq1G!JkZL496|T9*kbAv!`q%&&SML
z8@ug44Ux*SD5O{vv+6o;a%}xFI3Xb<1{yo@I)M|>IE@?9no89n@gSAMwbo7Iz*p8c
z!bnJJ@9d-mZ~<UBIZ7w#+$`$pN;LYqoODJ6h`>N#ho$q`ez&@}cbDHD9}WjJ69FPX
z1c(3;AOb{y2oM1x@E;I}Zp7YWoj3Yg@AUKN(%dg;A_7E!2oM1xKm>>Y5g-CYfCvx)
zB0vQGK?0&=?Bfn&=cw-g|M30)eV4Iss86U5s8f_ey+S=ijZr^tGWHeq1@#$qhB`$Z
zqMoDfqxMj{s4o7`E(Z-FKm>>Y5g-CYfCvx)B0vO)01@~b1UNTslN(z?$ap$uoY$@m
zIW}IeH#5FqQnw+3&+rW0qJfNW@5M&Lu`p%PIkm0xQu(ZV2v_uOGd!z@z??@g!TQ_@
zFSG`o2lWn9t?$J75w?TpvZ}k-H5l+BDr@>XVFg;Oz=vEJd<rpl*g~$AfjmS}W0Xhu
z+NRVCW2O8S=i{{#PLM#%8|0!d-)Oc8_B+K^(*cHQpcc_vGu)eVFrv^<X1o$Q09pdP
zZ2`O6F^60#gXKA^m0Wd&$u}UE&MFf(Bt3*y$17o8*w=ANs6=Pa_;f4&gT8hC7vL7_
Awg3PC

diff --git a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp b/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp
deleted file mode 100644
index 98e5b39f68de87af9a263f30298fa37bc3f40254..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288
zcmeI2zi%8x6vxMO5DZaL&<-G=K=(TaXFC#PDN<5I5h-Yf+1=Uoa`P)Qvu7U?Af&Xk
zNGK(ugn|ko8bpieXh1@uK~VA!0N>fY^VyDL8XCl`{MCAA-p;=FnfFGr{POYL8@K4y
z;e_D2NXYiZ>&<75Y!!c9BF>l)hyUr~v!cxNf?vre)p@p~1LNl2@<RBL7Rn5q9En4&
zti(WJbk)>;<V4X}JWHuYR=CPI#dRS@wX<eW3Sqqv^TA>^m`(L=JzR1n&--SYO9T>u
zHGyq1n@+~@>G|iLrI#<gv7SixL?97J1QLNnAQ4Ce5`jb@5%?bv2zHCSk2oJ&#d>vh
z-#T?)J*HbCkO(9Ki9jNd2qXfDKq8O`Bm#**B9I9D2MO?!5b-!6{0TUZ|NpQ5{{QJ|
zLVke0f<A`sLl$}yx(&SuJp=vv6d^xDUqGKi??N290lfqf=#OpKK;J`OLl2+@bQ>B%
z7ocCBB;-rz0rVb}LwBIppqHV`(C-%r`4RdC`ULt2`T+8ffcBsP^aAuJ;`t5w4*C}Q
z9QqKt58Z=eyl+A2N(2&tL?97J1QLNnAQAYN2~etA#k47f&**GQD6MVl8CpBL!Hn*b
zLbg5#mthXpT#1aH-R;>KSZCfAMabx?g%pDaT$g!#JqYcM%MEuW6Po8zlo<`K6>Fo1
zMy07)*|?127ypPRQ-ZmA?H{GwFLi;1>PExU)#?;i|8-`$<5CLAeDe+rx|k_lprMr&
zRUO@|g%*y7#^^p~N{^haxb)FmN-Mj&wvUg}u><Z@YgwZkM87i}i@|u8#JLl*APLeD
zG^M`S7iB9qzPDa58(VBWKF&B+6wKJrD7Xp;$AUQ!O}!sjDGI*K=y*8p7Nk@)nn{?R
z{{DL8)y{6SX7^Z%3j1Bz*_4506X4*{r<8$90+9v5sy6y{Y?50mm&_TvncEPGi6!|G
z@q!6{^MP6P4omC{^MTiZ-(xswST*hp%US0rFG92Ec7_36F}J4Cq0#j*m&75w4NM`U
z<>|eiywa_5<i&YLXn0;=A#e#x3s-MSuK1pD*Lmx?ym_lzNhlkZtku}Jk}32|yCsui
zA!J4;y}_w0SEA;@VQ*Kdc1CKIt2|v<C_V=+ajz9<v&Q1O4*MvH1yTg{!*c9?{FLH{
zESSKN)+cQ-y)%n(V`ovDIm>w{_E*Ut-wcjpdsI_6^U&P7-dpXZ3Hzhh(cpb2HhABi
z4aT$SmBF>Wj%hG+p}QQ7u5{@8gW~)bt&vT!j77HD&A!~7@U0AwtoV(2+<d$r3#U=m
zxDdP1nU7uTm|}%-D5tKJ`;?Z(dscOY9g!^}u)f6|pdzL?avaf4j0l!7NpTEX)W*&@
zR$Cm{oD}_$3BbsR)k@KzALu9J+ueB+PtS~?=y;4Vqr>5-|A`Y!acuD9THiYVH;l$E
Ang9R*

diff --git a/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml b/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml
index 9c4edd2b30..2bfaf20955 100644
--- a/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml
+++ b/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml
@@ -40,6 +40,7 @@ policy:
     enabled: false
   make_sequence_length_divisible_by: ${mul:${policy.dtensor_cfg.tensor_parallel_size},
     ${mul:2, ${policy.dtensor_cfg.context_parallel_size}}}
+  megatron_recipe: megatron.bridge.recipes.deepseek.deepseek_v3_pretrain_config
   megatron_cfg:
     empty_unused_memory_level: 2
     enabled: true
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
index 9b157ea779..890124d3e0 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
@@ -7,6 +7,11 @@ policy:
   generation:
     vllm_cfg:
       tensor_parallel_size: 32
+  megatron_cfg:
+    pipeline_model_parallel_size: 8
+    expert_model_parallel_size: 16
+    num_layers_in_first_pipeline_stage: 7
+    num_layers_in_last_pipeline_stage: 6
 logger:
   log_dir: logs/grpo-deepseek-v3-32n4g
   wandb:
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml
index 75457ab802..7965f72764 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml
@@ -19,6 +19,7 @@ policy:
   make_sequence_length_divisible_by: 1
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.deepseek.deepseek_v3_pretrain_config_32nodes
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
index ff3e68d7da..bf9a30a5d3 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
@@ -4,6 +4,11 @@ checkpointing:
 policy:
   sequence_packing:
     enabled: false
+  megatron_cfg:
+    pipeline_model_parallel_size: 8
+    expert_model_parallel_size: 16
+    num_layers_in_first_pipeline_stage: 7
+    num_layers_in_last_pipeline_stage: 6
   generation:
     colocated:
       resources:
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml
index 0260eb39e6..595654a3a3 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml
@@ -10,6 +10,10 @@ checkpointing:
   checkpoint_dir: results/grpo-deepseek-v3-64n8g-async-1off
 policy:
   logprob_batch_size: 2
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 16
+    expert_model_parallel_size: 16
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
index b8f8ece97d..7f6b5ae86b 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
@@ -2,6 +2,15 @@ defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml
 checkpointing:
   checkpoint_dir: results/grpo-deepseek-v3-64n8g-fp8-async-1off
 policy:
+  megatron_cfg:
+    fp8_cfg:
+      enabled: true
+      fp8: "e4m3"
+      fp8_recipe: "blockwise"
+      fp8_param: false
+    moe_router_dtype: fp32
+    env_vars:
+      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
   generation:
     vllm_cfg:
       tensor_parallel_size: 16
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
index ba06869c7e..d906eda2b4 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
@@ -9,6 +9,10 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g-async-1off
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    sequence_parallel: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml
index a99f7c1498..e3c9e25c85 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml
@@ -17,6 +17,7 @@ policy:
   make_sequence_length_divisible_by: 1
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
index b6d7ed441d..c0263f68fb 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
@@ -9,6 +9,8 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off
 policy:
+  megatron_cfg:
+    pipeline_model_parallel_size: 1
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
index 1ab2bafaf4..b32786f7d7 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
@@ -2,6 +2,14 @@ defaults: ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
 checkpointing:
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off
 policy:
+  megatron_cfg:
+    fp8_cfg:
+      enabled: true
+      fp8: "e4m3"
+      fp8_recipe: "blockwise"
+      fp8_param: false
+    env_vars:
+      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
   generation:
     vllm_cfg:
       precision: "fp8"
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml
index afdbf8c414..fb0f103855 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml
@@ -17,6 +17,7 @@ policy:
   make_sequence_length_divisible_by: 1
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
index d542091951..1640deda09 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
@@ -2,6 +2,10 @@ defaults: ./grpo-qwen3-235b-16n8g.yaml
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-235b-16n4g
 policy:
+  megatron_cfg:
+    pipeline_model_parallel_size: 4
+    num_layers_in_first_pipeline_stage: 23
+    num_layers_in_last_pipeline_stage: 23
   generation:
     vllm_cfg:
       tensor_parallel_size: 8
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml
index 1376c8d340..e2e02de396 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml
@@ -19,6 +19,7 @@ policy:
   make_sequence_length_divisible_by: 1
   dtensor_cfg:
     enabled: false
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3_moe.qwen3_235b_a22b_pretrain_config
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
index 13d8501363..f55b383686 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
@@ -2,6 +2,10 @@ defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off
 policy:
+  megatron_cfg:
+    pipeline_model_parallel_size: 4
+    num_layers_in_first_pipeline_stage: 23
+    num_layers_in_last_pipeline_stage: 23
   generation:
     colocated:
       resources:
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml
index 4545aa364a..cf4f5a6f98 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml
@@ -9,6 +9,13 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-235b-32n8g-async-1off
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 4
+    sequence_parallel: true
+    context_parallel_size: 1
+    pipeline_model_parallel_size: 8
+    expert_model_parallel_size: 16
+    defer_fp32_logits: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml
index b4d5409a61..11d917fc8b 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml
@@ -9,6 +9,11 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-30ba3b-24n8g-async-8off
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    expert_model_parallel_size: 8
+    sequence_parallel: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml
index 21b9746f4b..c4749c0faf 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml
@@ -14,6 +14,7 @@ policy:
   optimizer: null
   scheduler: null
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml
index 2270d5e272..d2a4eb24b5 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml
@@ -14,6 +14,7 @@ policy:
   optimizer: null
   scheduler: null
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml
index 797244576f..4cc5981460 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml
@@ -9,6 +9,11 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g-async-1off
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 2
+    expert_model_parallel_size: 8
+    sequence_parallel: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml
index 795764d3ee..6a029c6fde 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml
@@ -7,6 +7,7 @@ checkpointing:
   checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g
 policy:
   model_name: Qwen/Qwen3-30B-A3B
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config
   train_micro_batch_size: 1
   max_total_sequence_length: 4096
   dtensor_cfg:
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
index 6da999f169..a9837c87f2 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
@@ -9,6 +9,11 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-30ba3b-8n4g-async-1off
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    expert_model_parallel_size: 16
+    sequence_parallel: false
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml
index 2e441cdb5f..d17dad323a 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml
@@ -14,6 +14,7 @@ policy:
   optimizer: null
   scheduler: null
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml
index ad780ebc50..7b33ced71a 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml
@@ -14,6 +14,7 @@ policy:
   optimizer: null
   scheduler: null
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
index f50db7fde8..4f8a0a03bb 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
@@ -9,6 +9,10 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-32b-8n4g-async-1off
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 2
+    pipeline_model_parallel_size: 1
+    sequence_parallel: true
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml
index a54a8e7747..9f20f34f40 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml
@@ -9,6 +9,10 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-32b-8n8g-async-1off
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 4
+    pipeline_model_parallel_size: 4
+    sequence_parallel: true
   generation:
     colocated:
       enabled: false

From 64e681b29927c1d8e6475a5001e16db1fa56d7ff Mon Sep 17 00:00:00 2001
From: Sherif Fawzy <sfawzy@cw-pdx-cs-001-vscode-02.cm.cluster>
Date: Mon, 9 Feb 2026 12:52:44 -0800
Subject: [PATCH 6/8] Add and correct megatron_recipe and megatron_cfg overrides in recipes/llm configs.

---
 examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml    |  1 +
 ...2b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml |  6 ++++++
 ...2b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml |  1 +
 ...ama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml |  4 ++++
 .../llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml    |  7 +++++++
 .../recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml   |  3 +++
 .../recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml   |  2 +-
 .../llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml |  2 +-
 .../llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml     |  6 ++++++
 .../grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml |  1 +
 .../llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml     |  2 +-
 .../recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml  |  3 +++
 .../recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml  |  2 +-
 .../llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml  |  2 ++
 .../llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml  |  2 +-
 .../recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml |  4 ++++
 .../recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml |  2 +-
 ...grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml |  2 +-
 .../sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml  |  2 ++
 .../llm/sft-qwen2.5-math7b-2n4g-megatron.yaml        |  4 ++++
 .../llm/sft-qwen2.5-math7b-2n8g-megatron.yaml        |  2 +-
 ...2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml | 12 +++++++++++-
 22 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
index 9035a3598c..8f615b4361 100644
--- a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
+++ b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
@@ -34,6 +34,7 @@ policy:
   dtensor_cfg:
     _v2: false
     context_parallel_size: 4
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config
   megatron_cfg:
     tensor_model_parallel_size: 4
     pipeline_model_parallel_size: 2
diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml
index 31100ce7b9..95c9e85573 100644
--- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml
+++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml
@@ -1,4 +1,10 @@
 defaults: ./distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
+policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+teacher:
+  megatron_cfg:
+    tensor_model_parallel_size: 2
 checkpointing:
   checkpoint_dir: checkpoints/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack
 logger:
diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
index 6ae8b1ff1a..d8cce7d5d0 100644
--- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
+++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
@@ -31,6 +31,7 @@ teacher:
     enabled: false
   sequence_packing:
     enabled: true
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
   megatron_cfg:
     enabled: true
     tensor_model_parallel_size: 4
diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml
index 1f75679f39..8324173dfc 100644
--- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml
+++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml
@@ -1,4 +1,8 @@
 defaults: ./dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
+policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    sequence_parallel: false
 logger:
   wandb:
     name: dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick
diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml
index 627a00574e..fb4a4bc880 100644
--- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml
@@ -1,5 +1,12 @@
 defaults: ./grpo-dapomath17k-dsv3-megatron.yaml
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 4
+    expert_model_parallel_size: 16
+    pipeline_model_parallel_size: 4
+    context_parallel_size: 2
+    num_layers_in_first_pipeline_stage: 15
+    num_layers_in_last_pipeline_stage: 14
   make_sequence_length_divisible_by: 4
   generation:
     vllm_cfg:
diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml
index ef033bc67f..c9719f381f 100644
--- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml
@@ -1,5 +1,8 @@
 defaults: ./grpo-gptoss-20b-8n8g-megatron.yaml
 policy:
+  megatron_cfg:
+    expert_model_parallel_size: 4
+    tensor_model_parallel_size: 2
   generation:
     vllm_cfg:
       tensor_parallel_size: 1
diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml
index 58655c471e..4f2a8ee3ec 100755
--- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml
@@ -8,7 +8,7 @@ policy:
   model_name: openai/gpt-oss-20b
   train_micro_batch_size: 1
   max_total_sequence_length: 4096
-  megatron_recipe: megatron.bridge.recipes.openai.gptoss.gptoss_20b_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.openai.gpt_oss.gpt_oss_20b_pretrain_config
   megatron_cfg:
     enabled: true
     expert_model_parallel_size: 8
diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
index 071623c60f..5c8d8594fd 100644
--- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
+++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
@@ -20,7 +20,7 @@ policy:
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
   scheduler: null
-  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_finetune_config
   megatron_cfg:
     enabled: true
     converter_type: LlamaForCausalLM
diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml
index 4459adc9dd..97d6ffede7 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml
@@ -1,6 +1,12 @@
 defaults: ./grpo-moonlight-16ba3b-4n8g-megatron.yaml
 checkpointing:
   checkpoint_dir: results/grpo-moonlight-16ba3b-4n4g-megatron
+policy:
+  megatron_cfg:
+    expert_model_parallel_size: 2
+    pipeline_model_parallel_size: 2
+    num_layers_in_first_pipeline_stage: 14
+    num_layers_in_last_pipeline_stage: 13
 logger:
   wandb:
     name: grpo-moonlight-16ba3b-4n4g-megatron
diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
index 27108c55c7..951bb0371f 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
@@ -18,6 +18,7 @@ policy:
     enabled: false
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
+  megatron_recipe: megatron.bridge.recipes.moonlight.moonlight_16b_pretrain_config
   megatron_cfg:
     enabled: true
     moe_router_dtype: fp32
diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
index 2c20dcf8ba..8674bdf00a 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
@@ -20,7 +20,7 @@ policy:
     algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
-  megatron_recipe: megatron.bridge.recipes.nvidia.moonlight.moonlight_16b_a3b_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.moonlight.moonlight_16b_pretrain_config
   megatron_cfg:
     enabled: true
     expert_model_parallel_size: 4
diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml
index ba30e6490e..da8301a19b 100644
--- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml
@@ -1,6 +1,9 @@
 defaults: ./grpo-nano-v2-12b-1n8g-megatron.yaml
 checkpointing:
   checkpoint_dir: results/grpo-nano-v2-12b-1n4g-megatron
+policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 4
 logger:
   log_dir: logs/grpo-nano-v2-12b-1n4g-megatron
   wandb:
diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml
index 31ed4e07b0..cd7a7c8b96 100644
--- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml
@@ -8,7 +8,7 @@ policy:
   tokenizer:
     name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
   optimizer: null
-  megatron_recipe: megatron.bridge.recipes.nvidia.nemotron.nemotron_nano_12b_v2_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.nemotronh.nemotron_nano_12b_v2_pretrain_config
   megatron_cfg:
     enabled: true
     bias_activation_fusion: false
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml
index 4029f002e8..b21c9dd51f 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml
@@ -1,5 +1,7 @@
 defaults: ./grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
   make_sequence_length_divisible_by: 2
   generation:
     vllm_cfg:
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
index 51b0a9c5b6..e37c892929 100755
--- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
@@ -14,7 +14,7 @@ policy:
   max_total_sequence_length: 4096
   dtensor_cfg:
     enabled: false
-  megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_7b_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen25_7b_pretrain_config
   megatron_cfg:
     enabled: true
     tensor_model_parallel_size: 2
diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml
index f26cbf49b2..79fbda389d 100644
--- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml
@@ -1,5 +1,9 @@
 defaults: ./grpo-qwen3-30ba3b-8n8g-megatron.yaml
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 2
+    pipeline_model_parallel_size: 2
+    expert_model_parallel_size: 2
   make_sequence_length_divisible_by: 2
   generation:
     vllm_cfg:
diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
index 830a259e98..c7f3eca79f 100755
--- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
@@ -17,7 +17,7 @@ policy:
     enabled: false
     algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
-  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_pretrain_config
   megatron_cfg:
     enabled: true
     tensor_model_parallel_size: 4
diff --git a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
index aefc0a09ef..777100853f 100644
--- a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml
@@ -15,7 +15,7 @@ policy:
     enabled: false
   optimizer: null
   scheduler: null
-  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_8b_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3_8b_pretrain_config
   megatron_cfg:
     enabled: true
     converter_type: Qwen3ForCausalLM
diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml
index 798d5e0617..77c175fadf 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml
@@ -1,5 +1,7 @@
 defaults: ./sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
 policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 2
   make_sequence_length_divisible_by: 2
 checkpointing:
   checkpoint_dir: results/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron
diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml
index 903db6113b..aad3f5c8e0 100644
--- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml
@@ -1,4 +1,8 @@
 defaults: ./sft-qwen2.5-math7b-2n8g-megatron.yaml
+policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 2
+    context_parallel_size: 1
 logger:
   wandb:
     name: sft-qwen2.5-math7b-2n4g-megatron
diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml
index cfe381fd33..0b3388f915 100644
--- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml
@@ -9,7 +9,7 @@ policy:
   max_total_sequence_length: 16384
   dtensor_cfg:
     enabled: false
-  megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_math_7b_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config
   megatron_cfg:
     enabled: true
     tensor_model_parallel_size: 4
diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
index ac8e882875..45188bc54e 100644
--- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
+++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml
@@ -7,8 +7,18 @@ policy:
     enabled: false
   dynamic_batching:
     enabled: false
+  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
-  megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_vl_3b_pretrain_config
+  megatron_cfg:
+    enabled: true
+    optimizer:
+      lr: 5.0e-07
+      min_lr: 5.0e-08
+    scheduler:
+      lr_warmup_iters: 50
+      lr_warmup_init: 5.0e-08
+    distributed_data_parallel_config:
+      overlap_grad_reduce: false
 logger:
   wandb:
     name: vlm-grpo-3b-megatron

From ef54ee32bef8d591feea94be3e1c3d6e8e891ca6 Mon Sep 17 00:00:00 2001
From: Sherif Fawzy <sfawzy@cw-pdx-cs-001-vscode-02.cm.cluster>
Date: Mon, 9 Feb 2026 13:37:13 -0800
Subject: [PATCH 7/8] Add FP8 megatron_cfg overrides to 70B config; fix Qwen3-30B-A3B recipe path.

---
 examples/configs/grpo_math_70B_megatron_fp8.yaml    | 13 ++++++++++++-
 examples/configs/grpo_math_qwen30ba3b_megatron.yaml |  2 +-
 nemo_rl/models/megatron/recipe_config.py            |  2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/examples/configs/grpo_math_70B_megatron_fp8.yaml b/examples/configs/grpo_math_70B_megatron_fp8.yaml
index 322aa6c0f2..df239cd8ff 100644
--- a/examples/configs/grpo_math_70B_megatron_fp8.yaml
+++ b/examples/configs/grpo_math_70B_megatron_fp8.yaml
@@ -8,4 +8,15 @@ policy:
   generation:
     vllm_cfg:
       precision: "fp8"
-      use_deep_gemm: true
\ No newline at end of file
+      use_deep_gemm: true
+  megatron_cfg:
+    pipeline_model_parallel_size: 8
+    fp8_cfg:
+      enabled: true
+      fp8: "e4m3"
+      fp8_recipe: "blockwise"
+      fp8_param: false
+    optimizer:
+      use_precision_aware_optimizer: false
+    env_vars:
+      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
\ No newline at end of file
diff --git a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml
index 81d812372e..2d4f0f3151 100644
--- a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml
+++ b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml
@@ -26,7 +26,7 @@ policy:
 
   scheduler: null # remove default FSDP scheduler
 
-  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config
+  megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_finetune_config
   
   megatron_cfg:
     enabled: true
diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py
index 21861fc403..4bf3d900fd 100644
--- a/nemo_rl/models/megatron/recipe_config.py
+++ b/nemo_rl/models/megatron/recipe_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 1ee4ebb198dc22b091db5f48f2fb34fa6f5cf77d Mon Sep 17 00:00:00 2001
From: root <root@pool0-00265.cm.cluster>
Date: Mon, 9 Feb 2026 20:45:58 -0800
Subject: [PATCH 8/8] Read parallelism from model config; tolerate missing optional megatron_cfg keys.

---
 nemo_rl/models/megatron/data.py               | 23 ++++--
 nemo_rl/models/megatron/setup.py              | 82 +++++++++++--------
 .../policy/workers/megatron_policy_worker.py  | 11 ++-
 3 files changed, 69 insertions(+), 47 deletions(-)

diff --git a/nemo_rl/models/megatron/data.py b/nemo_rl/models/megatron/data.py
index f884e95e1b..87ffd9e83f 100644
--- a/nemo_rl/models/megatron/data.py
+++ b/nemo_rl/models/megatron/data.py
@@ -128,6 +128,7 @@ def get_microbatch_iterator(
     mbs: int,
     straggler_timer: StragglerDetector,
     seq_length_key: Optional[str] = None,
+    model_cfg: Optional[Any] = None,
 ) -> Tuple[Iterator[ProcessedMicrobatch], int, int, int, int]:
     """Create a processed microbatch iterator from a batch of data.
 
@@ -140,6 +141,8 @@ def get_microbatch_iterator(
         cfg: Configuration dictionary
         mbs: Microbatch size
         seq_length_key: Key for sequence lengths in data dict (auto-detected if None)
+        model_cfg: Optional model config (ConfigContainer.model). When provided,
+            parallelism settings are read from here instead of the raw config dict.
 
     Returns:
         Tuple containing the iterator and metadata
@@ -175,6 +178,7 @@ def get_microbatch_iterator(
         ) = _get_pack_sequence_parameters_for_megatron(
             cfg["megatron_cfg"],
             pack_seq_dim_size,
+            model_cfg=model_cfg,
         )
         micro_batch_size = 1
     else:
@@ -528,12 +532,15 @@ def _pack_sequences_for_megatron(
 def _get_pack_sequence_parameters_for_megatron(
     megatron_cfg: dict,
     max_seq_len_in_batch: int,
+    model_cfg: Optional[Any] = None,
 ):
     """Get pack sequence parameters for Megatron model processing with optional context parallelism.
 
     Args:
-        megatron_cfg: Megatron configuration
+        megatron_cfg: Megatron configuration dict (from YAML)
         max_seq_len_in_batch: Maximum sequence length in batch
+        model_cfg: Optional model config (ConfigContainer.model). When provided,
+            parallelism settings are read from here instead of the raw config dict.
 
     Returns:
         Tuple of:
@@ -541,10 +548,16 @@ def _get_pack_sequence_parameters_for_megatron(
         - pad_packed_seq_to_multiple_of: Pad packed sequences to a multiple of this value
         - pad_packed_seq_to: Pad packed sequences to this value (before CP)
     """
-    tp_size = megatron_cfg["tensor_model_parallel_size"]
-    sp = megatron_cfg["sequence_parallel"]
-    pp_size = megatron_cfg["pipeline_model_parallel_size"]
-    cp_size = megatron_cfg["context_parallel_size"]
+    if model_cfg is not None:
+        tp_size = model_cfg.tensor_model_parallel_size
+        sp = model_cfg.sequence_parallel
+        pp_size = model_cfg.pipeline_model_parallel_size
+        cp_size = model_cfg.context_parallel_size
+    else:
+        tp_size = megatron_cfg["tensor_model_parallel_size"]
+        sp = megatron_cfg.get("sequence_parallel", False)
+        pp_size = megatron_cfg["pipeline_model_parallel_size"]
+        cp_size = megatron_cfg["context_parallel_size"]
     fp8_cfg = megatron_cfg.get("fp8_cfg", None) or {}
     use_fp8 = fp8_cfg.get("enabled", False)
     use_blockwise_fp8 = fp8_cfg.get("fp8_recipe", None) == "blockwise"
diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 19963ede7f..b9eb13c6e3 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -372,13 +372,13 @@ def _apply_parallelism_config(model_cfg: Any, config: PolicyConfig) -> None:
     model_cfg.pipeline_model_parallel_size = config["megatron_cfg"][
         "pipeline_model_parallel_size"
     ]
-    model_cfg.num_layers_in_first_pipeline_stage = config["megatron_cfg"][
-        "num_layers_in_first_pipeline_stage"
-    ]
-    model_cfg.num_layers_in_last_pipeline_stage = config["megatron_cfg"][
-        "num_layers_in_last_pipeline_stage"
-    ]
-    model_cfg.sequence_parallel = config["megatron_cfg"]["sequence_parallel"]
+    model_cfg.num_layers_in_first_pipeline_stage = config["megatron_cfg"].get(
+        "num_layers_in_first_pipeline_stage", None
+    )
+    model_cfg.num_layers_in_last_pipeline_stage = config["megatron_cfg"].get(
+        "num_layers_in_last_pipeline_stage", None
+    )
+    model_cfg.sequence_parallel = config["megatron_cfg"].get("sequence_parallel", False)
     model_cfg.context_parallel_size = config["megatron_cfg"]["context_parallel_size"]
 
     if model_cfg.context_parallel_size > 1:
@@ -389,41 +389,49 @@ def _apply_parallelism_config(model_cfg: Any, config: PolicyConfig) -> None:
 
 def _apply_moe_config(model_cfg: Any, config: PolicyConfig) -> None:
     """Apply Mixture of Experts configuration."""
-    model_cfg.expert_tensor_parallel_size = config["megatron_cfg"][
-        "expert_tensor_parallel_size"
-    ]
-    model_cfg.expert_model_parallel_size = config["megatron_cfg"][
-        "expert_model_parallel_size"
-    ]
+    megatron_cfg = config["megatron_cfg"]
+    model_cfg.expert_tensor_parallel_size = megatron_cfg.get(
+        "expert_tensor_parallel_size", 1
+    )
+    model_cfg.expert_model_parallel_size = megatron_cfg.get(
+        "expert_model_parallel_size", 1
+    )
 
     # MoE stability settings
 
     # Setting moe_router_dtype to higher precision (e.g. fp64) can improve numerical stability,
     # especially when using many experts.
-    model_cfg.moe_router_dtype = config["megatron_cfg"]["moe_router_dtype"]
+    if "moe_router_dtype" in megatron_cfg:
+        model_cfg.moe_router_dtype = megatron_cfg["moe_router_dtype"]
 
     # The below two configs (and "freeze_moe_router") are used to stabilize moe training
     # by preventing updates to the moe router. We found that this is helpful in reducing
     # logprob error during training.
 
     # Set this to "none" to disable load balancing loss.
-    model_cfg.moe_router_load_balancing_type = config["megatron_cfg"][
-        "moe_router_load_balancing_type"
-    ]
+    if "moe_router_load_balancing_type" in megatron_cfg:
+        model_cfg.moe_router_load_balancing_type = megatron_cfg[
+            "moe_router_load_balancing_type"
+        ]
     # Set this to 0.0 to disable updates to the moe router expert bias
-    model_cfg.moe_router_bias_update_rate = config["megatron_cfg"][
-        "moe_router_bias_update_rate"
-    ]
+    if "moe_router_bias_update_rate" in megatron_cfg:
+        model_cfg.moe_router_bias_update_rate = megatron_cfg[
+            "moe_router_bias_update_rate"
+        ]
 
-    model_cfg.moe_enable_deepep = config["megatron_cfg"]["moe_enable_deepep"]
-    model_cfg.moe_token_dispatcher_type = config["megatron_cfg"][
-        "moe_token_dispatcher_type"
-    ]
-    model_cfg.moe_shared_expert_overlap = config["megatron_cfg"][
-        "moe_shared_expert_overlap"
-    ]
+    if "moe_enable_deepep" in megatron_cfg:
+        model_cfg.moe_enable_deepep = megatron_cfg["moe_enable_deepep"]
+    if "moe_token_dispatcher_type" in megatron_cfg:
+        model_cfg.moe_token_dispatcher_type = megatron_cfg[
+            "moe_token_dispatcher_type"
+        ]
+    if "moe_shared_expert_overlap" in megatron_cfg:
+        model_cfg.moe_shared_expert_overlap = megatron_cfg[
+            "moe_shared_expert_overlap"
+        ]
 
-    model_cfg.moe_permute_fusion = config["megatron_cfg"]["moe_permute_fusion"]
+    if "moe_permute_fusion" in megatron_cfg:
+        model_cfg.moe_permute_fusion = megatron_cfg["moe_permute_fusion"]
 
 
 def _apply_precision_config(
@@ -454,8 +462,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
     """Apply performance optimization configuration."""
     model_cfg.parallel_output = True
 
+    megatron_cfg = config["megatron_cfg"]
+
     # Activation checkpointing
-    if config["megatron_cfg"]["activation_checkpointing"]:
+    if megatron_cfg.get("activation_checkpointing", False):
         model_cfg.recompute_granularity = "full"
         model_cfg.recompute_method = "uniform"
         model_cfg.recompute_num_layers = 1
@@ -470,8 +480,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
         )
 
     # Fusion settings
-    model_cfg.apply_rope_fusion = config["megatron_cfg"]["apply_rope_fusion"]
-    model_cfg.bias_activation_fusion = config["megatron_cfg"]["bias_activation_fusion"]
+    if "apply_rope_fusion" in megatron_cfg:
+        model_cfg.apply_rope_fusion = megatron_cfg["apply_rope_fusion"]
+    if "bias_activation_fusion" in megatron_cfg:
+        model_cfg.bias_activation_fusion = megatron_cfg["bias_activation_fusion"]
 
     # FP8 configuration
     fp8_cfg = config["megatron_cfg"].get("fp8_cfg", None)
@@ -741,7 +753,7 @@ def setup_model_and_optimizer(
     use_peft = policy_cfg["megatron_cfg"].get("peft", {}).get("enabled", False)
 
     mixed_precision_wrapper = Float16Module
-    if policy_cfg["megatron_cfg"]["freeze_moe_router"]:
+    if policy_cfg["megatron_cfg"].get("freeze_moe_router", False):
         if use_peft:
             raise ValueError(
                 "Freezing the MOE router is not currently supported when using PEFT"
@@ -1008,10 +1020,8 @@ def finalize_megatron_setup(
     )
 
     should_disable_forward_pre_hook = (
-        config["megatron_cfg"]["optimizer"]["use_distributed_optimizer"]
-        and config["megatron_cfg"]["distributed_data_parallel_config"][
-            "overlap_param_gather"
-        ]
+        megatron_cfg.optimizer.use_distributed_optimizer
+        and megatron_cfg.ddp.overlap_param_gather
     )
 
     return megatron_tokenizer, megatron_bridge, should_disable_forward_pre_hook, dp_size
diff --git a/nemo_rl/models/policy/workers/megatron_policy_worker.py b/nemo_rl/models/policy/workers/megatron_policy_worker.py
index 83d541f2ea..ef23ff556e 100644
--- a/nemo_rl/models/policy/workers/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/workers/megatron_policy_worker.py
@@ -278,19 +278,15 @@ def __init__(
             self.model,
             self.optimizer,
         )
-        print("HELLO")
+
         # Dump ConfigContainer to YAML for inspection (only on rank 0)
         if self.rank == 0:
-            config_dump_path = "/lustre/fsw/portfolios/coreai/users/sfawzy/final_megatron_config_6.yaml"
+            config_dump_path = "/lustre/fsw/portfolios/coreai/users/sfawzy/final_megatron_config.yaml"
             try:
                 self.megatron_cfg.to_yaml(config_dump_path)
                 print(f"[DEBUG] Saved final ConfigContainer to: {config_dump_path}")
             except Exception as e:
                 print(f"[WARNING] Failed to save ConfigContainer to YAML: {e}")
-            # Exit early after dumping config for inspection
-            import sys
-            print("[DEBUG] Exiting after ConfigContainer dump")
-            sys.exit(0)
 
         # vars used for refit
         ## will be initialized in prepare_refit_info
@@ -385,6 +381,7 @@ def train(
                     self.cfg,
                     mbs,
                     straggler_timer=self.mcore_state.straggler_timer,
+                    model_cfg=self.megatron_cfg.model,
                 )
                 # Track total microbatches for MoE aux-loss averaging
                 total_num_microbatches += int(num_microbatches)
@@ -569,6 +566,7 @@ def get_logprobs(
             self.cfg,
             logprob_batch_size,
             straggler_timer=self.mcore_state.straggler_timer,
+            model_cfg=self.megatron_cfg.model,
         )
 
         def forward_step_fn(
@@ -776,6 +774,7 @@ def get_topk_logits(
             self.cfg,
             logprob_batch_size,
             straggler_timer=self.mcore_state.straggler_timer,
+            model_cfg=self.megatron_cfg.model,
         )
 
         def forward_step_fn(