From 88a26c7ea8e82f995c77e374ed2fd14d03904205 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 3 Feb 2026 15:59:02 -0800 Subject: [PATCH 1/8] Initial commit. --- nemo_rl/models/megatron/__init__.py | 10 + nemo_rl/models/megatron/recipe_config.py | 141 +++++++++++ nemo_rl/models/megatron/setup.py | 219 +++++++++++------- .../policy/workers/megatron_policy_worker.py | 13 ++ 4 files changed, 305 insertions(+), 78 deletions(-) create mode 100644 nemo_rl/models/megatron/recipe_config.py diff --git a/nemo_rl/models/megatron/__init__.py b/nemo_rl/models/megatron/__init__.py index 4fc25d0d3c..f7ce1ab003 100644 --- a/nemo_rl/models/megatron/__init__.py +++ b/nemo_rl/models/megatron/__init__.py @@ -11,3 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from nemo_rl.models.megatron.recipe_config import ( + get_available_recipes, + get_recipe_function, +) + +__all__ = [ + "get_available_recipes", + "get_recipe_function", +] diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py new file mode 100644 index 0000000000..891129c503 --- /dev/null +++ b/nemo_rl/models/megatron/recipe_config.py @@ -0,0 +1,141 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Recipe-based configuration for NeMo-RL Megatron integration. 
import importlib
from typing import Callable, Optional

# =============================================================================
# RECIPE DISCOVERY
# =============================================================================

# Each table maps a substring pattern (matched against the normalized
# HuggingFace model name) to the name of a recipe function inside the
# corresponding Megatron-Bridge recipe module.
_LLAMA_RECIPE_MODULE = "megatron.bridge.recipes.llama.llama3"
_LLAMA_RECIPE_NAMES = {
    "llama-3.2-1b": "llama32_1b_pretrain_config",
    "llama-3.2-3b": "llama32_3b_pretrain_config",
    "llama-3-8b": "llama3_8b_pretrain_config",
    "llama-3.1-8b": "llama31_8b_pretrain_config",
    "meta-llama-3-8b": "llama3_8b_pretrain_config",
    "meta-llama-3.1-8b": "llama31_8b_pretrain_config",
    "llama-3-70b": "llama3_70b_pretrain_config",
    "llama-3.1-70b": "llama31_70b_pretrain_config",
    "llama-3.1-405b": "llama31_405b_pretrain_config",
}

_QWEN_RECIPE_MODULE = "megatron.bridge.recipes.qwen.qwen3"
_QWEN_RECIPE_NAMES = {
    "qwen3-0.6b": "qwen3_600m_pretrain_config",
    "qwen3-1.7b": "qwen3_1p7b_pretrain_config",
    "qwen3-4b": "qwen3_4b_pretrain_config",
    "qwen3-8b": "qwen3_8b_pretrain_config",
}


def _load_recipes(
    module_name: str, recipe_names: dict[str, str]
) -> dict[str, Callable[..., object]]:
    """Resolve a pattern -> recipe-function table from one recipe module.

    Args:
        module_name: Dotted path of the module that defines the recipes.
        recipe_names: Mapping of substring pattern to attribute name.

    Returns:
        Mapping of pattern to the resolved recipe function, in table order.
        Empty when the module cannot be imported. Attributes the module does
        not define are skipped individually, so one missing recipe does not
        hide the others.
    """
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        return {}

    resolved = {}
    for pattern, attr_name in recipe_names.items():
        recipe_fn = getattr(module, attr_name, None)
        if recipe_fn is not None:
            resolved[pattern] = recipe_fn
    return resolved


def _import_llama_recipes() -> dict[str, Callable[..., object]]:
    """Import Llama recipes from Megatron-Bridge."""
    return _load_recipes(_LLAMA_RECIPE_MODULE, _LLAMA_RECIPE_NAMES)


def _import_qwen_recipes() -> dict[str, Callable[..., object]]:
    """Import Qwen recipes from Megatron-Bridge."""
    return _load_recipes(_QWEN_RECIPE_MODULE, _QWEN_RECIPE_NAMES)


def _load_all_recipes() -> dict[str, Callable[..., object]]:
    """Build the combined pattern -> recipe table (Llama first, then Qwen)."""
    all_recipes = {}
    all_recipes.update(_import_llama_recipes())
    all_recipes.update(_import_qwen_recipes())
    return all_recipes


def get_recipe_function(
    hf_model_name: str,
) -> Optional[Callable[..., "ConfigContainer"]]:
    """Get the appropriate Megatron-Bridge recipe function for a model.

    The name is lower-cased and "/" and "_" are replaced by "-" before the
    patterns are searched as substrings. When several patterns match, the
    longest one wins; ties keep table order.

    Args:
        hf_model_name: HuggingFace model name or path.

    Returns:
        Recipe function, or None if the name is empty or no recipe matches.
    """
    if not hf_model_name:
        return None

    model_lower = hf_model_name.lower().replace("/", "-").replace("_", "-")
    candidates = [
        (pattern, recipe_fn)
        for pattern, recipe_fn in _load_all_recipes().items()
        if pattern in model_lower
    ]
    if not candidates:
        return None

    # max() returns the first maximal element, so equal-length patterns keep
    # the original first-match-in-table-order behavior.
    return max(candidates, key=lambda item: len(item[0]))[1]


def get_available_recipes() -> list[str]:
    """Return a list of available recipe patterns."""
    return list(_load_all_recipes())
nemo_rl.models.megatron.config import ModelAndOptimizerState, RuntimeConfig +from nemo_rl.models.megatron.recipe_config import ( + get_recipe_function, +) from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.utils import ( configure_dynamo_cache, @@ -225,7 +229,13 @@ def validate_and_set_config( ) megatron_cfg, model_cfg = setup_model_config( - config, rank, dtype, hf_model_name, pretrained_path, weights_path + config=config, + rank=rank, + dtype=dtype, + hf_model_name=hf_model_name, + pretrained_path=pretrained_path, + weights_path=weights_path, + use_recipe=True, ) final_padded_vocab_size = calculate_padded_vocab_size( @@ -262,7 +272,6 @@ def validate_model_paths(config: PolicyConfig) -> tuple[str, str, bool]: return hf_model_name, pretrained_path, pt_checkpoint_exists - def setup_model_config( config: PolicyConfig, rank, @@ -270,40 +279,53 @@ def setup_model_config( hf_model_name: str, pretrained_path: str, weights_path: Optional[str] = None, + use_recipe: bool = True, ) -> tuple[ConfigContainer, Any]: - """Handle all the model configuration logic.""" - # Load pretrained run config - pretrained_run_config = os.path.join( - pretrained_path, "iter_0000000/run_config.yaml" - ) - - if not os.path.exists(pretrained_run_config): - raise FileNotFoundError( - f"Pretrained run config not found at {pretrained_run_config} on rank={rank}. " - "This usually means that the one-time HF->mcore conversion on rank=0 saved to a directory " - "not being mounted on this node. 
Please check" + """Setup model configuration.""" + model_cfg = None + use_recipe_for_model = use_recipe and get_recipe_function(hf_model_name) is not None + + if use_recipe_for_model: + # Use Megatron-Bridge golden recipes + print(f"[INFO] Using Megatron-Bridge recipe-based config for {hf_model_name}") + recipe_fn = get_recipe_function(hf_model_name) + if recipe_fn is None: + raise ValueError(f"No recipe found for {hf_model_name}") + + megatron_cfg = recipe_fn() + model_cfg = megatron_cfg.model + else: + # Load pretrained run config + pretrained_run_config = os.path.join( + pretrained_path, "iter_0000000/run_config.yaml" ) - try: - cfg_from_pretrained = ConfigContainer.from_yaml( - pretrained_run_config, mode=InstantiationMode.STRICT - ) - except Exception as e: - # Add helpful context as a note to the exception - e.add_note( - f"\n{'=' * 80}\n" - f"NOTE: A common cause of this error is when the HF->mcore converted checkpoint is\n" - f"created with an older version of megatron-bridge.\n" - f"If this checkpoint is old or was generated by a different code version,\n" - f"try deleting it and rerunning the code.\n" - f"The checkpoint will be automatically regenerated with the current version.\n\n" - f"Checkpoint location: {pretrained_path}\n" - f"{'=' * 80}" - ) - raise + if not os.path.exists(pretrained_run_config): + raise FileNotFoundError( + f"Pretrained run config not found at {pretrained_run_config} on rank={rank}. " + "This usually means that the one-time HF->mcore conversion on rank=0 saved to a directory " + "not being mounted on this node. 
Please check" + ) + + try: + megatron_cfg = ConfigContainer.from_yaml( + pretrained_run_config, mode=InstantiationMode.STRICT + ) + except Exception as e: + # Add helpful context as a note to the exception + e.add_note( + f"\n{'=' * 80}\n" + f"NOTE: A common cause of this error is when the HF->mcore converted checkpoint is\n" + f"created with an older version of megatron-bridge.\n" + f"If this checkpoint is old or was generated by a different code version,\n" + f"try deleting it and rerunning the code.\n" + f"The checkpoint will be automatically regenerated with the current version.\n\n" + f"Checkpoint location: {pretrained_path}\n" + f"{'=' * 80}" + ) + raise - model_cfg = cfg_from_pretrained.model - cfg_from_pretrained.logger = LoggerConfig() + model_cfg = megatron_cfg.model # Apply parallelism settings _apply_parallelism_config(model_cfg, config) @@ -333,10 +355,8 @@ def setup_model_config( # Validate training configuration _validate_training_config(config, model_cfg) - # Create final megatron config - megatron_cfg = _create_megatron_config( - model_cfg, checkpoint_config, config, hf_model_name, dtype - ) + # Update megatron config with checkpoint, optimizer, scheduler, etc. + _update_megatron_config(megatron_cfg, checkpoint_config, config, hf_model_name) _validate_dtype_config(dtype, megatron_cfg.model, megatron_cfg.optimizer) @@ -570,51 +590,94 @@ def _validate_dtype_config( ) -def _create_megatron_config( - model_cfg: Any, +def _update_dataclass_fields(target: Any, updates: dict) -> None: + """Update a dataclass with values from a dictionary. + + Only sets fields that are present in the updates dict. Fields not in + the dict retain their original values. 
+ + Args: + target: A dataclass instance to update + updates: Dictionary of field names to new values + """ + for key, value in updates.items(): + if hasattr(target, key): + setattr(target, key, value) + + +def _update_megatron_config( + megatron_cfg: ConfigContainer, checkpoint_config: CheckpointConfig, config: PolicyConfig, hf_model_name: str, - dtype: torch.dtype, -) -> ConfigContainer: - """Create the final Megatron configuration container.""" - return ConfigContainer( - model=model_cfg, - checkpoint=checkpoint_config, - logger=LoggerConfig(logging_level=0), - train=TrainingConfig( - micro_batch_size=1, # ignored - global_batch_size=config["train_global_batch_size"], # ignored - train_iters=config["megatron_cfg"]["train_iters"], - ), - optimizer=OptimizerConfig(**config["megatron_cfg"]["optimizer"]), - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["grad_reduce_in_fp32"], - overlap_grad_reduce=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["overlap_grad_reduce"], - overlap_param_gather=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["overlap_param_gather"], - # we need to set average_in_collective=False with calculate_per_token_loss=T - # otherwise, mcore throws an assertion error. - average_in_collective=False, # Required with calculate_per_token_loss=True - use_distributed_optimizer=config["megatron_cfg"]["optimizer"][ - "use_distributed_optimizer" - ], - data_parallel_sharding_strategy=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["data_parallel_sharding_strategy"], - ), - scheduler=SchedulerConfig(**config["megatron_cfg"]["scheduler"]), - dataset=None, - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - tokenizer_model=hf_model_name, - ), +) -> None: + """Update the existing ConfigContainer with checkpoint, optimizer, scheduler, and other settings. 
+ + This modifies megatron_cfg in-place. For sub-configs (optimizer, ddp, scheduler, etc.), + only fields explicitly provided in the NeMo-RL config are updated; other fields retain + their original values from the recipe or checkpoint. + """ + megatron_cfg_dict = config.get("megatron_cfg", {}) + + # Ensure dist config is initialized (required for validate()) + if megatron_cfg.dist is None: + megatron_cfg.dist = DistributedInitConfig() + + # Always replace checkpoint config (NeMo-RL manages checkpoints) + megatron_cfg.checkpoint = checkpoint_config + + # Always set logger + megatron_cfg.logger = LoggerConfig(logging_level=0) + + # Update training config - these are NeMo-RL specific + if megatron_cfg.train is None: + megatron_cfg.train = TrainingConfig() + megatron_cfg.train.micro_batch_size = 1 # ignored by NeMo-RL + megatron_cfg.train.global_batch_size = config.get("train_global_batch_size", 1) # ignored by NeMo-RL + if "train_iters" in megatron_cfg_dict: + megatron_cfg.train.train_iters = megatron_cfg_dict["train_iters"] + + # Update optimizer config - merge with existing + optimizer_overrides = megatron_cfg_dict.get("optimizer", {}) + if optimizer_overrides: + if megatron_cfg.optimizer is None: + megatron_cfg.optimizer = OptimizerConfig(**optimizer_overrides) + else: + _update_dataclass_fields(megatron_cfg.optimizer, optimizer_overrides) + + # Update DDP config - merge with existing + ddp_overrides = megatron_cfg_dict.get("distributed_data_parallel_config", {}) + if megatron_cfg.ddp is None: + megatron_cfg.ddp = DistributedDataParallelConfig() + + # Apply explicit DDP overrides from config + if ddp_overrides: + _update_dataclass_fields(megatron_cfg.ddp, ddp_overrides) + + # NeMo-RL required DDP settings (always set) + megatron_cfg.ddp.check_for_nan_in_grad = True + # Required with calculate_per_token_loss=True, otherwise mcore throws assertion error + megatron_cfg.ddp.average_in_collective = False + + # Sync use_distributed_optimizer between optimizer and ddp + 
if megatron_cfg.optimizer is not None: + megatron_cfg.ddp.use_distributed_optimizer = megatron_cfg.optimizer.use_distributed_optimizer + + # Update scheduler config - merge with existing + scheduler_overrides = megatron_cfg_dict.get("scheduler", {}) + if scheduler_overrides: + if megatron_cfg.scheduler is None: + megatron_cfg.scheduler = SchedulerConfig(**scheduler_overrides) + else: + _update_dataclass_fields(megatron_cfg.scheduler, scheduler_overrides) + + # NeMo-RL handles data separately + megatron_cfg.dataset = None + + # Update tokenizer config - always set for HuggingFace tokenizer + megatron_cfg.tokenizer = TokenizerConfig( + tokenizer_type="HuggingFaceTokenizer", + tokenizer_model=hf_model_name, ) diff --git a/nemo_rl/models/policy/workers/megatron_policy_worker.py b/nemo_rl/models/policy/workers/megatron_policy_worker.py index 48ba0623e2..83d541f2ea 100644 --- a/nemo_rl/models/policy/workers/megatron_policy_worker.py +++ b/nemo_rl/models/policy/workers/megatron_policy_worker.py @@ -278,6 +278,19 @@ def __init__( self.model, self.optimizer, ) + print("HELLO") + # Dump ConfigContainer to YAML for inspection (only on rank 0) + if self.rank == 0: + config_dump_path = "/lustre/fsw/portfolios/coreai/users/sfawzy/final_megatron_config_6.yaml" + try: + self.megatron_cfg.to_yaml(config_dump_path) + print(f"[DEBUG] Saved final ConfigContainer to: {config_dump_path}") + except Exception as e: + print(f"[WARNING] Failed to save ConfigContainer to YAML: {e}") + # Exit early after dumping config for inspection + import sys + print("[DEBUG] Exiting after ConfigContainer dump") + sys.exit(0) # vars used for refit ## will be initialized in prepare_refit_info From ddd4151fe1d2616f5516f73db7f71be8457d74ef Mon Sep 17 00:00:00 2001 From: Sherif Fawzy Date: Fri, 6 Feb 2026 08:02:21 -0800 Subject: [PATCH 2/8] test --- 9239646-attach.sh | 25 +++ 9239676-attach.sh | 25 +++ 9240549-attach.sh | 25 +++ 9261863-attach.sh | 25 +++ .../configs/distillation_math_megatron.yaml | 2 + 
examples/configs/grpo_math_1B_megatron.yaml | 1 + .../configs/grpo_math_70B_megatron_fp8.yaml | 13 +- examples/configs/grpo_math_8B_megatron.yaml | 1 + .../configs/grpo_math_8B_megatron_fp8.yaml | 2 +- ....7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml | 6 - ...8b-instruct-4n4g-megatrontp1pp2-quick.yaml | 4 - ...llama3.1-8b-instruct-4n8g-megatron.v2.yaml | 1 + ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 1 + .../grpo-dapomath17k-dsv3-32n4g-megatron.yaml | 7 - .../llm/grpo-dapomath17k-dsv3-megatron.yaml | 1 + .../llm/grpo-gptoss-20b-8n4g-megatron.yaml | 3 - ...nstruct-1n8g-megatron-fp8-rollouts.v3.yaml | 1 + ...3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml | 1 + ...po-llama3.2-1b-instruct-1n8g-megatron.yaml | 1 + ...-1b-instruct-1n8g-megatron_generation.yaml | 1 + .../grpo-moonlight-16ba3b-4n4g-megatron.yaml | 6 - .../llm/grpo-nano-v2-12b-1n4g-megatron.yaml | 3 - ...rpo-qwen2.5-7b-instruct-4n4g-megatron.yaml | 2 - .../llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml | 4 - ...en3-8b-base-1n8g-fp8-kvcache-megatron.yaml | 1 + .../performance/grpo-deepseek-v3-32n4g.yaml | 5 - .../grpo-deepseek-v3-64n4g-async-1off.yaml | 5 - .../grpo-deepseek-v3-64n8g-async-1off.yaml | 4 - ...grpo-deepseek-v3-64n8g-fp8-async-1off.yaml | 9 - ...-llama3.1-8b-instruct-2n4g-async-1off.yaml | 4 - ...-llama3.1-8b-instruct-2n8g-async-1off.yaml | 2 - ...ma3.1-8b-instruct-2n8g-fp8-async-1off.yaml | 8 - .../performance/grpo-qwen3-235b-16n4g.yaml | 4 - .../grpo-qwen3-235b-32n4g-async-1off.yaml | 4 - .../grpo-qwen3-235b-32n8g-async-1off.yaml | 7 - .../grpo-qwen3-30ba3b-24n8g-async-8off.yaml | 5 - .../grpo-qwen3-30ba3b-4n8g-async-1off.yaml | 5 - .../grpo-qwen3-30ba3b-8n4g-async-1off.yaml | 5 - .../grpo-qwen3-32b-8n4g-async-1off.yaml | 4 - .../grpo-qwen3-32b-8n8g-async-1off.yaml | 4 - ...lama3.1-70b-8n4g-tp2pp2-long-megatron.yaml | 2 - ...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 1 + .../sft-llama3.1-8b-1n8g-megatron-lora.yaml | 1 + ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 1 + 
.../llm/sft-llama3.1-8b-1n8g-megatron.yaml | 1 + .../llm/sft-qwen2.5-math7b-2n4g-megatron.yaml | 4 - ...3b-instruct-clevr-1n8g-megatrontp2.v1.yaml | 12 +- .../sft_openmathinstruct2_megatron.yaml | 1 + examples/configs/vlm_grpo_3B.yaml | 73 -------- examples/configs/vlm_grpo_3B_megatron.yaml | 59 ------- nemo_rl/models/megatron/__init__.py | 6 +- nemo_rl/models/megatron/recipe_config.py | 156 ++++++------------ nemo_rl/models/megatron/setup.py | 20 +-- nemo_rl/models/policy/__init__.py | 5 + 54 files changed, 181 insertions(+), 398 deletions(-) create mode 100755 9239646-attach.sh create mode 100755 9239676-attach.sh create mode 100755 9240549-attach.sh create mode 100755 9261863-attach.sh diff --git a/9239646-attach.sh b/9239646-attach.sh new file mode 100755 index 0000000000..8e318a4731 --- /dev/null +++ b/9239646-attach.sh @@ -0,0 +1,25 @@ +# No args launches on the head node (node 0) +# Args 1-N launch on worker nodes (nodes 1 through N-1) +# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell +WORKER_NUM=${1:-} +if [[ -z "$WORKER_NUM" ]]; then + # Empty means we are on the head node + if [[ -n "${COMMAND:-}" ]]; then + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00753" --jobid 9239646 bash -c "$COMMAND" + else + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00753" --jobid 9239646 --pty bash + fi +else + # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1) + # and use nodes_array[1] through nodes_array[N-1] + if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then + echo "Error: WORKER_NUM must be between 1 
and 0" + exit 1 + fi + nodes_array=(pool0-00753) + if [[ -n "${COMMAND:-}" ]]; then + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239646 bash -c "$COMMAND" + else + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239646 --pty bash + fi +fi diff --git a/9239676-attach.sh b/9239676-attach.sh new file mode 100755 index 0000000000..68f99e49a6 --- /dev/null +++ b/9239676-attach.sh @@ -0,0 +1,25 @@ +# No args launches on the head node (node 0) +# Args 1-N launch on worker nodes (nodes 1 through N-1) +# Optional: set COMMAND='...' 
to run non-interactively instead of opening an interactive shell +WORKER_NUM=${1:-} +if [[ -z "$WORKER_NUM" ]]; then + # Empty means we are on the head node + if [[ -n "${COMMAND:-}" ]]; then + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01821" --jobid 9239676 bash -c "$COMMAND" + else + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01821" --jobid 9239676 --pty bash + fi +else + # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1) + # and use nodes_array[1] through nodes_array[N-1] + if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then + echo "Error: WORKER_NUM must be between 1 and 0" + exit 1 + fi + nodes_array=(pool0-01821) + if [[ -n "${COMMAND:-}" ]]; then + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239676 bash -c "$COMMAND" + else + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239676 --pty bash + fi +fi diff --git a/9240549-attach.sh b/9240549-attach.sh new file mode 100755 index 0000000000..429b0deb56 --- /dev/null +++ b/9240549-attach.sh @@ -0,0 +1,25 @@ +# No args launches on the head node (node 0) +# Args 1-N launch on worker nodes 
(nodes 1 through N-1) +# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell +WORKER_NUM=${1:-} +if [[ -z "$WORKER_NUM" ]]; then + # Empty means we are on the head node + if [[ -n "${COMMAND:-}" ]]; then + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01736" --jobid 9240549 bash -c "$COMMAND" + else + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01736" --jobid 9240549 --pty bash + fi +else + # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1) + # and use nodes_array[1] through nodes_array[N-1] + if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then + echo "Error: WORKER_NUM must be between 1 and 0" + exit 1 + fi + nodes_array=(pool0-01736) + if [[ -n "${COMMAND:-}" ]]; then + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9240549 bash -c "$COMMAND" + else + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9240549 --pty bash + fi +fi diff --git a/9261863-attach.sh b/9261863-attach.sh new file mode 100755 index 0000000000..d1d6280cb3 --- /dev/null +++ b/9261863-attach.sh @@ -0,0 +1,25 @@ +# No args launches on the head 
node (node 0) +# Args 1-N launch on worker nodes (nodes 1 through N-1) +# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell +WORKER_NUM=${1:-} +if [[ -z "$WORKER_NUM" ]]; then + # Empty means we are on the head node + if [[ -n "${COMMAND:-}" ]]; then + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00629" --jobid 9261863 bash -c "$COMMAND" + else + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00629" --jobid 9261863 --pty bash + fi +else + # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1) + # and use nodes_array[1] through nodes_array[N-1] + if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then + echo "Error: WORKER_NUM must be between 1 and 0" + exit 1 + fi + nodes_array=(pool0-00629) + if [[ -n "${COMMAND:-}" ]]; then + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9261863 bash -c "$COMMAND" + else + srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9261863 --pty bash + fi +fi diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index 
ae2fbcd3e1..c3d1dd901a 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -37,6 +37,7 @@ policy: &POLICY_BASE megatron_cfg: &MEGATRON_BASE enabled: true + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config empty_unused_memory_level: 0 activation_checkpointing: false converter_type: "Qwen3ForCausalLM" @@ -142,6 +143,7 @@ teacher: model_name: "Qwen/Qwen3-4B" megatron_cfg: <<: *MEGATRON_BASE + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config context_parallel_size: 2 tensor_model_parallel_size: 2 pipeline_model_parallel_size: 2 diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index a9368481ae..a4d7592f80 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -78,6 +78,7 @@ policy: megatron_cfg: enabled: true + megatron_recipe: null # Set to a fully qualified recipe path to use a Megatron-Bridge recipe, e.g. megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
activation_checkpointing: false converter_type: "Qwen2ForCausalLM" diff --git a/examples/configs/grpo_math_70B_megatron_fp8.yaml b/examples/configs/grpo_math_70B_megatron_fp8.yaml index df239cd8ff..322aa6c0f2 100644 --- a/examples/configs/grpo_math_70B_megatron_fp8.yaml +++ b/examples/configs/grpo_math_70B_megatron_fp8.yaml @@ -8,15 +8,4 @@ policy: generation: vllm_cfg: precision: "fp8" - use_deep_gemm: true - megatron_cfg: - pipeline_model_parallel_size: 8 - fp8_cfg: - enabled: true - fp8: "e4m3" - fp8_recipe: "blockwise" - fp8_param: false - optimizer: - use_precision_aware_optimizer: false - env_vars: - NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" \ No newline at end of file + use_deep_gemm: true \ No newline at end of file diff --git a/examples/configs/grpo_math_8B_megatron.yaml b/examples/configs/grpo_math_8B_megatron.yaml index 977ab394b5..8d2ddfa90a 100644 --- a/examples/configs/grpo_math_8B_megatron.yaml +++ b/examples/configs/grpo_math_8B_megatron.yaml @@ -30,6 +30,7 @@ policy: megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
converter_type: "LlamaForCausalLM" tensor_model_parallel_size: 1 diff --git a/examples/configs/grpo_math_8B_megatron_fp8.yaml b/examples/configs/grpo_math_8B_megatron_fp8.yaml index ba6ee6e5c8..9548979c1c 100644 --- a/examples/configs/grpo_math_8B_megatron_fp8.yaml +++ b/examples/configs/grpo_math_8B_megatron_fp8.yaml @@ -19,4 +19,4 @@ policy: optimizer: use_precision_aware_optimizer: false env_vars: - NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" \ No newline at end of file + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml index 95c9e85573..31100ce7b9 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml @@ -1,10 +1,4 @@ defaults: ./distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml -policy: - megatron_cfg: - tensor_model_parallel_size: 1 -teacher: - megatron_cfg: - tensor_model_parallel_size: 2 checkpointing: checkpoint_dir: checkpoints/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack logger: diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml index 8324173dfc..1f75679f39 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml @@ -1,8 +1,4 @@ defaults: ./dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml -policy: - megatron_cfg: - tensor_model_parallel_size: 1 - sequence_parallel: false logger: wandb: name: dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick diff --git 
a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 8df4bc3fb0..a19a094bf5 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -20,6 +20,7 @@ policy: optimizer: null megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config tensor_model_parallel_size: 4 logger: wandb_enabled: true diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 8b3a43ea28..83fac04256 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -22,6 +22,7 @@ policy: optimizer: null megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config pipeline_model_parallel_size: 2 logger: wandb_enabled: true diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml index fb4a4bc880..627a00574e 100644 --- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml @@ -1,12 +1,5 @@ defaults: ./grpo-dapomath17k-dsv3-megatron.yaml policy: - megatron_cfg: - tensor_model_parallel_size: 4 - expert_model_parallel_size: 16 - pipeline_model_parallel_size: 4 - context_parallel_size: 2 - num_layers_in_first_pipeline_stage: 15 - num_layers_in_last_pipeline_stage: 14 make_sequence_length_divisible_by: 4 generation: vllm_cfg: diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml 
b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml index 8d19757d54..0378daaa41 100644 --- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml @@ -19,6 +19,7 @@ policy: optimizer: null megatron_cfg: enabled: true + megatron_recipe: null # Can be set to megatron.bridge.recipes.deepseek.deepseek_v3.deepseek_v3_pretrain_config activation_checkpointing: true tensor_model_parallel_size: 8 expert_model_parallel_size: 32 diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml index c9719f381f..ef033bc67f 100644 --- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml @@ -1,8 +1,5 @@ defaults: ./grpo-gptoss-20b-8n8g-megatron.yaml policy: - megatron_cfg: - expert_model_parallel_size: 4 - tensor_model_parallel_size: 2 generation: vllm_cfg: tensor_parallel_size: 1 diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml index dcd791eee6..f2c4af29c5 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml @@ -18,6 +18,7 @@ policy: enabled: false megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config converter_type: LlamaForCausalLM pipeline_model_parallel_size: 2 activation_checkpointing: true diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml index 6411c6fb49..624ced10c5 100644 --- 
a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml @@ -19,6 +19,7 @@ policy: enabled: false megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config converter_type: LlamaForCausalLM pipeline_model_parallel_size: 2 activation_checkpointing: true diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml index 333a06d980..a3235a7b41 100755 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -12,6 +12,7 @@ policy: optimizer: null megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config scheduler: lr_warmup_iters: 50 dtensor_cfg: diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml index bb641388d8..728a711b48 100644 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml @@ -12,6 +12,7 @@ policy: optimizer: null megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config scheduler: lr_warmup_iters: 50 dtensor_cfg: diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml index 97d6ffede7..4459adc9dd 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml @@ -1,12 +1,6 @@ defaults: ./grpo-moonlight-16ba3b-4n8g-megatron.yaml 
checkpointing: checkpoint_dir: results/grpo-moonlight-16ba3b-4n4g-megatron -policy: - megatron_cfg: - expert_model_parallel_size: 2 - pipeline_model_parallel_size: 2 - num_layers_in_first_pipeline_stage: 14 - num_layers_in_last_pipeline_stage: 13 logger: wandb: name: grpo-moonlight-16ba3b-4n4g-megatron diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml index da8301a19b..ba30e6490e 100644 --- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml @@ -1,9 +1,6 @@ defaults: ./grpo-nano-v2-12b-1n8g-megatron.yaml checkpointing: checkpoint_dir: results/grpo-nano-v2-12b-1n4g-megatron -policy: - megatron_cfg: - tensor_model_parallel_size: 4 logger: log_dir: logs/grpo-nano-v2-12b-1n4g-megatron wandb: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml index b21c9dd51f..4029f002e8 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml @@ -1,7 +1,5 @@ defaults: ./grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml policy: - megatron_cfg: - tensor_model_parallel_size: 1 make_sequence_length_divisible_by: 2 generation: vllm_cfg: diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml index 79fbda389d..f26cbf49b2 100644 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml @@ -1,9 +1,5 @@ defaults: ./grpo-qwen3-30ba3b-8n8g-megatron.yaml policy: - megatron_cfg: - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 2 - expert_model_parallel_size: 2 make_sequence_length_divisible_by: 2 generation: vllm_cfg: diff 
--git a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml index 69ff4a4229..d155afac75 100644 --- a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml @@ -17,6 +17,7 @@ policy: scheduler: null megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_8b_pretrain_config converter_type: Qwen3ForCausalLM tensor_model_parallel_size: 4 optimizer: diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml index 04fc067d6e..9b157ea779 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml @@ -4,11 +4,6 @@ checkpointing: policy: sequence_packing: enabled: false - megatron_cfg: - pipeline_model_parallel_size: 8 - expert_model_parallel_size: 16 - num_layers_in_first_pipeline_stage: 7 - num_layers_in_last_pipeline_stage: 6 generation: vllm_cfg: tensor_parallel_size: 32 diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml index bf9a30a5d3..ff3e68d7da 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml @@ -4,11 +4,6 @@ checkpointing: policy: sequence_packing: enabled: false - megatron_cfg: - pipeline_model_parallel_size: 8 - expert_model_parallel_size: 16 - num_layers_in_first_pipeline_stage: 7 - num_layers_in_last_pipeline_stage: 6 generation: colocated: resources: diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml 
b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml index 595654a3a3..0260eb39e6 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml @@ -10,10 +10,6 @@ checkpointing: checkpoint_dir: results/grpo-deepseek-v3-64n8g-async-1off policy: logprob_batch_size: 2 - megatron_cfg: - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 16 - expert_model_parallel_size: 16 generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml index 7f6b5ae86b..b8f8ece97d 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml @@ -2,15 +2,6 @@ defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml checkpointing: checkpoint_dir: results/grpo-deepseek-v3-64n8g-fp8-async-1off policy: - megatron_cfg: - fp8_cfg: - enabled: true - fp8: "e4m3" - fp8_recipe: "blockwise" - fp8_param: false - moe_router_dtype: fp32 - env_vars: - NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" generation: vllm_cfg: tensor_parallel_size: 16 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml index d906eda2b4..ba06869c7e 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml @@ -9,10 +9,6 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g-async-1off policy: - megatron_cfg: - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - sequence_parallel: false generation: 
colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml index c0263f68fb..b6d7ed441d 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml @@ -9,8 +9,6 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off policy: - megatron_cfg: - pipeline_model_parallel_size: 1 generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml index b32786f7d7..1ab2bafaf4 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml @@ -2,14 +2,6 @@ defaults: ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml checkpointing: checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off policy: - megatron_cfg: - fp8_cfg: - enabled: true - fp8: "e4m3" - fp8_recipe: "blockwise" - fp8_param: false - env_vars: - NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" generation: vllm_cfg: precision: "fp8" diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml index 1640deda09..d542091951 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml @@ -2,10 +2,6 @@ defaults: ./grpo-qwen3-235b-16n8g.yaml checkpointing: checkpoint_dir: results/grpo-qwen3-235b-16n4g policy: - megatron_cfg: - pipeline_model_parallel_size: 4 - num_layers_in_first_pipeline_stage: 23 - 
num_layers_in_last_pipeline_stage: 23 generation: vllm_cfg: tensor_parallel_size: 8 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml index f55b383686..13d8501363 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml @@ -2,10 +2,6 @@ defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml checkpointing: checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off policy: - megatron_cfg: - pipeline_model_parallel_size: 4 - num_layers_in_first_pipeline_stage: 23 - num_layers_in_last_pipeline_stage: 23 generation: colocated: resources: diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml index cf4f5a6f98..4545aa364a 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml @@ -9,13 +9,6 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-235b-32n8g-async-1off policy: - megatron_cfg: - tensor_model_parallel_size: 4 - sequence_parallel: true - context_parallel_size: 1 - pipeline_model_parallel_size: 8 - expert_model_parallel_size: 16 - defer_fp32_logits: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml index 11d917fc8b..b4d5409a61 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml @@ -9,11 +9,6 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-24n8g-async-8off policy: - 
megatron_cfg: - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - expert_model_parallel_size: 8 - sequence_parallel: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml index 4cc5981460..797244576f 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml @@ -9,11 +9,6 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g-async-1off policy: - megatron_cfg: - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 2 - expert_model_parallel_size: 8 - sequence_parallel: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml index a9837c87f2..6da999f169 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml @@ -9,11 +9,6 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-8n4g-async-1off policy: - megatron_cfg: - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - expert_model_parallel_size: 16 - sequence_parallel: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml index 4f8a0a03bb..f50db7fde8 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml @@ -9,10 +9,6 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-32b-8n4g-async-1off policy: - 
megatron_cfg: - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 1 - sequence_parallel: true generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml index 9f20f34f40..a54a8e7747 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml @@ -9,10 +9,6 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-32b-8n8g-async-1off policy: - megatron_cfg: - tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 4 - sequence_parallel: true generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml index 77c175fadf..798d5e0617 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml @@ -1,7 +1,5 @@ defaults: ./sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml policy: - megatron_cfg: - tensor_model_parallel_size: 2 make_sequence_length_divisible_by: 2 checkpointing: checkpoint_dir: results/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index bb43955812..33434d14e0 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -19,6 +19,7 @@ policy: enabled: false megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config tensor_model_parallel_size: 4 pipeline_model_parallel_size: 2 freeze_moe_router: true 
diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml index b2b76c0afd..6c2b1117f6 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml @@ -16,6 +16,7 @@ policy: enabled: false megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config peft: enabled: true dim: 128 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index aa62330e3e..257624dd9f 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -21,6 +21,7 @@ policy: optimizer: null megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config tensor_model_parallel_size: 2 pipeline_model_parallel_size: 2 optimizer: diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 7e9452dff7..100a87be03 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -19,6 +19,7 @@ policy: optimizer: null megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config tensor_model_parallel_size: 2 pipeline_model_parallel_size: 2 optimizer: diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml index aad3f5c8e0..903db6113b 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml @@ -1,8 +1,4 @@ defaults: 
./sft-qwen2.5-math7b-2n8g-megatron.yaml -policy: - megatron_cfg: - tensor_model_parallel_size: 2 - context_parallel_size: 1 logger: wandb: name: sft-qwen2.5-math7b-2n4g-megatron diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml index d81a58980e..d301812f91 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml @@ -9,18 +9,8 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null - megatron_cfg: - enabled: true - optimizer: - lr: 5.0e-07 - min_lr: 5.0e-08 - scheduler: - lr_warmup_iters: 50 - lr_warmup_init: 5.0e-08 - distributed_data_parallel_config: - overlap_grad_reduce: false logger: wandb: name: vlm-grpo-3b-megatron cluster: - gpus_per_node: 8 \ No newline at end of file + gpus_per_node: 8 diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml index faca12e0ae..9df98169c8 100644 --- a/examples/configs/sft_openmathinstruct2_megatron.yaml +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -33,6 +33,7 @@ policy: enabled: false megatron_cfg: + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config activation_checkpointing: false context_parallel_size: 1 distributed_data_parallel_config: diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml index f9612007a4..c28b11add8 100644 --- a/examples/configs/vlm_grpo_3B.yaml +++ b/examples/configs/vlm_grpo_3B.yaml @@ -80,79 +80,6 @@ policy: context_parallel_size: 1 custom_parallel_plan: null - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload 
before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. - activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - 
data_parallel_sharding_strategy: "optim_grads_params" - - # dynamic_batching improves performance by ensuring logprob and training microbatches # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length # responses are sorted by sequence length and bucketed into microbatches with a total diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml index b32cd7df04..0c62a03954 100644 --- a/examples/configs/vlm_grpo_3B_megatron.yaml +++ b/examples/configs/vlm_grpo_3B_megatron.yaml @@ -123,65 +123,6 @@ policy: resources: gpus_per_node: null num_nodes: null - megatron_cfg: - enabled: true - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. - activation_checkpointing: false - converter_type: Qwen2ForCausalLM - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: fp64 - moe_router_load_balancing_type: none - moe_router_bias_update_rate: 0.0 - moe_permute_fusion: false - apply_rope_fusion: true - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - optimizer: - optimizer: adam - lr: 2.0e-07 - min_lr: 2.0e-07 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: float32 - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1.0e-08 - sgd_momentum: 0.9 - use_distributed_optimizer: true - use_precision_aware_optimizer: true - 
clip_grad: ${policy.max_grad_norm} - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: constant - lr_decay_style: constant - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 2.0e-08 - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: false - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: optim_grads_params data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/nemo_rl/models/megatron/__init__.py b/nemo_rl/models/megatron/__init__.py index f7ce1ab003..790146ecaa 100644 --- a/nemo_rl/models/megatron/__init__.py +++ b/nemo_rl/models/megatron/__init__.py @@ -13,11 +13,9 @@ # limitations under the License. from nemo_rl.models.megatron.recipe_config import ( - get_available_recipes, - get_recipe_function, + load_recipe, ) __all__ = [ - "get_available_recipes", - "get_recipe_function", + "load_recipe", ] diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py index 891129c503..ddcb7862e3 100644 --- a/nemo_rl/models/megatron/recipe_config.py +++ b/nemo_rl/models/megatron/recipe_config.py @@ -19,123 +19,63 @@ allowing NeMo-RL to use pre-configured training recipes as a base and layer RL-specific settings on top. 
-Example usage: - from nemo_rl.models.megatron.recipe_config import create_config_from_recipe - - megatron_cfg = create_config_from_recipe( - hf_model_name="meta-llama/Llama-3.1-8B-Instruct", - policy_config=config, - pretrained_path="/path/to/checkpoint", - weights_path=None, - ) - -Internal flag for testing: - # To use pure recipe settings with minimal RL overrides (for testing): - megatron_cfg = create_config_from_recipe( - ..., - _apply_full_overrides=False, # Internal flag - keeps recipe's optimizer/scheduler - ) -""" - -import warnings -from typing import Any, Callable, Optional +Recipes are specified via their fully qualified Python import path in the +YAML config under ``policy.megatron_cfg.megatron_recipe``. For example: -import torch -from megatron.bridge import AutoBridge -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - LoggerConfig, - OptimizerConfig, - SchedulerConfig, - TokenizerConfig, - TrainingConfig, -) - -from nemo_rl.models.policy import PolicyConfig + policy: + megatron_cfg: + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + ... +The import path is resolved at runtime using ``load_recipe()``. 
+""" -# ============================================================================= -# RECIPE DISCOVERY -# ============================================================================= +import importlib -def _import_llama_recipes(): - """Import Llama recipes from Megatron-Bridge.""" - try: - from megatron.bridge.recipes.llama.llama3 import ( - llama31_8b_pretrain_config, - llama31_70b_pretrain_config, - llama31_405b_pretrain_config, - llama3_8b_pretrain_config, - llama3_70b_pretrain_config, - llama32_1b_pretrain_config, - llama32_3b_pretrain_config, - ) - return { - "llama-3.2-1b": llama32_1b_pretrain_config, - "llama-3.2-3b": llama32_3b_pretrain_config, - "llama-3-8b": llama3_8b_pretrain_config, - "llama-3.1-8b": llama31_8b_pretrain_config, - "meta-llama-3-8b": llama3_8b_pretrain_config, - "meta-llama-3.1-8b": llama31_8b_pretrain_config, - "llama-3-70b": llama3_70b_pretrain_config, - "llama-3.1-70b": llama31_70b_pretrain_config, - "llama-3.1-405b": llama31_405b_pretrain_config, - } - except ImportError: - return {} - - -def _import_qwen_recipes(): - """Import Qwen recipes from Megatron-Bridge.""" - try: - from megatron.bridge.recipes.qwen.qwen3 import ( - qwen3_600m_pretrain_config, - qwen3_1p7b_pretrain_config, - qwen3_4b_pretrain_config, - qwen3_8b_pretrain_config, - ) - return { - "qwen3-0.6b": qwen3_600m_pretrain_config, - "qwen3-1.7b": qwen3_1p7b_pretrain_config, - "qwen3-4b": qwen3_4b_pretrain_config, - "qwen3-8b": qwen3_8b_pretrain_config, - } - except ImportError: - return {} +from megatron.bridge.training.config import ConfigContainer -def get_recipe_function(hf_model_name: str) -> Optional[Callable[..., ConfigContainer]]: +def load_recipe(recipe_path: str) -> ConfigContainer: """ - Get the appropriate Megatron-Bridge recipe function for a model. - + Dynamically import and call a Megatron-Bridge recipe function. + Args: - hf_model_name: HuggingFace model name or path - + recipe_path: Fully qualified Python import path to the recipe function. 
+ For example: ``megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config`` + Returns: - Recipe function or None if no matching recipe found + A ConfigContainer produced by calling the recipe function. + + Raises: + ValueError: If the recipe path is invalid or the function cannot be found. + TypeError: If the resolved object is not callable. """ - model_lower = hf_model_name.lower().replace("/", "-").replace("_", "-") - - # Load recipes lazily - all_recipes = {} - all_recipes.update(_import_llama_recipes()) - all_recipes.update(_import_qwen_recipes()) - - # Try match - for pattern, recipe_fn in all_recipes.items(): - if pattern in model_lower: - return recipe_fn - - return None - - -def get_available_recipes() -> list[str]: - """Return a list of available recipe patterns.""" - all_recipes = {} - all_recipes.update(_import_llama_recipes()) - all_recipes.update(_import_qwen_recipes()) - return list(all_recipes.keys()) + module_path, _, func_name = recipe_path.rpartition(".") + if not module_path or not func_name: + raise ValueError( + f"Invalid recipe path '{recipe_path}'. " + "Expected a fully qualified Python path like " + "'megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config'" + ) + try: + module = importlib.import_module(module_path) + except ImportError as e: + raise ValueError( + f"Could not import module '{module_path}' from recipe path '{recipe_path}': {e}" + ) from e + + recipe_fn = getattr(module, func_name, None) + if recipe_fn is None: + raise ValueError( + f"Module '{module_path}' has no attribute '{func_name}'. " + f"Check that the recipe function name is correct in '{recipe_path}'." + ) + + if not callable(recipe_fn): + raise TypeError( + f"'{recipe_path}' resolved to a non-callable object of type {type(recipe_fn).__name__}. " + "Expected a recipe function." 
+ ) + return recipe_fn() diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 14dcbabcb3..cfe49b7ad6 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -69,9 +69,7 @@ from nemo_rl.distributed.named_sharding import NamedSharding from nemo_rl.models.megatron.community_import import import_model_from_hf_name from nemo_rl.models.megatron.config import ModelAndOptimizerState, RuntimeConfig -from nemo_rl.models.megatron.recipe_config import ( - get_recipe_function, -) +from nemo_rl.models.megatron.recipe_config import load_recipe from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.utils import ( configure_dynamo_cache, @@ -235,7 +233,6 @@ def validate_and_set_config( hf_model_name=hf_model_name, pretrained_path=pretrained_path, weights_path=weights_path, - use_recipe=True, ) final_padded_vocab_size = calculate_padded_vocab_size( @@ -279,20 +276,15 @@ def setup_model_config( hf_model_name: str, pretrained_path: str, weights_path: Optional[str] = None, - use_recipe: bool = True, ) -> tuple[ConfigContainer, Any]: """Setup model configuration.""" model_cfg = None - use_recipe_for_model = use_recipe and get_recipe_function(hf_model_name) is not None - - if use_recipe_for_model: - # Use Megatron-Bridge golden recipes - print(f"[INFO] Using Megatron-Bridge recipe-based config for {hf_model_name}") - recipe_fn = get_recipe_function(hf_model_name) - if recipe_fn is None: - raise ValueError(f"No recipe found for {hf_model_name}") + megatron_recipe = config["megatron_cfg"].get("megatron_recipe") - megatron_cfg = recipe_fn() + if megatron_recipe: + # Use Megatron-Bridge recipe specified in config + print(f"[INFO] Using Megatron-Bridge recipe: {megatron_recipe}") + megatron_cfg = load_recipe(megatron_recipe) model_cfg = megatron_cfg.model else: # Load pretrained run config diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 363399cbca..fa8d1d4501 100644 
--- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -158,6 +158,11 @@ class MegatronConfigDisabled(TypedDict): class MegatronConfig(TypedDict): enabled: Literal[True] + # Fully qualified Python import path to a Megatron-Bridge recipe function. + # When set, the recipe is loaded at runtime to provide the base model configuration. + # When null/unset, configuration is loaded from the checkpoint's run_config.yaml. + # Example: "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config" + megatron_recipe: NotRequired[str | None] env_vars: NotRequired[dict[str, str] | None] # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. # Setting to 0 is faster, but you are more likely to run out of GPU memory. In SFT/DPO, the default is 0. From 824fbdb6c8033eae8835543aa889e17f6ca1ce01 Mon Sep 17 00:00:00 2001 From: Sherif Fawzy Date: Fri, 6 Feb 2026 09:44:35 -0800 Subject: [PATCH 3/8] . --- 9239646-attach.sh | 25 ------ 9239676-attach.sh | 25 ------ 9240549-attach.sh | 25 ------ 9261863-attach.sh | 25 ------ examples/configs/distillation_math.yaml | 71 --------------- .../configs/distillation_math_megatron.yaml | 5 +- examples/configs/dpo.yaml | 71 --------------- examples/configs/grpo_math_1B.yaml | 75 ---------------- examples/configs/grpo_math_1B_megatron.yaml | 69 +-------------- examples/configs/grpo_math_70B_megatron.yaml | 2 + examples/configs/grpo_math_8B_megatron.yaml | 3 +- .../grpo_math_qwen30ba3b_megatron.yaml | 2 + ....7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml | 1 + ...llama3.1-8b-instruct-4n8g-megatron.v2.yaml | 2 +- ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 2 +- .../llm/grpo-dapomath17k-dsv3-megatron.yaml | 2 +- .../llm/grpo-gptoss-20b-8n8g-megatron.yaml | 1 + ...nstruct-1n8g-megatron-fp8-rollouts.v3.yaml | 2 +- ...3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml | 2 +- ...po-llama3.2-1b-instruct-1n8g-megatron.yaml | 2 +- ...-1b-instruct-1n8g-megatron_generation.yaml | 2 +- 
...po-math-qwen3-30ba3b-megatron-tp4-32k.yaml | 1 + .../grpo-moonlight-16ba3b-4n8g-megatron.yaml | 1 + .../llm/grpo-nano-v2-12b-1n8g-megatron.yaml | 1 + ...rpo-qwen2.5-7b-instruct-4n8g-megatron.yaml | 1 + .../llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 1 + ...en3-8b-base-1n8g-fp8-kvcache-megatron.yaml | 2 +- ...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 2 +- .../sft-llama3.1-8b-1n8g-megatron-lora.yaml | 2 +- ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 2 +- .../llm/sft-llama3.1-8b-1n8g-megatron.yaml | 2 +- .../llm/sft-qwen2.5-math7b-2n8g-megatron.yaml | 1 + ...3b-instruct-clevr-1n8g-megatrontp2.v1.yaml | 2 +- examples/configs/rm.yaml | 56 ------------ examples/configs/sft.yaml | 87 ------------------- examples/configs/sft_openmathinstruct2.yaml | 3 - .../sft_openmathinstruct2_megatron.yaml | 3 +- examples/configs/vlm_grpo_3B_megatron.yaml | 1 - nemo_rl/models/megatron/recipe_config.py | 4 +- nemo_rl/models/megatron/setup.py | 38 ++++---- nemo_rl/models/policy/__init__.py | 10 +-- 41 files changed, 62 insertions(+), 572 deletions(-) delete mode 100755 9239646-attach.sh delete mode 100755 9239676-attach.sh delete mode 100755 9240549-attach.sh delete mode 100755 9261863-attach.sh diff --git a/9239646-attach.sh b/9239646-attach.sh deleted file mode 100755 index 8e318a4731..0000000000 --- a/9239646-attach.sh +++ /dev/null @@ -1,25 +0,0 @@ -# No args launches on the head node (node 0) -# Args 1-N launch on worker nodes (nodes 1 through N-1) -# Optional: set COMMAND='...' 
to run non-interactively instead of opening an interactive shell -WORKER_NUM=${1:-} -if [[ -z "$WORKER_NUM" ]]; then - # Empty means we are on the head node - if [[ -n "${COMMAND:-}" ]]; then - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00753" --jobid 9239646 bash -c "$COMMAND" - else - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00753" --jobid 9239646 --pty bash - fi -else - # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1) - # and use nodes_array[1] through nodes_array[N-1] - if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then - echo "Error: WORKER_NUM must be between 1 and 0" - exit 1 - fi - nodes_array=(pool0-00753) - if [[ -n "${COMMAND:-}" ]]; then - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239646 bash -c "$COMMAND" - else - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p batch --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239646 --pty bash - fi -fi diff --git a/9239676-attach.sh b/9239676-attach.sh deleted file mode 100755 index 68f99e49a6..0000000000 --- a/9239676-attach.sh +++ /dev/null @@ -1,25 +0,0 @@ -# No args launches on the head node (node 0) -# Args 1-N launch on worker nodes (nodes 1 through N-1) 
-# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell -WORKER_NUM=${1:-} -if [[ -z "$WORKER_NUM" ]]; then - # Empty means we are on the head node - if [[ -n "${COMMAND:-}" ]]; then - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01821" --jobid 9239676 bash -c "$COMMAND" - else - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01821" --jobid 9239676 --pty bash - fi -else - # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1) - # and use nodes_array[1] through nodes_array[N-1] - if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then - echo "Error: WORKER_NUM must be between 1 and 0" - exit 1 - fi - nodes_array=(pool0-01821) - if [[ -n "${COMMAND:-}" ]]; then - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239676 bash -c "$COMMAND" - else - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9239676 --pty bash - fi -fi diff --git a/9240549-attach.sh b/9240549-attach.sh deleted file mode 100755 index 429b0deb56..0000000000 --- a/9240549-attach.sh +++ /dev/null @@ -1,25 +0,0 @@ -# No args launches on the head node (node 0) -# 
Args 1-N launch on worker nodes (nodes 1 through N-1) -# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell -WORKER_NUM=${1:-} -if [[ -z "$WORKER_NUM" ]]; then - # Empty means we are on the head node - if [[ -n "${COMMAND:-}" ]]; then - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01736" --jobid 9240549 bash -c "$COMMAND" - else - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-01736" --jobid 9240549 --pty bash - fi -else - # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1) - # and use nodes_array[1] through nodes_array[N-1] - if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then - echo "Error: WORKER_NUM must be between 1 and 0" - exit 1 - fi - nodes_array=(pool0-01736) - if [[ -n "${COMMAND:-}" ]]; then - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9240549 bash -c "$COMMAND" - else - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9240549 --pty bash - fi -fi diff --git a/9261863-attach.sh b/9261863-attach.sh deleted file mode 100755 index d1d6280cb3..0000000000 --- a/9261863-attach.sh +++ /dev/null @@ -1,25 
+0,0 @@ -# No args launches on the head node (node 0) -# Args 1-N launch on worker nodes (nodes 1 through N-1) -# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell -WORKER_NUM=${1:-} -if [[ -z "$WORKER_NUM" ]]; then - # Empty means we are on the head node - if [[ -n "${COMMAND:-}" ]]; then - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00629" --jobid 9261863 bash -c "$COMMAND" - else - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-head --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "pool0-00629" --jobid 9261863 --pty bash - fi -else - # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1) - # and use nodes_array[1] through nodes_array[N-1] - if [[ $WORKER_NUM -lt 1 || $WORKER_NUM -ge 1 ]]; then - echo "Error: WORKER_NUM must be between 1 and 0" - exit 1 - fi - nodes_array=(pool0-00629) - if [[ -n "${COMMAND:-}" ]]; then - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9261863 bash -c "$COMMAND" - else - srun --no-container-mount-home --gres=gpu:8 -A coreai_dlalgo_nemorl -p interactive --overlap --container-name=ray-worker-$WORKER_NUM --container-workdir=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/sfawzy/nemo-rl --nodes=1 --ntasks=1 -w "${nodes_array[$WORKER_NUM]}" --jobid 9261863 --pty bash - fi -fi diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml 
index 67ff8a71d2..891976166d 100644 --- a/examples/configs/distillation_math.yaml +++ b/examples/configs/distillation_math.yaml @@ -84,77 +84,6 @@ policy: &POLICY_BASE foreach: False fused: False - megatron_cfg: &MEGATRON_BASE - enabled: false - empty_unused_memory_level: 0 - activation_checkpointing: false - converter_type: "Qwen3ForCausalLM" - tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 2 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 2 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 2.00001e-5 - min_lr: 2.0e-5 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 10 - lr_warmup_init: 2.0e-6 - - distributed_data_parallel_config: - 
grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - scheduler: - name: "torch.optim.lr_scheduler.LinearLR" kwargs: diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index c3d1dd901a..2865707fbb 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -35,9 +35,10 @@ policy: &POLICY_BASE make_sequence_length_divisible_by: ${mul:${mul:${.megatron_cfg.tensor_model_parallel_size}, ${.megatron_cfg.context_parallel_size}}, 2} + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config + megatron_cfg: &MEGATRON_BASE enabled: true - megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config empty_unused_memory_level: 0 activation_checkpointing: false converter_type: "Qwen3ForCausalLM" @@ -141,9 +142,9 @@ policy: &POLICY_BASE teacher: <<: *POLICY_BASE model_name: "Qwen/Qwen3-4B" + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config megatron_cfg: <<: *MEGATRON_BASE - megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config context_parallel_size: 2 tensor_model_parallel_size: 2 pipeline_model_parallel_size: 2 diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index f2b57b0bbd..ef21c555d2 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -106,78 +106,7 @@ policy: factor: 1.0 total_iters: 10000000000 - milestones: [20] - - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 - activation_checkpointing: false - tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - 
num_layers_in_last_pipeline_stage: null - sequence_parallel: true - freeze_moe_router: false - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 #4.0e-5 - min_lr: 5.0e-6 #4.0e-5 - weight_decay: 0.1 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-8 - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_warmup_iters: 1 - lr_warmup_init: 0.00000001 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - data_parallel_sharding_strategy: "optim_grads_params" - use_custom_fsdp: false - data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 90269726d7..35dbe01e79 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -105,81 +105,6 @@ policy: lora_A_init: "xavier" # Initialization method for LoRA A matrix: "xavier" or "uniform" use_triton: true # Use Triton-optimized kernels for LoRA (faster 
but requires flash-attn). Disable when tensor_parallel_size > 1 - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. - activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - 
lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - - fp8_cfg: null - - env_vars: null # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index a4d7592f80..671e0cbbb1 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -70,76 +70,9 @@ policy: sequence_length_round: 64 max_grad_norm: 1.0 - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - optimizer: null # remove default FSDP optimizer - - megatron_cfg: - enabled: true - megatron_recipe: null # Set to a fully qualified recipe path to use a Megatron-Bridge recipe, e.g. megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
- activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" + optimizer: null # remove default FSDP optimizer generation: backend: "vllm" diff --git a/examples/configs/grpo_math_70B_megatron.yaml b/examples/configs/grpo_math_70B_megatron.yaml index 4d17fdcea3..c89e4e57b8 100644 --- a/examples/configs/grpo_math_70B_megatron.yaml +++ 
b/examples/configs/grpo_math_70B_megatron.yaml @@ -22,6 +22,8 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config + megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/grpo_math_8B_megatron.yaml b/examples/configs/grpo_math_8B_megatron.yaml index 8d2ddfa90a..e52b3d2d3e 100644 --- a/examples/configs/grpo_math_8B_megatron.yaml +++ b/examples/configs/grpo_math_8B_megatron.yaml @@ -28,9 +28,10 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. converter_type: "LlamaForCausalLM" tensor_model_parallel_size: 1 diff --git a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml index 37616e32b0..81d812372e 100644 --- a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml +++ b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml @@ -26,6 +26,8 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config + megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml index 6fda3fe24e..6ae8b1ff1a 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml @@ 
-22,6 +22,7 @@ policy: ${.megatron_cfg.context_parallel_size}}, 2} megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config teacher: model_name: Qwen/Qwen3-32B dtensor_cfg: diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index a19a094bf5..44843ac0c1 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -18,9 +18,9 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config tensor_model_parallel_size: 4 logger: wandb_enabled: true diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 83fac04256..8e8b2a8a3d 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -20,9 +20,9 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config pipeline_model_parallel_size: 2 logger: wandb_enabled: true diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml index 0378daaa41..0523d30ac8 100644 --- 
a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml @@ -17,9 +17,9 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.deepseek.deepseek_v3.deepseek_v3_pretrain_config megatron_cfg: enabled: true - megatron_recipe: null # Can be set to megatron.bridge.recipes.deepseek.deepseek_v3.deepseek_v3_pretrain_config activation_checkpointing: true tensor_model_parallel_size: 8 expert_model_parallel_size: 32 diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml index b3dec78e98..58655c471e 100755 --- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml @@ -8,6 +8,7 @@ policy: model_name: openai/gpt-oss-20b train_micro_batch_size: 1 max_total_sequence_length: 4096 + megatron_recipe: megatron.bridge.recipes.openai.gptoss.gptoss_20b_pretrain_config megatron_cfg: enabled: true expert_model_parallel_size: 8 diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml index f2c4af29c5..8d21260fc6 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml @@ -16,9 +16,9 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config converter_type: LlamaForCausalLM pipeline_model_parallel_size: 2 activation_checkpointing: true diff --git 
a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml index 624ced10c5..4930f552c2 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml @@ -17,9 +17,9 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config converter_type: LlamaForCausalLM pipeline_model_parallel_size: 2 activation_checkpointing: true diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml index a3235a7b41..3133e9d3eb 100755 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -10,9 +10,9 @@ policy: tokenizer: name: meta-llama/Llama-3.2-1B-Instruct optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config scheduler: lr_warmup_iters: 50 dtensor_cfg: diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml index 728a711b48..f89d752e81 100644 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml @@ -10,9 +10,9 @@ policy: tokenizer: name: meta-llama/Llama-3.2-1B-Instruct optimizer: null + megatron_recipe: 
megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config scheduler: lr_warmup_iters: 50 dtensor_cfg: diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml index 92fb87c196..071623c60f 100644 --- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml +++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml @@ -20,6 +20,7 @@ policy: make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null scheduler: null + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config megatron_cfg: enabled: true converter_type: LlamaForCausalLM diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml index 83ea6128ef..2c20dcf8ba 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml @@ -20,6 +20,7 @@ policy: algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.nvidia.moonlight.moonlight_16b_a3b_pretrain_config megatron_cfg: enabled: true expert_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml index 86690abcc2..31ed4e07b0 100644 --- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml @@ -8,6 +8,7 @@ policy: tokenizer: name: nvidia/NVIDIA-Nemotron-Nano-12B-v2 optimizer: null + megatron_recipe: 
megatron.bridge.recipes.nvidia.nemotron.nemotron_nano_12b_v2_pretrain_config megatron_cfg: enabled: true bias_activation_fusion: false diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml index fd0a48a663..51b0a9c5b6 100755 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -14,6 +14,7 @@ policy: max_total_sequence_length: 4096 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_7b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 6e0aa5cd81..830a259e98 100755 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -17,6 +17,7 @@ policy: enabled: false algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml index d155afac75..aefc0a09ef 100644 --- a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml @@ -15,9 +15,9 @@ policy: enabled: false optimizer: null scheduler: null + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_8b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_8b_pretrain_config converter_type: 
Qwen3ForCausalLM tensor_model_parallel_size: 4 optimizer: diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index 33434d14e0..c638f8a85d 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -17,9 +17,9 @@ policy: max_total_sequence_length: 4096 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config tensor_model_parallel_size: 4 pipeline_model_parallel_size: 2 freeze_moe_router: true diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml index 6c2b1117f6..96ccf66d44 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml @@ -14,9 +14,9 @@ policy: chat_template: default dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config peft: enabled: true dim: 128 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index 257624dd9f..43e358acea 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -19,9 +19,9 @@ policy: enabled: true make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: 
megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config tensor_model_parallel_size: 2 pipeline_model_parallel_size: 2 optimizer: diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 100a87be03..d3ba4e5a28 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -17,9 +17,9 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config tensor_model_parallel_size: 2 pipeline_model_parallel_size: 2 optimizer: diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml index d3bdd77bb2..cfe381fd33 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml @@ -9,6 +9,7 @@ policy: max_total_sequence_length: 16384 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_math_7b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml index d301812f91..ac8e882875 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml @@ -7,8 +7,8 @@ policy: enabled: 
false dynamic_batching: enabled: false - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_vl_3b_pretrain_config logger: wandb: name: vlm-grpo-3b-megatron diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml index 4b0936fec5..49e56d11e8 100644 --- a/examples/configs/rm.yaml +++ b/examples/configs/rm.yaml @@ -73,62 +73,6 @@ policy: foreach: false fused: false - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 - activation_checkpointing: false - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - - optimizer: - optimizer: "adam" - lr: 2.0e-6 - min_lr: 1.9999e-6 - weight_decay: 0.1 - bf16: false - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 1.9999e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: false - data_parallel_sharding_strategy: "optim_grads_params" - - data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 
6d53d7f606..71d8c1cc84 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -88,93 +88,6 @@ policy: foreach: False fused: False - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - env_vars: {} - empty_unused_memory_level: 1 - activation_checkpointing: false - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - peft: - enabled: false - target_modules: [] - exclude_modules: [] - dim: 8 - alpha: 32 - dropout: 0.0 - dropout_position: "post" - lora_A_init_method: "xavier" - lora_B_init_method: "zero" - a2a_experimental: false - lora_dtype: None - - - optimizer: - optimizer: "adam" # When weight decay is set, it actually uses AdamW - lr: 5.0e-6 - min_lr: 4.9999e-6 - weight_decay: 0.1 # When weight decay is set, it actually uses AdamW - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: 
${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 4.9999e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - data_parallel_sharding_strategy: "optim_grads_params" - use_custom_fsdp: false - data: max_input_seq_length: ${policy.max_total_sequence_length} add_bos: true diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index 63fa6d65e4..00b7bbf8e7 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -39,9 +39,6 @@ policy: context_parallel_size: 1 custom_parallel_plan: null - megatron_cfg: - enabled: false - dynamic_batching: enabled: false diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml index 9df98169c8..2d137012ef 100644 --- a/examples/configs/sft_openmathinstruct2_megatron.yaml +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -32,8 +32,9 @@ policy: dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + megatron_cfg: - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config activation_checkpointing: false context_parallel_size: 1 distributed_data_parallel_config: diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml index 0c62a03954..9b0275ca47 100644 --- a/examples/configs/vlm_grpo_3B_megatron.yaml +++ b/examples/configs/vlm_grpo_3B_megatron.yaml @@ -77,7 +77,6 @@ policy: train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} sequence_length_round: 64 - 
make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} max_grad_norm: 1.0 sequence_packing: enabled: false diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py index ddcb7862e3..21861fc403 100644 --- a/nemo_rl/models/megatron/recipe_config.py +++ b/nemo_rl/models/megatron/recipe_config.py @@ -20,11 +20,11 @@ layer RL-specific settings on top. Recipes are specified via their fully qualified Python import path in the -YAML config under ``policy.megatron_cfg.megatron_recipe``. For example: +YAML config under ``policy.megatron_recipe``. For example: policy: + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: - megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config ... The import path is resolved at runtime using ``load_recipe()``. diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index cfe49b7ad6..956c6e5400 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -279,7 +279,9 @@ def setup_model_config( ) -> tuple[ConfigContainer, Any]: """Setup model configuration.""" model_cfg = None - megatron_recipe = config["megatron_cfg"].get("megatron_recipe") + megatron_recipe = config.get("megatron_recipe") or config.get( + "megatron_cfg", {} + ).get("megatron_recipe") if megatron_recipe: # Use Megatron-Bridge recipe specified in config @@ -331,27 +333,33 @@ def setup_model_config( # Apply performance settings _apply_performance_config(model_cfg, config) - # Validate optimizer configuration - _validate_optimizer_config(config) # Optional layernorm epsilon if "layernorm_epsilon" in config["megatron_cfg"]: model_cfg.layernorm_epsilon = config["megatron_cfg"]["layernorm_epsilon"] - # Validate chunking configuration - _validate_chunking_config(config) - # Create checkpoint configs checkpoint_config = _create_checkpoint_config(pretrained_path, weights_path) - # Validate 
training configuration - _validate_training_config(config, model_cfg) - # Update megatron config with checkpoint, optimizer, scheduler, etc. _update_megatron_config(megatron_cfg, checkpoint_config, config, hf_model_name) _validate_dtype_config(dtype, megatron_cfg.model, megatron_cfg.optimizer) + # Validate chunking configuration + _validate_chunking_config(config) + + # Validate optimizer configuration + _validate_optimizer_config(megatron_cfg) + + # Validate training configuration + _validate_training_config(megatron_cfg, model_cfg) + + if "make_sequence_length_divisible_by" not in config: + config["make_sequence_length_divisible_by"] = ( + model_cfg.tensor_model_parallel_size + ) + return megatron_cfg, model_cfg @@ -481,12 +489,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: ) -def _validate_optimizer_config(config: PolicyConfig) -> None: +def _validate_optimizer_config(megatron_cfg: ConfigContainer) -> None: """Validate optimizer configuration.""" - optimizer_cpu_offload = config["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"] - optimizer_offload_fraction = config["megatron_cfg"]["optimizer"][ - "optimizer_offload_fraction" - ] + optimizer_cpu_offload = megatron_cfg.optimizer.optimizer_cpu_offload + optimizer_offload_fraction = megatron_cfg.optimizer.optimizer_offload_fraction if optimizer_cpu_offload: # Currently, hybrid optimizer (partly on GPU and partly on CPU) is not supported because it conflicts with the way @@ -524,9 +530,9 @@ def _create_checkpoint_config( ) -def _validate_training_config(config: PolicyConfig, model_cfg: Any) -> None: +def _validate_training_config(megatron_cfg: ConfigContainer, model_cfg: Any) -> None: """Validate training configuration.""" - assert "train_iters" in config["megatron_cfg"], ( + assert megatron_cfg.train.train_iters is not None, ( "train_iters must be set in megatron_cfg. 
For an example, see " "https://github.com/NVIDIA-NeMo/RL/blob/bccbc377705a81a1f4b3c31ad9767bcc15f735a8/nemo_rl/algorithms/sft.py#L175-L179." ) diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index fa8d1d4501..d83a209f49 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -158,11 +158,6 @@ class MegatronConfigDisabled(TypedDict): class MegatronConfig(TypedDict): enabled: Literal[True] - # Fully qualified Python import path to a Megatron-Bridge recipe function. - # When set, the recipe is loaded at runtime to provide the base model configuration. - # When null/unset, configuration is loaded from the checkpoint's run_config.yaml. - # Example: "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config" - megatron_recipe: NotRequired[str | None] env_vars: NotRequired[dict[str, str] | None] # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. # Setting to 0 is faster, but you are more likely to run out of GPU memory. In SFT/DPO, the default is 0. @@ -261,6 +256,11 @@ class PolicyConfig(TypedDict): reward_model_cfg: NotRequired[RewardModelConfig] dtensor_cfg: DTensorConfig | DTensorConfigDisabled megatron_cfg: NotRequired[MegatronConfig | MegatronConfigDisabled] + # Fully qualified Python import path to a Megatron-Bridge recipe function. + # When set, the recipe is loaded at runtime to provide the base model configuration. + # When null/unset, configuration is loaded from the checkpoint's run_config.yaml. 
+ # Example: "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config" + megatron_recipe: NotRequired[str | None] hf_config_overrides: NotRequired[dict[str, Any]] dynamic_batching: DynamicBatchingConfig | DynamicBatchingConfigDisabled sequence_packing: NotRequired[SequencePackingConfig | SequencePackingConfigDisabled] From 35cb86e01a723a2ffc048c917aeb3bcecaac5ca0 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 6 Feb 2026 10:17:12 -0800 Subject: [PATCH 4/8] fix. --- nemo_rl/models/megatron/setup.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 956c6e5400..19963ede7f 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -215,9 +215,7 @@ def validate_and_set_config( } dtype = dtype_map[config["precision"]] - # Optimizer configuration - optimizer_cpu_offload = config["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"] - offload_optimizer_for_logprob = config["offload_optimizer_for_logprob"] + # Reward models are not yet supported with Megatron. if "reward_model_cfg" in config and config["reward_model_cfg"]["enabled"]: @@ -234,11 +232,14 @@ def validate_and_set_config( pretrained_path=pretrained_path, weights_path=weights_path, ) + # Optimizer configuration + optimizer_cpu_offload = megatron_cfg.optimizer.optimizer_cpu_offload + offload_optimizer_for_logprob = config["offload_optimizer_for_logprob"] final_padded_vocab_size = calculate_padded_vocab_size( megatron_cfg.model.vocab_size, megatron_cfg.model.make_vocab_size_divisible_by, - config["megatron_cfg"]["tensor_model_parallel_size"], + megatron_cfg.model.tensor_model_parallel_size, ) return RuntimeConfig( From 0b88cccc042bec07b5f3435b7fd135669b6e5ddd Mon Sep 17 00:00:00 2001 From: Sherif Fawzy Date: Mon, 9 Feb 2026 11:16:09 -0800 Subject: [PATCH 5/8] recipes/llm/performance complete. 
--- .../.grpo-deepseek-v3-32n4g.yaml.swp | Bin 12288 -> 0 bytes .../.grpo-deepseek-v3-32n8g.yaml.swp | Bin 12288 -> 0 bytes .../llm/performance/dapo-deepseek-v3-64n8g.yaml | 1 + .../llm/performance/grpo-deepseek-v3-32n4g.yaml | 5 +++++ .../llm/performance/grpo-deepseek-v3-32n8g.yaml | 1 + .../grpo-deepseek-v3-64n4g-async-1off.yaml | 5 +++++ .../grpo-deepseek-v3-64n8g-async-1off.yaml | 4 ++++ .../grpo-deepseek-v3-64n8g-fp8-async-1off.yaml | 9 +++++++++ ...po-llama3.1-8b-instruct-2n4g-async-1off.yaml | 4 ++++ .../grpo-llama3.1-8b-instruct-2n4g.yaml | 1 + ...po-llama3.1-8b-instruct-2n8g-async-1off.yaml | 2 ++ ...lama3.1-8b-instruct-2n8g-fp8-async-1off.yaml | 8 ++++++++ .../grpo-llama3.1-8b-instruct-2n8g.yaml | 1 + .../llm/performance/grpo-qwen3-235b-16n4g.yaml | 4 ++++ .../llm/performance/grpo-qwen3-235b-16n8g.yaml | 1 + .../grpo-qwen3-235b-32n4g-async-1off.yaml | 4 ++++ .../grpo-qwen3-235b-32n8g-async-1off.yaml | 7 +++++++ .../grpo-qwen3-30ba3b-24n8g-async-8off.yaml | 5 +++++ .../llm/performance/grpo-qwen3-30ba3b-4n4g.yaml | 1 + .../performance/grpo-qwen3-30ba3b-4n8g-40K.yaml | 1 + .../grpo-qwen3-30ba3b-4n8g-async-1off.yaml | 5 +++++ .../llm/performance/grpo-qwen3-30ba3b-4n8g.yaml | 1 + .../grpo-qwen3-30ba3b-8n4g-async-1off.yaml | 5 +++++ .../llm/performance/grpo-qwen3-32b-4n4g.yaml | 1 + .../llm/performance/grpo-qwen3-32b-4n8g.yaml | 1 + .../grpo-qwen3-32b-8n4g-async-1off.yaml | 4 ++++ .../grpo-qwen3-32b-8n8g-async-1off.yaml | 4 ++++ 27 files changed, 85 insertions(+) delete mode 100644 examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp delete mode 100644 examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp diff --git a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp b/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp deleted file mode 100644 index 287b7b097343d270e8223669de52de4e6ab3a829..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 
literal 12288 zcmeI2!EV$r5Qe?nOVJ|krW|26-OU0eJOL6Mi^PSIE@?9no89n@gSAMwbo7Iz*p8c z!bnJJ@9d-mZ~N#ho$q`ez&@}cbDHD9}WjJ69FPX z1c(3;AOb{y2oM1x@E;I}Zp7YWoj3Yg@AUKN(%dg;A_7E!2oM1xKm>>Y5g-CYfCvx) zB0vQGK?0&=?Bfn&=cw-g|M30)eV4Iss86U5s8f_ey+S=ijZr^tGWHeq1@#$qhB`$Z zqMoDfqxMj{s4o7`E(Z-FKm>>Y5g-CYfCvx)B0vO)01@~b1UNTslN(z?$ap$uoY$@m zIW}IeH#5FqQnw+3&+rW0qJfNW@5M&Lu`p%PIkm0xQu(ZV2v_uOGd!z@z??@g!TQ_@ zFSG`o2lWn9t?$J75w?TpvZ}k-H5l+BDr@>XVFg;Oz=vEJdfY^VyDL8XCl`{MCAA-p;=FnfFGr{POYL8@K4y z;e_D2NXYiZ>&<75Y!!c9BF>l)hyUr~v!cxNf?vre)p@p~1LNl2@;^m`(L=JzR1n&--SYO9T>u zHGyq1n@+~@>G|iLrI#vh z-#T?)J*HbCkO(9Ki9jNd2qXfDKq8O`Bm#**B9I9D2MO?!5b-!6{0TUZ|NpQ5{{QJ| zLVke0fB% z7ocCBB;-rz0rVb}LwBIppqHV`(C-%r`4RdC`ULt2`T+8ffcBsP^aAuJ;`t5w4*C}Q z9QqKt58Z=eyl+A2N(2&tL?97J1QLNnAQAYN2~etA#k47f&**GQD6MVl8CpBL!Hn*b zLbg5#mthXpT#1aH-R;>KSZCfAMabx?g%pDaT$g!#JqYcM%MEuW6Po8zlo<`K6>Fo1 zMy07)*|?127ypPRQ-ZmA?H{GwFLi;1>PExU)#?;i|8-`$<5CLAeDe+rx|k_lprMr& zRUO@|g%*y7#^^p~N{^haxb)FmN-Mj&wvUg}u>|eiywa_5w{_E*Ut-wcjpdsI_6^U&P7-dpXZ3Hzhh(cpb2HhABi z4aT$SmBF>Wj%hG+p}QQ7u5{@8gW~)bt&vT!j77HD&A!~7@U0AwtoV(2+5-|A`Y!acuD9THiYVH;l$E Ang9R* diff --git a/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml b/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml index 9c4edd2b30..2bfaf20955 100644 --- a/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml +++ b/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml @@ -40,6 +40,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${mul:${policy.dtensor_cfg.tensor_parallel_size}, ${mul:2, ${policy.dtensor_cfg.context_parallel_size}}} + megatron_recipe: megatron_bridge.recipes.deepseek.deepseek_v3_pretrain_config megatron_cfg: empty_unused_memory_level: 2 enabled: true diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml index 9b157ea779..890124d3e0 100644 --- 
a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml @@ -7,6 +7,11 @@ policy: generation: vllm_cfg: tensor_parallel_size: 32 + megatron_cfg: + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 logger: log_dir: logs/grpo-deepseek-v3-32n4g wandb: diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml index 75457ab802..7965f72764 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml @@ -19,6 +19,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.deepseek.deepseek_v3_pretrain_config_32nodes megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml index ff3e68d7da..bf9a30a5d3 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml @@ -4,6 +4,11 @@ checkpointing: policy: sequence_packing: enabled: false + megatron_cfg: + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 generation: colocated: resources: diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml index 0260eb39e6..595654a3a3 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml +++ 
b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-async-1off.yaml @@ -10,6 +10,10 @@ checkpointing: checkpoint_dir: results/grpo-deepseek-v3-64n8g-async-1off policy: logprob_batch_size: 2 + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 16 + expert_model_parallel_size: 16 generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml index b8f8ece97d..7f6b5ae86b 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml @@ -2,6 +2,15 @@ defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml checkpointing: checkpoint_dir: results/grpo-deepseek-v3-64n8g-fp8-async-1off policy: + megatron_cfg: + fp8_cfg: + enabled: true + fp8: "e4m3" + fp8_recipe: "blockwise" + fp8_param: false + moe_router_dtype: fp32 + env_vars: + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" generation: vllm_cfg: tensor_parallel_size: 16 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml index ba06869c7e..d906eda2b4 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml @@ -9,6 +9,10 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g-async-1off policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + sequence_parallel: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml index 
a99f7c1498..e3c9e25c85 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml @@ -17,6 +17,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml index b6d7ed441d..c0263f68fb 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml @@ -9,6 +9,8 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off policy: + megatron_cfg: + pipeline_model_parallel_size: 1 generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml index 1ab2bafaf4..b32786f7d7 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml @@ -2,6 +2,14 @@ defaults: ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml checkpointing: checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off policy: + megatron_cfg: + fp8_cfg: + enabled: true + fp8: "e4m3" + fp8_recipe: "blockwise" + fp8_param: false + env_vars: + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" generation: vllm_cfg: precision: "fp8" diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml 
b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml index afdbf8c414..fb0f103855 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml @@ -17,6 +17,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml index d542091951..1640deda09 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml @@ -2,6 +2,10 @@ defaults: ./grpo-qwen3-235b-16n8g.yaml checkpointing: checkpoint_dir: results/grpo-qwen3-235b-16n4g policy: + megatron_cfg: + pipeline_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: 23 + num_layers_in_last_pipeline_stage: 23 generation: vllm_cfg: tensor_parallel_size: 8 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml index 1376c8d340..e2e02de396 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml @@ -19,6 +19,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_235b_a22b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml index 13d8501363..f55b383686 100644 --- 
a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml @@ -2,6 +2,10 @@ defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml checkpointing: checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off policy: + megatron_cfg: + pipeline_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: 23 + num_layers_in_last_pipeline_stage: 23 generation: colocated: resources: diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml index 4545aa364a..cf4f5a6f98 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g-async-1off.yaml @@ -9,6 +9,13 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-235b-32n8g-async-1off policy: + megatron_cfg: + tensor_model_parallel_size: 4 + sequence_parallel: true + context_parallel_size: 1 + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + defer_fp32_logits: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml index b4d5409a61..11d917fc8b 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml @@ -9,6 +9,11 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-24n8g-async-8off policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 8 + sequence_parallel: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml 
b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml index 21b9746f4b..c4749c0faf 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml index 2270d5e272..d2a4eb24b5 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml index 797244576f..4cc5981460 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.yaml @@ -9,6 +9,11 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g-async-1off policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 2 + expert_model_parallel_size: 8 + sequence_parallel: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml 
b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml index 795764d3ee..6a029c6fde 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml @@ -7,6 +7,7 @@ checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g policy: model_name: Qwen/Qwen3-30B-A3B + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config train_micro_batch_size: 1 max_total_sequence_length: 4096 dtensor_cfg: diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml index 6da999f169..a9837c87f2 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml @@ -9,6 +9,11 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-8n4g-async-1off policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 16 + sequence_parallel: false generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml index 2e441cdb5f..d17dad323a 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml index ad780ebc50..7b33ced71a 100644 --- 
a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml index f50db7fde8..4f8a0a03bb 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml @@ -9,6 +9,10 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-32b-8n4g-async-1off policy: + megatron_cfg: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: true generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml index a54a8e7747..9f20f34f40 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n8g-async-1off.yaml @@ -9,6 +9,10 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-qwen3-32b-8n8g-async-1off policy: + megatron_cfg: + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + sequence_parallel: true generation: colocated: enabled: false From 64e681b29927c1d8e6475a5001e16db1fa56d7ff Mon Sep 17 00:00:00 2001 From: Sherif Fawzy Date: Mon, 9 Feb 2026 12:52:44 -0800 Subject: [PATCH 6/8] recipe/llm done. 
--- examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml | 1 + ...2b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml | 6 ++++++ ...2b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml | 1 + ...ama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml | 4 ++++ .../llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml | 7 +++++++ .../recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml | 3 +++ .../recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml | 2 +- .../llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml | 2 +- .../llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml | 6 ++++++ .../grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml | 1 + .../llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml | 2 +- .../recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml | 3 +++ .../recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml | 2 +- .../llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml | 2 ++ .../llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml | 2 +- .../recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml | 4 ++++ .../recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 2 +- ...grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml | 2 +- .../sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml | 2 ++ .../llm/sft-qwen2.5-math7b-2n4g-megatron.yaml | 4 ++++ .../llm/sft-qwen2.5-math7b-2n8g-megatron.yaml | 2 +- ...2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml | 12 +++++++++++- 22 files changed, 63 insertions(+), 9 deletions(-) diff --git a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml index 9035a3598c..8f615b4361 100644 --- a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml +++ b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml @@ -34,6 +34,7 @@ policy: dtensor_cfg: _v2: false context_parallel_size: 4 + megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config megatron_cfg: tensor_model_parallel_size: 4 pipeline_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml 
b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml index 31100ce7b9..95c9e85573 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack.yaml @@ -1,4 +1,10 @@ defaults: ./distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml +policy: + megatron_cfg: + tensor_model_parallel_size: 1 +teacher: + megatron_cfg: + tensor_model_parallel_size: 2 checkpointing: checkpoint_dir: checkpoints/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack logger: diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml index 6ae8b1ff1a..d8cce7d5d0 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml @@ -31,6 +31,7 @@ teacher: enabled: false sequence_packing: enabled: true + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml index 1f75679f39..8324173dfc 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick.yaml @@ -1,4 +1,8 @@ defaults: ./dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +policy: + megatron_cfg: + tensor_model_parallel_size: 1 + sequence_parallel: false logger: wandb: name: dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick diff 
--git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml index 627a00574e..fb4a4bc880 100644 --- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-32n4g-megatron.yaml @@ -1,5 +1,12 @@ defaults: ./grpo-dapomath17k-dsv3-megatron.yaml policy: + megatron_cfg: + tensor_model_parallel_size: 4 + expert_model_parallel_size: 16 + pipeline_model_parallel_size: 4 + context_parallel_size: 2 + num_layers_in_first_pipeline_stage: 15 + num_layers_in_last_pipeline_stage: 14 make_sequence_length_divisible_by: 4 generation: vllm_cfg: diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml index ef033bc67f..c9719f381f 100644 --- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n4g-megatron.yaml @@ -1,5 +1,8 @@ defaults: ./grpo-gptoss-20b-8n8g-megatron.yaml policy: + megatron_cfg: + expert_model_parallel_size: 4 + tensor_model_parallel_size: 2 generation: vllm_cfg: tensor_parallel_size: 1 diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml index 58655c471e..4f2a8ee3ec 100755 --- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml @@ -8,7 +8,7 @@ policy: model_name: openai/gpt-oss-20b train_micro_batch_size: 1 max_total_sequence_length: 4096 - megatron_recipe: megatron.bridge.recipes.openai.gptoss.gptoss_20b_pretrain_config + megatron_recipe: megatron.bridge.recipes.openai.gpt_oss.gpt_oss_20b_pretrain_config megatron_cfg: enabled: true expert_model_parallel_size: 8 diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml 
b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml index 071623c60f..5c8d8594fd 100644 --- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml +++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml @@ -20,7 +20,7 @@ policy: make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null scheduler: null - megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_finetune_config megatron_cfg: enabled: true converter_type: LlamaForCausalLM diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml index 4459adc9dd..97d6ffede7 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n4g-megatron.yaml @@ -1,6 +1,12 @@ defaults: ./grpo-moonlight-16ba3b-4n8g-megatron.yaml checkpointing: checkpoint_dir: results/grpo-moonlight-16ba3b-4n4g-megatron +policy: + megatron_cfg: + expert_model_parallel_size: 2 + pipeline_model_parallel_size: 2 + num_layers_in_first_pipeline_stage: 14 + num_layers_in_last_pipeline_stage: 13 logger: wandb: name: grpo-moonlight-16ba3b-4n4g-megatron diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml index 27108c55c7..951bb0371f 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml @@ -18,6 +18,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.moonlight.moonlight_16b_pretrain_config megatron_cfg: enabled: true 
moe_router_dtype: fp32 diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml index 2c20dcf8ba..8674bdf00a 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml @@ -20,7 +20,7 @@ policy: algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null - megatron_recipe: megatron.bridge.recipes.nvidia.moonlight.moonlight_16b_a3b_pretrain_config + megatron_recipe: megatron.bridge.recipes.moonlight.moonlight_16b_pretrain_config megatron_cfg: enabled: true expert_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml index ba30e6490e..da8301a19b 100644 --- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n4g-megatron.yaml @@ -1,6 +1,9 @@ defaults: ./grpo-nano-v2-12b-1n8g-megatron.yaml checkpointing: checkpoint_dir: results/grpo-nano-v2-12b-1n4g-megatron +policy: + megatron_cfg: + tensor_model_parallel_size: 4 logger: log_dir: logs/grpo-nano-v2-12b-1n4g-megatron wandb: diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml index 31ed4e07b0..cd7a7c8b96 100644 --- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml @@ -8,7 +8,7 @@ policy: tokenizer: name: nvidia/NVIDIA-Nemotron-Nano-12B-v2 optimizer: null - megatron_recipe: megatron.bridge.recipes.nvidia.nemotron.nemotron_nano_12b_v2_pretrain_config + megatron_recipe: megatron.bridge.recipes.nemotronh.nemotron_nano_12b_v2_pretrain_config megatron_cfg: enabled: true bias_activation_fusion: false diff --git 
a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml index 4029f002e8..b21c9dd51f 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n4g-megatron.yaml @@ -1,5 +1,7 @@ defaults: ./grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml policy: + megatron_cfg: + tensor_model_parallel_size: 1 make_sequence_length_divisible_by: 2 generation: vllm_cfg: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml index 51b0a9c5b6..e37c892929 100755 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -14,7 +14,7 @@ policy: max_total_sequence_length: 4096 dtensor_cfg: enabled: false - megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_7b_pretrain_config + megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen25_7b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml index f26cbf49b2..79fbda389d 100644 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n4g-megatron.yaml @@ -1,5 +1,9 @@ defaults: ./grpo-qwen3-30ba3b-8n8g-megatron.yaml policy: + megatron_cfg: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 2 + expert_model_parallel_size: 2 make_sequence_length_divisible_by: 2 generation: vllm_cfg: diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 830a259e98..c7f3eca79f 100755 --- 
a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -17,7 +17,7 @@ policy: enabled: false algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml index aefc0a09ef..777100853f 100644 --- a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml @@ -15,7 +15,7 @@ policy: enabled: false optimizer: null scheduler: null - megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_8b_pretrain_config + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_8b_pretrain_config megatron_cfg: enabled: true converter_type: Qwen3ForCausalLM diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml index 798d5e0617..77c175fadf 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron.yaml @@ -1,5 +1,7 @@ defaults: ./sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml policy: + megatron_cfg: + tensor_model_parallel_size: 2 make_sequence_length_divisible_by: 2 checkpointing: checkpoint_dir: results/sft-llama3.1-70b-8n4g-tp2pp2-long-megatron diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml index 903db6113b..aad3f5c8e0 100644 --- 
a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n4g-megatron.yaml @@ -1,4 +1,8 @@ defaults: ./sft-qwen2.5-math7b-2n8g-megatron.yaml +policy: + megatron_cfg: + tensor_model_parallel_size: 2 + context_parallel_size: 1 logger: wandb: name: sft-qwen2.5-math7b-2n4g-megatron diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml index cfe381fd33..0b3388f915 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml @@ -9,7 +9,7 @@ policy: max_total_sequence_length: 16384 dtensor_cfg: enabled: false - megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_math_7b_pretrain_config + megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml index ac8e882875..45188bc54e 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml @@ -7,8 +7,18 @@ policy: enabled: false dynamic_batching: enabled: false + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null - megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen2_5_vl_3b_pretrain_config + megatron_cfg: + enabled: true + optimizer: + lr: 5.0e-07 + min_lr: 5.0e-08 + scheduler: + lr_warmup_iters: 50 + lr_warmup_init: 5.0e-08 + distributed_data_parallel_config: + overlap_grad_reduce: false logger: wandb: name: vlm-grpo-3b-megatron From ef54ee32bef8d591feea94be3e1c3d6e8e891ca6 Mon Sep 17 00:00:00 2001 From: Sherif 
Fawzy Date: Mon, 9 Feb 2026 13:37:13 -0800 Subject: [PATCH 7/8] more fixes. --- examples/configs/grpo_math_70B_megatron_fp8.yaml | 13 ++++++++++++- examples/configs/grpo_math_qwen30ba3b_megatron.yaml | 2 +- nemo_rl/models/megatron/recipe_config.py | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/examples/configs/grpo_math_70B_megatron_fp8.yaml b/examples/configs/grpo_math_70B_megatron_fp8.yaml index 322aa6c0f2..df239cd8ff 100644 --- a/examples/configs/grpo_math_70B_megatron_fp8.yaml +++ b/examples/configs/grpo_math_70B_megatron_fp8.yaml @@ -8,4 +8,15 @@ policy: generation: vllm_cfg: precision: "fp8" - use_deep_gemm: true \ No newline at end of file + use_deep_gemm: true + megatron_cfg: + pipeline_model_parallel_size: 8 + fp8_cfg: + enabled: true + fp8: "e4m3" + fp8_recipe: "blockwise" + fp8_param: false + optimizer: + use_precision_aware_optimizer: false + env_vars: + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" \ No newline at end of file diff --git a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml index 81d812372e..2d4f0f3151 100644 --- a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml +++ b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml @@ -26,7 +26,7 @@ policy: scheduler: null # remove default FSDP scheduler - megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_30b_pretrain_config + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_finetune_config megatron_cfg: enabled: true diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py index 21861fc403..4bf3d900fd 100644 --- a/nemo_rl/models/megatron/recipe_config.py +++ b/nemo_rl/models/megatron/recipe_config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 1ee4ebb198dc22b091db5f48f2fb34fa6f5cf77d Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 Feb 2026 20:45:58 -0800 Subject: [PATCH 8/8] Multiple fixes. --- nemo_rl/models/megatron/data.py | 23 ++++-- nemo_rl/models/megatron/setup.py | 82 +++++++++++-------- .../policy/workers/megatron_policy_worker.py | 11 ++- 3 files changed, 69 insertions(+), 47 deletions(-) diff --git a/nemo_rl/models/megatron/data.py b/nemo_rl/models/megatron/data.py index f884e95e1b..87ffd9e83f 100644 --- a/nemo_rl/models/megatron/data.py +++ b/nemo_rl/models/megatron/data.py @@ -128,6 +128,7 @@ def get_microbatch_iterator( mbs: int, straggler_timer: StragglerDetector, seq_length_key: Optional[str] = None, + model_cfg: Optional[Any] = None, ) -> Tuple[Iterator[ProcessedMicrobatch], int, int, int, int]: """Create a processed microbatch iterator from a batch of data. @@ -140,6 +141,8 @@ def get_microbatch_iterator( cfg: Configuration dictionary mbs: Microbatch size seq_length_key: Key for sequence lengths in data dict (auto-detected if None) + model_cfg: Optional Megatron model config (ConfigContainer). When provided, + parallelism settings are read from here instead of the raw config dict. Returns: Tuple containing the iterator and metadata @@ -175,6 +178,7 @@ def get_microbatch_iterator( ) = _get_pack_sequence_parameters_for_megatron( cfg["megatron_cfg"], pack_seq_dim_size, + model_cfg=model_cfg, ) micro_batch_size = 1 else: @@ -528,12 +532,15 @@ def _pack_sequences_for_megatron( def _get_pack_sequence_parameters_for_megatron( megatron_cfg: dict, max_seq_len_in_batch: int, + model_cfg: Optional[Any] = None, ): """Get pack sequence parameters for Megatron model processing with optional context parallelism. 
Args: - megatron_cfg: Megatron configuration + megatron_cfg: Megatron configuration dict (from YAML) max_seq_len_in_batch: Maximum sequence length in batch + model_cfg: Optional Megatron model config (ConfigContainer). When provided, + parallelism settings are read from here instead of the raw config dict. Returns: Tuple of: @@ -541,10 +548,16 @@ def _get_pack_sequence_parameters_for_megatron( - pad_packed_seq_to_multiple_of: Pad packed sequences to a multiple of this value - pad_packed_seq_to: Pad packed sequences to this value (before CP) """ - tp_size = megatron_cfg["tensor_model_parallel_size"] - sp = megatron_cfg["sequence_parallel"] - pp_size = megatron_cfg["pipeline_model_parallel_size"] - cp_size = megatron_cfg["context_parallel_size"] + if model_cfg is not None: + tp_size = model_cfg.tensor_model_parallel_size + sp = model_cfg.sequence_parallel + pp_size = model_cfg.pipeline_model_parallel_size + cp_size = model_cfg.context_parallel_size + else: + tp_size = megatron_cfg["tensor_model_parallel_size"] + sp = megatron_cfg.get("sequence_parallel", False) + pp_size = megatron_cfg["pipeline_model_parallel_size"] + cp_size = megatron_cfg["context_parallel_size"] fp8_cfg = megatron_cfg.get("fp8_cfg", None) or {} use_fp8 = fp8_cfg.get("enabled", False) use_blockwise_fp8 = fp8_cfg.get("fp8_recipe", None) == "blockwise" diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 19963ede7f..b9eb13c6e3 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -372,13 +372,13 @@ def _apply_parallelism_config(model_cfg: Any, config: PolicyConfig) -> None: model_cfg.pipeline_model_parallel_size = config["megatron_cfg"][ "pipeline_model_parallel_size" ] - model_cfg.num_layers_in_first_pipeline_stage = config["megatron_cfg"][ - "num_layers_in_first_pipeline_stage" - ] - model_cfg.num_layers_in_last_pipeline_stage = config["megatron_cfg"][ - "num_layers_in_last_pipeline_stage" - ] - model_cfg.sequence_parallel = 
config["megatron_cfg"]["sequence_parallel"] + model_cfg.num_layers_in_first_pipeline_stage = config["megatron_cfg"].get( + "num_layers_in_first_pipeline_stage", None + ) + model_cfg.num_layers_in_last_pipeline_stage = config["megatron_cfg"].get( + "num_layers_in_last_pipeline_stage", None + ) + model_cfg.sequence_parallel = config["megatron_cfg"].get("sequence_parallel", False) model_cfg.context_parallel_size = config["megatron_cfg"]["context_parallel_size"] if model_cfg.context_parallel_size > 1: @@ -389,41 +389,49 @@ def _apply_parallelism_config(model_cfg: Any, config: PolicyConfig) -> None: def _apply_moe_config(model_cfg: Any, config: PolicyConfig) -> None: """Apply Mixture of Experts configuration.""" - model_cfg.expert_tensor_parallel_size = config["megatron_cfg"][ - "expert_tensor_parallel_size" - ] - model_cfg.expert_model_parallel_size = config["megatron_cfg"][ - "expert_model_parallel_size" - ] + megatron_cfg = config["megatron_cfg"] + model_cfg.expert_tensor_parallel_size = megatron_cfg.get( + "expert_tensor_parallel_size", 1 + ) + model_cfg.expert_model_parallel_size = megatron_cfg.get( + "expert_model_parallel_size", 1 + ) # MoE stability settings # Setting moe_router_dtype to higher precision (e.g. fp64) can improve numerical stability, # especially when using many experts. - model_cfg.moe_router_dtype = config["megatron_cfg"]["moe_router_dtype"] + if "moe_router_dtype" in megatron_cfg: + model_cfg.moe_router_dtype = megatron_cfg["moe_router_dtype"] # The below two configs (and "freeze_moe_router") are used to stabilize moe training # by preventing updates to the moe router. We found that this is helpful in reducing # logprob error during training. # Set this to "none" to disable load balancing loss. 
- model_cfg.moe_router_load_balancing_type = config["megatron_cfg"][ - "moe_router_load_balancing_type" - ] + if "moe_router_load_balancing_type" in megatron_cfg: + model_cfg.moe_router_load_balancing_type = megatron_cfg[ + "moe_router_load_balancing_type" + ] # Set this to 0.0 to disable updates to the moe router expert bias - model_cfg.moe_router_bias_update_rate = config["megatron_cfg"][ - "moe_router_bias_update_rate" - ] + if "moe_router_bias_update_rate" in megatron_cfg: + model_cfg.moe_router_bias_update_rate = megatron_cfg[ + "moe_router_bias_update_rate" + ] - model_cfg.moe_enable_deepep = config["megatron_cfg"]["moe_enable_deepep"] - model_cfg.moe_token_dispatcher_type = config["megatron_cfg"][ - "moe_token_dispatcher_type" - ] - model_cfg.moe_shared_expert_overlap = config["megatron_cfg"][ - "moe_shared_expert_overlap" - ] + if "moe_enable_deepep" in megatron_cfg: + model_cfg.moe_enable_deepep = megatron_cfg["moe_enable_deepep"] + if "moe_token_dispatcher_type" in megatron_cfg: + model_cfg.moe_token_dispatcher_type = megatron_cfg[ + "moe_token_dispatcher_type" + ] + if "moe_shared_expert_overlap" in megatron_cfg: + model_cfg.moe_shared_expert_overlap = megatron_cfg[ + "moe_shared_expert_overlap" + ] - model_cfg.moe_permute_fusion = config["megatron_cfg"]["moe_permute_fusion"] + if "moe_permute_fusion" in megatron_cfg: + model_cfg.moe_permute_fusion = megatron_cfg["moe_permute_fusion"] def _apply_precision_config( @@ -454,8 +462,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: """Apply performance optimization configuration.""" model_cfg.parallel_output = True + megatron_cfg = config["megatron_cfg"] + # Activation checkpointing - if config["megatron_cfg"]["activation_checkpointing"]: + if megatron_cfg.get("activation_checkpointing", False): model_cfg.recompute_granularity = "full" model_cfg.recompute_method = "uniform" model_cfg.recompute_num_layers = 1 @@ -470,8 +480,10 @@ def _apply_performance_config(model_cfg: Any, 
config: PolicyConfig) -> None: ) # Fusion settings - model_cfg.apply_rope_fusion = config["megatron_cfg"]["apply_rope_fusion"] - model_cfg.bias_activation_fusion = config["megatron_cfg"]["bias_activation_fusion"] + if "apply_rope_fusion" in megatron_cfg: + model_cfg.apply_rope_fusion = megatron_cfg["apply_rope_fusion"] + if "bias_activation_fusion" in megatron_cfg: + model_cfg.bias_activation_fusion = megatron_cfg["bias_activation_fusion"] # FP8 configuration fp8_cfg = config["megatron_cfg"].get("fp8_cfg", None) @@ -741,7 +753,7 @@ def setup_model_and_optimizer( use_peft = policy_cfg["megatron_cfg"].get("peft", {}).get("enabled", False) mixed_precision_wrapper = Float16Module - if policy_cfg["megatron_cfg"]["freeze_moe_router"]: + if policy_cfg["megatron_cfg"].get("freeze_moe_router", False): if use_peft: raise ValueError( "Freezing the MOE router is not currently supported when using PEFT" @@ -1008,10 +1020,8 @@ def finalize_megatron_setup( ) should_disable_forward_pre_hook = ( - config["megatron_cfg"]["optimizer"]["use_distributed_optimizer"] - and config["megatron_cfg"]["distributed_data_parallel_config"][ - "overlap_param_gather" - ] + megatron_cfg.optimizer.use_distributed_optimizer + and megatron_cfg.ddp.overlap_param_gather ) return megatron_tokenizer, megatron_bridge, should_disable_forward_pre_hook, dp_size diff --git a/nemo_rl/models/policy/workers/megatron_policy_worker.py b/nemo_rl/models/policy/workers/megatron_policy_worker.py index 83d541f2ea..ef23ff556e 100644 --- a/nemo_rl/models/policy/workers/megatron_policy_worker.py +++ b/nemo_rl/models/policy/workers/megatron_policy_worker.py @@ -278,19 +278,15 @@ def __init__( self.model, self.optimizer, ) - print("HELLO") + # Dump ConfigContainer to YAML for inspection (only on rank 0) if self.rank == 0: - config_dump_path = "/lustre/fsw/portfolios/coreai/users/sfawzy/final_megatron_config_6.yaml" + config_dump_path = "/lustre/fsw/portfolios/coreai/users/sfawzy/final_megatron_config.yaml" try: 
self.megatron_cfg.to_yaml(config_dump_path) print(f"[DEBUG] Saved final ConfigContainer to: {config_dump_path}") except Exception as e: print(f"[WARNING] Failed to save ConfigContainer to YAML: {e}") - # Exit early after dumping config for inspection - import sys - print("[DEBUG] Exiting after ConfigContainer dump") - sys.exit(0) # vars used for refit ## will be initialized in prepare_refit_info @@ -385,6 +381,7 @@ def train( self.cfg, mbs, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) # Track total microbatches for MoE aux-loss averaging total_num_microbatches += int(num_microbatches) @@ -569,6 +566,7 @@ def get_logprobs( self.cfg, logprob_batch_size, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) def forward_step_fn( @@ -776,6 +774,7 @@ def get_topk_logits( self.cfg, logprob_batch_size, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) def forward_step_fn(