Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
687538d
Add refactored recipe for qwen2, qwen3
athitten Jan 23, 2026
c058e98
Add refactored recipe for qwen3_moe, qwen3_next
athitten Jan 23, 2026
607c76e
Add refactored recipe for llama2, llama3
athitten Jan 24, 2026
3aee7d6
Add deepseek_v2, deepseek_v3 refactored recipe
athitten Jan 24, 2026
98875bc
Add refactored recipe for gemma2, gemma3
athitten Jan 24, 2026
5a639b1
Add refactored recipe for glm45
athitten Jan 25, 2026
7bd57e5
Add GPT OSS refactored recipe
athitten Jan 25, 2026
e56fbf5
Add kimi k2 refactored recipe
athitten Jan 25, 2026
127efed
Add refactored recipe for moonlight_16b
athitten Jan 26, 2026
7fd9fd8
Add refactored recipe nemotron nano v2
athitten Jan 26, 2026
ea7120a
Add refactored recipe for Nemotron-H
athitten Jan 26, 2026
b571a30
Add OLMoE 7B refactored config
athitten Jan 26, 2026
6c05491
Add refactored recipe for GPT3 175B
athitten Jan 26, 2026
59ede93
Add new pretrain_configs to the recipes and remove *_new.py recipes
athitten Jan 27, 2026
d3d44b7
Fix lint errors
athitten Jan 28, 2026
68bce75
Add missing _model_config and remove comments
athitten Jan 28, 2026
30b1680
Add dataset comment
athitten Jan 28, 2026
e40cd81
Remove unwanted comments
athitten Jan 28, 2026
5376714
Fix lint error
athitten Jan 28, 2026
84b4026
Fix recipe tests test_gpt3_175b.py,test_kimi_k2.py
athitten Jan 28, 2026
16e074b
Fix more tests
athitten Jan 28, 2026
b48c1a5
Fix nemotron recipe unit tests
athitten Jan 28, 2026
6aea639
Add refactored recipe for nemotron nanov3 and update test file
athitten Jan 28, 2026
4c6be48
Fix functional tests
athitten Jan 28, 2026
63e7311
fix lint check error
athitten Jan 28, 2026
54548b5
Fix functional tests CI errors
athitten Jan 29, 2026
7c9f47d
Fix for test_qat_workflow.py
athitten Jan 29, 2026
ad67f81
Add lint check fix
athitten Jan 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions examples/quantization/pretrain_quantized_llama3_8b.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,17 @@ def main() -> None:
logger.info("------------------------------------------------------------------")

# Load base configuration from the recipe as a Python dataclass
# If --hf-path is provided, pass it to the recipe function
recipe_kwargs = {}
# Pretrain configs use parameterless API
cfg: ConfigContainer = pretrain_config()
logger.info("Loaded base configuration")

# If --hf-path is provided, override the model's HuggingFace path
if args.hf_path:
logger.info(f"Using custom HuggingFace path: {args.hf_path}")
recipe_kwargs["hf_path"] = args.hf_path
# Import AutoBridge to create a new model provider with the custom HF path
from megatron.bridge.models import AutoBridge

cfg: ConfigContainer = pretrain_config(**recipe_kwargs)
logger.info("Loaded base configuration")
cfg.model = AutoBridge.from_hf_pretrained(args.hf_path).to_megatron_provider(load_weights=False)
Comment on lines +159 to +165

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yaoyu-33 won't this be an issue since users now have to re-apply other model configs set as default in the recipe?


# Print configuration on rank 0
if get_rank_safe() == 0:
Expand Down
125 changes: 125 additions & 0 deletions src/megatron/bridge/recipes/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from megatron.core.distributed import DistributedDataParallelConfig

from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
from megatron.bridge.training.config import (
CheckpointConfig,
ConfigContainer,
DistributedInitConfig,
GPTDatasetConfig,
LoggerConfig,
RNGConfig,
TokenizerConfig,
TrainingConfig,
)


def _pretrain_common() -> ConfigContainer:
    """Build the shared pre-training ConfigContainer template that recipes start from.

    The returned container is intentionally incomplete: ``model`` is ``None`` and
    ``tokenizer.tokenizer_model`` is ``None``. The caller MUST assign both
    ``cfg.model`` and ``cfg.tokenizer.tokenizer_model`` before the config is used.

    Output paths are resolved against the current working directory at call time:
    checkpoints are saved to and loaded from
    ``<cwd>/nemo_experiments/default/checkpoints`` and TensorBoard logs go to
    ``<cwd>/nemo_experiments/default/tb_logs``.

    Returns:
        ConfigContainer: Base configuration template for pre-training.
    """
    # Output locations, all rooted at <cwd>/nemo_experiments/default
    run_dir = os.path.join(os.getcwd(), "nemo_experiments", "default")
    ckpt_path = os.path.join(run_dir, "checkpoints")
    tb_path = os.path.join(run_dir, "tb_logs")

    # Optimizer / LR scheduler pair
    optimizer_cfg, lr_scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=500,
        lr_decay_iters=None,  # Defaults to train_iters during validation
        max_lr=3e-4,
        min_lr=3e-5,
    )

    # Training loop settings
    train_cfg = TrainingConfig(
        train_iters=300000,
        eval_interval=500,
        eval_iters=32,
        global_batch_size=32,
        micro_batch_size=2,
        manual_gc=True,
        manual_gc_interval=100,
        manual_gc_eval=100,
    )

    # DDP settings - these are the ones recipes commonly override
    ddp_cfg = DistributedDataParallelConfig(
        check_for_nan_in_grad=True,
        grad_reduce_in_fp32=True,
        overlap_grad_reduce=True,
        overlap_param_gather=True,
        average_in_collective=True,
        data_parallel_sharding_strategy="optim_grads_params",
        use_distributed_optimizer=True,
    )

    # Dataset settings - no blend is given, so mock data is used by default
    dataset_cfg = GPTDatasetConfig(
        random_seed=1234,
        reset_attention_mask=False,
        reset_position_ids=False,
        eod_mask_loss=False,
        seq_length=4096,
        num_dataset_builder_threads=1,
        blend=None,  # Mock data mode
        blend_per_split=None,
        split="9999,8,2",
        data_sharding=True,
        dataloader_type="single",
        skip_getting_attention_mask_from_dataset=True,
    )

    # Logging settings
    logger_cfg = LoggerConfig(
        log_interval=10,
        tensorboard_dir=tb_path,
        log_timers_to_tensorboard=True,
    )

    # Tokenizer placeholder - each recipe must supply tokenizer_model
    tokenizer_cfg = TokenizerConfig(
        tokenizer_type="HuggingFaceTokenizer",
        tokenizer_model=None,
    )

    # Checkpointing - the same directory is used for both saving and loading
    checkpoint_cfg = CheckpointConfig(
        save_interval=500,
        save=ckpt_path,
        load=ckpt_path,
        ckpt_format="torch_dist",
        fully_parallel_save=True,
    )

    rng_cfg = RNGConfig(seed=1234)
    dist_cfg = DistributedInitConfig()

    return ConfigContainer(
        # Model - MUST be set by each recipe before use
        model=None,  # type: ignore[arg-type]
        train=train_cfg,
        optimizer=optimizer_cfg,
        scheduler=lr_scheduler_cfg,
        ddp=ddp_cfg,
        dataset=dataset_cfg,
        logger=logger_cfg,
        tokenizer=tokenizer_cfg,
        checkpoint=checkpoint_cfg,
        rng=rng_cfg,
        dist=dist_cfg,
        comm_overlap=None,
        # Mixed precision - bf16 by default
        mixed_precision="bf16_mixed",
    )
Loading
Loading