Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def create_config(
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=1e-4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def create_config(
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=1e-4,
Expand Down
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/deepseek/deepseek_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def _deepseek_common(
lr_decay_iters=lr_decay_iters,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
weight_decay=0.1,
max_lr=lr,
min_lr=min_lr,
Expand Down
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/deepseek/deepseek_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def _deepseek_v3_common(
lr_decay_iters=lr_decay_iters,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
weight_decay=0.1,
max_lr=lr,
min_lr=min_lr,
Expand Down
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/llama/llama2.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def _llama2_common(
lr_decay_iters=lr_decay_iters,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
weight_decay=0.1,
max_lr=lr,
min_lr=min_lr,
Expand Down
4 changes: 2 additions & 2 deletions src/megatron/bridge/recipes/moonlight/moonlight_16b.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def _moonlight_common(
lr_decay_iters=train_iters,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
weight_decay=0.1,
max_lr=lr,
min_lr=min_lr,
Expand Down Expand Up @@ -616,7 +616,7 @@ def _moonlight_finetune_common(
min_lr=min_lr,
adam_beta1=0.9,
adam_beta2=0.98,
adam_eps=1e-5,
adam_eps=1e-8,
weight_decay=0.1,
)

Expand Down
1 change: 0 additions & 1 deletion src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,6 @@ def _nemotron_3_nano_finetune_common(
lr_decay_iters=lr_decay_iters,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-8,
weight_decay=0.1,
max_lr=finetune_lr,
min_lr=min_lr,
Expand Down
2 changes: 1 addition & 1 deletion src/megatron/bridge/recipes/nemotronh/nemotronh.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ def _nemotronh_common(
lr_decay_iters=lr_decay_iters,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
weight_decay=0.1,
max_lr=lr,
min_lr=min_lr,
Expand Down
4 changes: 2 additions & 2 deletions src/megatron/bridge/recipes/utils/optimizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def distributed_fused_adam_with_cosine_annealing(
lr_decay_iters: Optional[int] = None,
adam_beta1: float = 0.9,
adam_beta2: float = 0.95,
adam_eps: float = 1e-5,
adam_eps: float = 1e-8,
weight_decay: float = 0.1,
max_lr: float = 1e-4,
min_lr: Optional[float] = None,
Expand Down Expand Up @@ -156,7 +156,7 @@ def distributed_fused_adam_with_cosine_annealing_samples(
lr_decay_samples: Optional[int] = None,
adam_beta1: float = 0.9,
adam_beta2: float = 0.95,
adam_eps: float = 1e-5,
adam_eps: float = 1e-8,
weight_decay: float = 0.1,
max_lr: float = 1e-4,
min_lr: Optional[float] = None,
Expand Down
159 changes: 158 additions & 1 deletion src/megatron/bridge/training/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import signal
from abc import ABC, abstractmethod
from dataclasses import dataclass, field, fields
from dataclasses import MISSING, dataclass, field, fields
from pathlib import Path
from typing import Any, Dict, Literal, Optional, Tuple, Union

Expand All @@ -30,6 +30,8 @@
)
from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import MLATransformerConfig as MCoreMLATransformerConfig
from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig

from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
from megatron.bridge.models import GPTModelProvider, T5ModelProvider
Expand Down Expand Up @@ -1658,6 +1660,161 @@ def _calculate_scheduler_steps(self) -> None:
else:
self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_iters * self.train.global_batch_size

def log_non_default_values(self) -> None:
    """Log configuration values that differ from Megatron Core defaults.

    For configs that inherit from Megatron Core (e.g., OptimizerConfig, DDPConfig,
    TransformerConfig), only the values that differ from the Mcore defaults are
    logged, which makes unintended deviations from baseline settings easy to spot.

    For configs that do not inherit from Mcore, key values are logged via
    `_get_key_config_values`, which excludes None values and callables.
    """
    # DeepSeek-style models derive from MLATransformerConfig rather than the
    # base TransformerConfig, so resolve the right baseline class first.
    model_parent = _get_mcore_transformer_parent(self.model)

    # (section name, config object, Mcore baseline class) for Mcore-derived configs.
    mcore_sections = (
        ("optimizer", self.optimizer, MCoreOptimizerConfig),
        ("ddp", self.ddp, MCoreDistributedDataParallelConfig),
        ("model", self.model, model_parent),
    )

    # Configs with no Mcore parent — all key values are shown.
    plain_sections = (
        ("train", self.train),
        ("scheduler", self.scheduler),
        ("checkpoint", self.checkpoint),
        ("logger", self.logger),
        ("tokenizer", self.tokenizer),
        ("rng", self.rng),
    )

    out = [
        "",
        "=" * 70,
        "Configuration Summary (Non-Default Values vs Megatron Core)",
        "=" * 70,
    ]

    # Mcore-derived configs: report only deviations from the baseline class.
    for section, cfg_obj, baseline in mcore_sections:
        diffs = _get_non_default_values(cfg_obj, baseline)
        if not diffs:
            continue
        out.append(f"\n[{section}] Non-default values (vs Mcore {baseline.__name__}):")
        out.extend(
            f"  {field_name}: {current_val!r} (Mcore default: {default_val!r})"
            for field_name, (current_val, default_val) in sorted(diffs.items())
        )

    out.extend(["\n" + "-" * 70, "Other Configuration Values:", "-" * 70])

    # Non-Mcore configs: report every loggable field value.
    for section, cfg_obj in plain_sections:
        if cfg_obj is None:
            continue
        key_values = _get_key_config_values(cfg_obj)
        if not key_values:
            continue
        out.append(f"\n[{section}]:")
        out.extend(f"  {field_name}: {value!r}" for field_name, value in sorted(key_values.items()))

    out.append("\n" + "=" * 70)

    print_rank_0("\n".join(out))


def _get_mcore_transformer_parent(model_config: Any) -> type:
    """Resolve the Mcore TransformerConfig baseline class for a model config.

    Some models (e.g., DeepSeek v2/v3) inherit from MLATransformerConfig instead
    of the base TransformerConfig; the runtime type of ``model_config`` decides
    which Mcore class should serve as the default-comparison baseline.

    Args:
        model_config: The model configuration object.

    Returns:
        MCoreMLATransformerConfig when the model derives from it, otherwise
        MCoreTransformerConfig.
    """
    return (
        MCoreMLATransformerConfig
        if isinstance(model_config, MCoreMLATransformerConfig)
        else MCoreTransformerConfig
    )


def _get_non_default_values(config_obj: Any, mcore_class: type) -> Dict[str, Tuple[Any, Any]]:
"""Get values that differ from Mcore parent class defaults.

Args:
config_obj: The config object to compare.
mcore_class: The Megatron Core parent class to compare against.

Returns:
Dictionary mapping field name to (current_value, default_value) for non-default fields.
"""
non_defaults = {}

# Get default values from Mcore class
mcore_defaults = {}
for f in fields(mcore_class):
if f.name.startswith("_"):
continue
if f.default is not MISSING:
mcore_defaults[f.name] = f.default
elif f.default_factory is not MISSING:
mcore_defaults[f.name] = f.default_factory()

# Compare current values against Mcore defaults
for f in fields(config_obj):
if f.name.startswith("_"):
continue
field_name = f.name
current_value = getattr(config_obj, field_name, None)

if field_name in mcore_defaults:
default_value = mcore_defaults[field_name]
# Skip callable values (like functions) and complex objects
if callable(current_value) or callable(default_value):
continue
# Compare values
try:
if current_value != default_value:
non_defaults[field_name] = (current_value, default_value)
except (TypeError, ValueError):
# Some types may not be directly comparable (e.g., torch.dtype)
if str(current_value) != str(default_value):
non_defaults[field_name] = (current_value, default_value)

return non_defaults


def _get_key_config_values(config_obj: Any) -> Dict[str, Any]:
"""Get key configuration values for non-Mcore configs.

Args:
config_obj: The config object to extract values from.

Returns:
Dictionary mapping field name to value for key fields.
"""
values = {}
if not hasattr(config_obj, "__dataclass_fields__"):
return values

for f in fields(config_obj):
if f.name.startswith("_"):
continue
value = getattr(config_obj, f.name, None)
# Skip None values and complex objects
if value is None:
continue
if callable(value):
continue
values[f.name] = value

return values


def runtime_config_update(cfg: ConfigContainer) -> None:
"""Apply runtime configuration updates prior to initialization.
Expand Down
15 changes: 10 additions & 5 deletions src/megatron/bridge/training/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def setup(
SetupOutput containing the populated state, model, optimizer, scheduler, dataloaders, and ckpt context.
"""
cfg = state.cfg
maybe_log_and_save_config(cfg)

# Conditionally enable experimental features for Megatron Core
set_experimental_flag(cfg.dist.enable_megatron_core_experimental)
Expand Down Expand Up @@ -305,7 +306,6 @@ def modelopt_pre_wrap_hook(model):
# Print setup timing.
print_rank_0("done with setup ...")
timers.log(["model-and-optimizer-setup", "train/valid/test-data-iterators-setup"], barrier=True)
maybe_log_and_save_config(cfg)

return SetupOutput(
state,
Expand Down Expand Up @@ -484,7 +484,14 @@ def _validate_and_set_vocab_size(model_vocab_size: Optional[int], tokenizer_voca


def maybe_log_and_save_config(cfg: ConfigContainer) -> None:
"""Save configuration to disk and log it on rank 0."""
"""Save configuration to disk and log non-default values on rank 0.

Instead of printing the full config YAML, this now logs only the values
that differ from Megatron Core defaults, making it easier to spot
unintended configuration deviations.

The full config can still be saved to a file via logger.save_config_filepath.
"""

if get_rank_safe() != 0:
return
Expand All @@ -495,6 +502,4 @@ def maybe_log_and_save_config(cfg: ConfigContainer) -> None:
except Exception as e:
print_rank_0(f"Error saving config to file {cfg.logger.save_config_filepath}: {e}")

print("------- Task Configuration -------")
cfg.print_yaml()
print("----------------------------------")
cfg.log_non_default_values()
2 changes: 1 addition & 1 deletion tests/functional_tests/training/test_callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def test_callbacks(self):
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=3e-3,
Expand Down
12 changes: 6 additions & 6 deletions tests/functional_tests/training/test_decentralized_pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def test_pretrain_with_decentralized_pg(self, tmp_path):
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=3e-3,
Expand Down Expand Up @@ -257,7 +257,7 @@ def test_pretrain_with_decentralized_pg_disabled(self, tmp_path):
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=3e-3,
Expand Down Expand Up @@ -389,7 +389,7 @@ def test_pretrain_with_decentralized_pg_and_pp(self, tmp_path):
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=3e-3,
Expand Down Expand Up @@ -521,7 +521,7 @@ def test_pretrain_with_decentralized_pg_and_cp(self, tmp_path):
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=3e-3,
Expand Down Expand Up @@ -653,7 +653,7 @@ def test_pretrain_with_decentralized_pg_combined_parallelism(self, tmp_path):
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=3e-3,
Expand Down Expand Up @@ -785,7 +785,7 @@ def test_pretrain_with_decentralized_pg_and_tp(self, tmp_path):
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=3e-3,
Expand Down
2 changes: 1 addition & 1 deletion tests/functional_tests/training/test_finetune_dora.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _create_optimizer_config(self, lr=3e-3):
fp16=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=lr,
Expand Down
2 changes: 1 addition & 1 deletion tests/functional_tests/training/test_finetune_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def _create_optimizer_config(self, lr=3e-3):
optimizer="adam",
adam_beta1=0.9,
adam_beta2=0.95,
adam_eps=1e-5,
adam_eps=1e-8,
use_distributed_optimizer=True,
clip_grad=1.0,
lr=lr,
Expand Down
Loading