diff --git a/examples/resiliency/fault_tolerance/fault_tolerance_example.py b/examples/resiliency/fault_tolerance/fault_tolerance_example.py
index 0e59bf0531..ff348a4ed5 100644
--- a/examples/resiliency/fault_tolerance/fault_tolerance_example.py
+++ b/examples/resiliency/fault_tolerance/fault_tolerance_example.py
@@ -150,7 +150,7 @@ def create_config(
         fp16=False,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         use_distributed_optimizer=True,
         clip_grad=1.0,
         lr=1e-4,
diff --git a/examples/resiliency/straggler_detection/straggler_detection_example.py b/examples/resiliency/straggler_detection/straggler_detection_example.py
index dbab3b04e2..1354753df3 100644
--- a/examples/resiliency/straggler_detection/straggler_detection_example.py
+++ b/examples/resiliency/straggler_detection/straggler_detection_example.py
@@ -118,7 +118,7 @@ def create_config(
         fp16=False,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         use_distributed_optimizer=True,
         clip_grad=1.0,
         lr=1e-4,
diff --git a/src/megatron/bridge/recipes/deepseek/deepseek_v2.py b/src/megatron/bridge/recipes/deepseek/deepseek_v2.py
index 401fdd5d00..2cdb8676c2 100644
--- a/src/megatron/bridge/recipes/deepseek/deepseek_v2.py
+++ b/src/megatron/bridge/recipes/deepseek/deepseek_v2.py
@@ -196,7 +196,7 @@ def _deepseek_common(
         lr_decay_iters=lr_decay_iters,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         weight_decay=0.1,
         max_lr=lr,
         min_lr=min_lr,
diff --git a/src/megatron/bridge/recipes/deepseek/deepseek_v3.py b/src/megatron/bridge/recipes/deepseek/deepseek_v3.py
index 9e1dc67598..c797b0aa9e 100644
--- a/src/megatron/bridge/recipes/deepseek/deepseek_v3.py
+++ b/src/megatron/bridge/recipes/deepseek/deepseek_v3.py
@@ -271,7 +271,7 @@ def _deepseek_v3_common(
         lr_decay_iters=lr_decay_iters,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         weight_decay=0.1,
         max_lr=lr,
         min_lr=min_lr,
diff --git a/src/megatron/bridge/recipes/llama/llama2.py b/src/megatron/bridge/recipes/llama/llama2.py
index 99d1b6c642..8b4529af8a 100644
--- a/src/megatron/bridge/recipes/llama/llama2.py
+++ b/src/megatron/bridge/recipes/llama/llama2.py
@@ -194,7 +194,7 @@ def _llama2_common(
         lr_decay_iters=lr_decay_iters,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         weight_decay=0.1,
         max_lr=lr,
         min_lr=min_lr,
diff --git a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
index 4bfa6d9b64..30609c18b5 100644
--- a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
+++ b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
@@ -277,7 +277,7 @@ def _moonlight_common(
         lr_decay_iters=train_iters,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         weight_decay=0.1,
         max_lr=lr,
         min_lr=min_lr,
@@ -616,7 +616,7 @@ def _moonlight_finetune_common(
         min_lr=min_lr,
         adam_beta1=0.9,
         adam_beta2=0.98,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         weight_decay=0.1,
     )
 
diff --git a/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py b/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
index 5e8a98a7a9..b70a6f09d3 100644
--- a/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
+++ b/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
@@ -496,7 +496,6 @@ def _nemotron_3_nano_finetune_common(
         lr_decay_iters=lr_decay_iters,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-8,
         weight_decay=0.1,
         max_lr=finetune_lr,
         min_lr=min_lr,
diff --git a/src/megatron/bridge/recipes/nemotronh/nemotronh.py b/src/megatron/bridge/recipes/nemotronh/nemotronh.py
index fff3db3965..04d380567c 100644
--- a/src/megatron/bridge/recipes/nemotronh/nemotronh.py
+++ b/src/megatron/bridge/recipes/nemotronh/nemotronh.py
@@ -282,7 +282,7 @@ def _nemotronh_common(
         lr_decay_iters=lr_decay_iters,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         weight_decay=0.1,
         max_lr=lr,
         min_lr=min_lr,
diff --git a/src/megatron/bridge/recipes/utils/optimizer_utils.py b/src/megatron/bridge/recipes/utils/optimizer_utils.py
index 28c685eaff..456b719ba1 100644
--- a/src/megatron/bridge/recipes/utils/optimizer_utils.py
+++ b/src/megatron/bridge/recipes/utils/optimizer_utils.py
@@ -88,7 +88,7 @@ def distributed_fused_adam_with_cosine_annealing(
     lr_decay_iters: Optional[int] = None,
     adam_beta1: float = 0.9,
     adam_beta2: float = 0.95,
-    adam_eps: float = 1e-5,
+    adam_eps: float = 1e-8,
     weight_decay: float = 0.1,
     max_lr: float = 1e-4,
     min_lr: Optional[float] = None,
@@ -156,7 +156,7 @@ def distributed_fused_adam_with_cosine_annealing_samples(
     lr_decay_samples: Optional[int] = None,
     adam_beta1: float = 0.9,
     adam_beta2: float = 0.95,
-    adam_eps: float = 1e-5,
+    adam_eps: float = 1e-8,
     weight_decay: float = 0.1,
     max_lr: float = 1e-4,
     min_lr: Optional[float] = None,
diff --git a/src/megatron/bridge/training/config.py b/src/megatron/bridge/training/config.py
index 408ec20b5d..6b2fbd341f 100644
--- a/src/megatron/bridge/training/config.py
+++ b/src/megatron/bridge/training/config.py
@@ -16,7 +16,7 @@
 import os
 import signal
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field, fields
+from dataclasses import MISSING, dataclass, field, fields
 from pathlib import Path
 from typing import Any, Dict, Literal, Optional, Tuple, Union
 
@@ -30,6 +30,8 @@
 )
 from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
 from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.transformer_config import MLATransformerConfig as MCoreMLATransformerConfig
+from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig
 
 from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
 from megatron.bridge.models import GPTModelProvider, T5ModelProvider
@@ -1658,6 +1660,161 @@ def _calculate_scheduler_steps(self) -> None:
             else:
                 self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_iters * self.train.global_batch_size
 
+    def log_non_default_values(self) -> None:
+        """Log configuration values that differ from Megatron Core defaults.
+
+        For configs that inherit from Megatron Core (e.g., OptimizerConfig, DDPConfig,
+        TransformerConfig), this method logs only the values that differ from the Mcore
+        defaults. This makes it easier to spot unintended deviations from baseline settings.
+
+        For configs that don't inherit from Mcore, key values are logged via
+        `_get_key_config_values`, which excludes None values and callables.
+        """
+        # Determine the correct Mcore parent class for the model config
+        # Some models (e.g., DeepSeek) use MLATransformerConfig instead of TransformerConfig
+        model_mcore_class = _get_mcore_transformer_parent(self.model)
+
+        # Mcore-derived configs: (name, config object, Mcore parent class to compare against)
+        mcore_configs = [
+            ("optimizer", self.optimizer, MCoreOptimizerConfig),
+            ("ddp", self.ddp, MCoreDistributedDataParallelConfig),
+            ("model", self.model, model_mcore_class),
+        ]
+
+        # Configs that do not inherit from Mcore - log their key values
+        non_mcore_configs = [
+            ("train", self.train),
+            ("scheduler", self.scheduler),
+            ("checkpoint", self.checkpoint),
+            ("logger", self.logger),
+            ("tokenizer", self.tokenizer),
+            ("rng", self.rng),
+        ]
+
+        log_lines = [""]
+        log_lines.append("=" * 70)
+        log_lines.append("Configuration Summary (Non-Default Values vs Megatron Core)")
+        log_lines.append("=" * 70)
+
+        # Log non-default values for Mcore configs
+        for config_name, config_obj, mcore_class in mcore_configs:
+            non_defaults = _get_non_default_values(config_obj, mcore_class)
+            if non_defaults:
+                log_lines.append(f"\n[{config_name}] Non-default values (vs Mcore {mcore_class.__name__}):")
+                for field_name, (current_val, default_val) in sorted(non_defaults.items()):
+                    log_lines.append(f"  {field_name}: {current_val!r} (Mcore default: {default_val!r})")
+
+        # Log key values for non-Mcore configs
+        log_lines.append("\n" + "-" * 70)
+        log_lines.append("Other Configuration Values:")
+        log_lines.append("-" * 70)
+
+        for config_name, config_obj in non_mcore_configs:
+            if config_obj is None:
+                continue
+            key_values = _get_key_config_values(config_obj)
+            if key_values:
+                log_lines.append(f"\n[{config_name}]:")
+                for field_name, value in sorted(key_values.items()):
+                    log_lines.append(f"  {field_name}: {value!r}")
+
+        log_lines.append("\n" + "=" * 70)
+
+        print_rank_0("\n".join(log_lines))
+
+
+def _get_mcore_transformer_parent(model_config: Any) -> type:
+    """Determine the correct Mcore TransformerConfig parent class for a model.
+
+    Some models (e.g., DeepSeek v2/v3) inherit from MLATransformerConfig instead of
+    the base TransformerConfig. This function checks the inheritance chain to find
+    the appropriate Mcore class to use as the baseline for comparison.
+
+    Args:
+        model_config: The model configuration object.
+
+    Returns:
+        The appropriate Mcore TransformerConfig class (MCoreMLATransformerConfig or
+        MCoreTransformerConfig).
+    """
+    # Check if the model inherits from MLATransformerConfig
+    if isinstance(model_config, MCoreMLATransformerConfig):
+        return MCoreMLATransformerConfig
+    return MCoreTransformerConfig
+
+
+def _get_non_default_values(config_obj: Any, mcore_class: type) -> Dict[str, Tuple[Any, Any]]:
+    """Get values that differ from Mcore parent class defaults.
+
+    Args:
+        config_obj: The config object to compare.
+        mcore_class: The Megatron Core parent class to compare against.
+
+    Returns:
+        Dictionary mapping field name to (current_value, default_value) for non-default fields.
+    """
+    non_defaults = {}
+
+    # Get default values from the Mcore class
+    mcore_defaults = {}
+    for f in fields(mcore_class):
+        if f.name.startswith("_"):
+            continue
+        if f.default is not MISSING:
+            mcore_defaults[f.name] = f.default
+        elif f.default_factory is not MISSING:
+            mcore_defaults[f.name] = f.default_factory()
+
+    # Compare current values against Mcore defaults
+    for f in fields(config_obj):
+        if f.name.startswith("_"):
+            continue
+        field_name = f.name
+        current_value = getattr(config_obj, field_name, None)
+
+        if field_name in mcore_defaults:
+            default_value = mcore_defaults[field_name]
+            # Skip callable values (like functions)
+            if callable(current_value) or callable(default_value):
+                continue
+            # Compare values
+            try:
+                if current_value != default_value:
+                    non_defaults[field_name] = (current_value, default_value)
+            except (TypeError, ValueError):
+                # Some types may not be directly comparable (e.g., torch.dtype)
+                if str(current_value) != str(default_value):
+                    non_defaults[field_name] = (current_value, default_value)
+
+    return non_defaults
+
+
+def _get_key_config_values(config_obj: Any) -> Dict[str, Any]:
+    """Get key configuration values for non-Mcore configs.
+
+    Args:
+        config_obj: The config object to extract values from.
+
+    Returns:
+        Dictionary mapping field name to value for key fields.
+    """
+    values = {}
+    if not hasattr(config_obj, "__dataclass_fields__"):
+        return values
+
+    for f in fields(config_obj):
+        if f.name.startswith("_"):
+            continue
+        value = getattr(config_obj, f.name, None)
+        # Skip None values and callables
+        if value is None:
+            continue
+        if callable(value):
+            continue
+        values[f.name] = value
+
+    return values
+
+
 def runtime_config_update(cfg: ConfigContainer) -> None:
     """Apply runtime configuration updates prior to initialization.
diff --git a/src/megatron/bridge/training/setup.py b/src/megatron/bridge/training/setup.py
index 42b70c07b4..c2081c29c1 100644
--- a/src/megatron/bridge/training/setup.py
+++ b/src/megatron/bridge/training/setup.py
@@ -110,6 +110,7 @@ def setup(
         SetupOutput containing the populated state, model, optimizer, scheduler, dataloaders, and ckpt context.
     """
     cfg = state.cfg
+    maybe_log_and_save_config(cfg)
 
     # Conditionally enable experimental features for Megatron Core
     set_experimental_flag(cfg.dist.enable_megatron_core_experimental)
@@ -305,7 +306,6 @@ def modelopt_pre_wrap_hook(model):
     # Print setup timing.
     print_rank_0("done with setup ...")
     timers.log(["model-and-optimizer-setup", "train/valid/test-data-iterators-setup"], barrier=True)
-    maybe_log_and_save_config(cfg)
 
     return SetupOutput(
         state,
@@ -484,7 +484,14 @@ def _validate_and_set_vocab_size(model_vocab_size: Optional[int], tokenizer_voca
 
 
 def maybe_log_and_save_config(cfg: ConfigContainer) -> None:
-    """Save configuration to disk and log it on rank 0."""
+    """Save configuration to disk and log non-default values on rank 0.
+
+    Instead of printing the full config YAML, only the values that differ
+    from Megatron Core defaults are logged, making it easier to spot
+    unintended configuration deviations.
+
+    The full config can still be saved to a file via logger.save_config_filepath.
+    """
     if get_rank_safe() != 0:
         return
 
@@ -495,6 +502,4 @@
 
     except Exception as e:
         print_rank_0(f"Error saving config to file {cfg.logger.save_config_filepath}: {e}")
 
-    print("------- Task Configuration -------")
-    cfg.print_yaml()
-    print("----------------------------------")
+    cfg.log_non_default_values()
diff --git a/tests/functional_tests/training/test_callbacks.py b/tests/functional_tests/training/test_callbacks.py
index 221ed2e221..1c2e255abf 100644
--- a/tests/functional_tests/training/test_callbacks.py
+++ b/tests/functional_tests/training/test_callbacks.py
@@ -182,7 +182,7 @@ def test_callbacks(self):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
diff --git a/tests/functional_tests/training/test_decentralized_pg.py b/tests/functional_tests/training/test_decentralized_pg.py
index a65b791626..7b222483d7 100644
--- a/tests/functional_tests/training/test_decentralized_pg.py
+++ b/tests/functional_tests/training/test_decentralized_pg.py
@@ -132,7 +132,7 @@ def test_pretrain_with_decentralized_pg(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
@@ -257,7 +257,7 @@ def test_pretrain_with_decentralized_pg_disabled(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
@@ -389,7 +389,7 @@ def test_pretrain_with_decentralized_pg_and_pp(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
@@ -521,7 +521,7 @@ def test_pretrain_with_decentralized_pg_and_cp(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
@@ -653,7 +653,7 @@ def test_pretrain_with_decentralized_pg_combined_parallelism(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
@@ -785,7 +785,7 @@ def test_pretrain_with_decentralized_pg_and_tp(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
diff --git a/tests/functional_tests/training/test_finetune_dora.py b/tests/functional_tests/training/test_finetune_dora.py
index fd2c41302c..8ab317604c 100644
--- a/tests/functional_tests/training/test_finetune_dora.py
+++ b/tests/functional_tests/training/test_finetune_dora.py
@@ -126,7 +126,7 @@ def _create_optimizer_config(self, lr=3e-3):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=lr,
diff --git a/tests/functional_tests/training/test_finetune_lora.py b/tests/functional_tests/training/test_finetune_lora.py
index 13b9cba4ce..13a18b27f5 100644
--- a/tests/functional_tests/training/test_finetune_lora.py
+++ b/tests/functional_tests/training/test_finetune_lora.py
@@ -244,7 +244,7 @@ def _create_optimizer_config(self, lr=3e-3):
             optimizer="adam",
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=lr,
diff --git a/tests/functional_tests/training/test_inprocess_restart.py b/tests/functional_tests/training/test_inprocess_restart.py
index 711a669a37..bd5561e185 100644
--- a/tests/functional_tests/training/test_inprocess_restart.py
+++ b/tests/functional_tests/training/test_inprocess_restart.py
@@ -104,7 +104,7 @@ def build_test_config(
         fp16=False,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         use_distributed_optimizer=True,
         clip_grad=1.0,
         lr=3e-3,
diff --git a/tests/functional_tests/training/test_nvrx_straggler.py b/tests/functional_tests/training/test_nvrx_straggler.py
index 0ed6db1760..478346cd27 100644
--- a/tests/functional_tests/training/test_nvrx_straggler.py
+++ b/tests/functional_tests/training/test_nvrx_straggler.py
@@ -92,7 +92,7 @@ def create_functional_test_config(enable_nvrx: bool = True) -> ConfigContainer:
         fp16=False,
         adam_beta1=0.9,
         adam_beta2=0.95,
-        adam_eps=1e-5,
+        adam_eps=1e-8,
         use_distributed_optimizer=True,
         clip_grad=1.0,
         lr=3e-3,
diff --git a/tests/functional_tests/training/test_pretrain.py b/tests/functional_tests/training/test_pretrain.py
index b458cdaebf..537a969236 100644
--- a/tests/functional_tests/training/test_pretrain.py
+++ b/tests/functional_tests/training/test_pretrain.py
@@ -99,7 +99,7 @@ def test_pretrain_with_checkpoint(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
@@ -210,7 +210,7 @@ def test_pretrain_vpp(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
diff --git a/tests/functional_tests/training/test_pretrain_resume.py b/tests/functional_tests/training/test_pretrain_resume.py
index c6bde6eebd..962c1bfb7d 100644
--- a/tests/functional_tests/training/test_pretrain_resume.py
+++ b/tests/functional_tests/training/test_pretrain_resume.py
@@ -99,7 +99,7 @@ def test_pretrain_save_load(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
@@ -181,7 +181,7 @@
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
diff --git a/tests/functional_tests/training/test_sft.py b/tests/functional_tests/training/test_sft.py
index 2e17fbf9b7..7212f3c23c 100644
--- a/tests/functional_tests/training/test_sft.py
+++ b/tests/functional_tests/training/test_sft.py
@@ -127,7 +127,7 @@ def _create_config(
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=lr,
diff --git a/tests/functional_tests/training/test_tensor_inspect.py b/tests/functional_tests/training/test_tensor_inspect.py
index 02e2121856..6c70f2ab5a 100644
--- a/tests/functional_tests/training/test_tensor_inspect.py
+++ b/tests/functional_tests/training/test_tensor_inspect.py
@@ -119,7 +119,7 @@ def test_pretrain_with_bf16_tensor_stats(self, tmp_path):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
             lr=3e-3,
diff --git a/tests/unit_tests/recipes/test_run_plugins.py b/tests/unit_tests/recipes/test_run_plugins.py
index 5d19286019..049a038966 100644
--- a/tests/unit_tests/recipes/test_run_plugins.py
+++ b/tests/unit_tests/recipes/test_run_plugins.py
@@ -110,7 +110,7 @@ def create_test_config(**kwargs):
             fp16=False,
             adam_beta1=0.9,
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             use_distributed_optimizer=True,
             clip_grad=1.0,
         ),
diff --git a/tests/unit_tests/recipes/utils/test_optimizer_utils.py b/tests/unit_tests/recipes/utils/test_optimizer_utils.py
index 05c3e1117d..02540c95e6 100644
--- a/tests/unit_tests/recipes/utils/test_optimizer_utils.py
+++ b/tests/unit_tests/recipes/utils/test_optimizer_utils.py
@@ -31,7 +31,7 @@ def test_optimizer_config(self):
 
         optim_cfg, _ = distributed_fused_adam_with_cosine_annealing(
             adam_beta2=0.98,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             weight_decay=0.01,
             max_lr=3e-4,
             min_lr=3e-5,
@@ -61,7 +61,7 @@ def test_sample_based_optimizer_config(self):
         optim_cfg, _ = distributed_fused_adam_with_cosine_annealing_samples(
             precision="bf16-mixed",
             adam_beta2=0.95,
-            adam_eps=1e-5,
+            adam_eps=1e-8,
             weight_decay=0.1,
             max_lr=1e-4,
             min_lr=1e-5,
diff --git a/tests/unit_tests/training/test_log_non_default_values.py b/tests/unit_tests/training/test_log_non_default_values.py
new file mode 100644
index 0000000000..f2649e289d
--- /dev/null
+++ b/tests/unit_tests/training/test_log_non_default_values.py
@@ -0,0 +1,352 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the log_non_default_values functionality in config.py."""
+
+from unittest.mock import patch
+
+from megatron.core.distributed.distributed_data_parallel_config import (
+    DistributedDataParallelConfig as MCoreDistributedDataParallelConfig,
+)
+from megatron.core.optimizer.optimizer_config import OptimizerConfig as MCoreOptimizerConfig
+from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig
+
+from megatron.bridge.models.deepseek.deepseek_provider import DeepSeekModelProvider
+from megatron.bridge.models.gpt_provider import GPTModelProvider
+from megatron.bridge.training.config import (
+    CheckpointConfig,
+    ConfigContainer,
+    DistributedDataParallelConfig,
+    GPTDatasetConfig,
+    LoggerConfig,
+    OptimizerConfig,
+    RNGConfig,
+    SchedulerConfig,
+    TokenizerConfig,
+    TrainingConfig,
+    _get_key_config_values,
+    _get_mcore_transformer_parent,
+    _get_non_default_values,
+)
+
+
+class TestGetMcoreTransformerParent:
+    """Tests for _get_mcore_transformer_parent function."""
+
+    def test_gpt_provider_returns_transformer_config(self):
+        """GPTModelProvider should return MCoreTransformerConfig as parent."""
+        config = GPTModelProvider(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            seq_length=512,
+        )
+        parent = _get_mcore_transformer_parent(config)
+        assert parent is MCoreTransformerConfig
+
+    def test_deepseek_provider_returns_mla_transformer_config(self):
+        """DeepSeekModelProvider should return MCoreMLATransformerConfig as parent."""
+        from megatron.core.transformer.transformer_config import (
+            MLATransformerConfig as MCoreMLATransformerConfig,
+        )
+
+        config = DeepSeekModelProvider(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            seq_length=512,
+        )
+        parent = _get_mcore_transformer_parent(config)
+        assert parent is MCoreMLATransformerConfig
+
+
+class TestGetNonDefaultValues:
+    """Tests for _get_non_default_values function."""
+
+    def test_detects_non_default_optimizer_values(self):
+        """Should detect values that differ from Mcore OptimizerConfig defaults."""
+        # Create optimizer with non-default values
+        optimizer = OptimizerConfig(
+            lr=0.001,
+            adam_beta2=0.95,  # Mcore default is 0.999
+            weight_decay=0.1,  # Mcore default is 0.01
+        )
+
+        non_defaults = _get_non_default_values(optimizer, MCoreOptimizerConfig)
+
+        assert "adam_beta2" in non_defaults
+        assert non_defaults["adam_beta2"] == (0.95, 0.999)
+
+        assert "weight_decay" in non_defaults
+        assert non_defaults["weight_decay"] == (0.1, 0.01)
+
+    def test_does_not_include_matching_defaults(self):
+        """Should not include values that match Mcore defaults."""
+        # Create optimizer with default adam_eps value
+        optimizer = OptimizerConfig(
+            lr=0.001,
+            adam_eps=1e-8,  # Matches Mcore default
+        )
+
+        non_defaults = _get_non_default_values(optimizer, MCoreOptimizerConfig)
+
+        # adam_eps should not be in non_defaults since it matches
+        assert "adam_eps" not in non_defaults
+
+    def test_detects_non_default_ddp_values(self):
+        """Should detect non-default values in DDP config."""
+        ddp = DistributedDataParallelConfig(
+            use_distributed_optimizer=True,  # Mcore default is False
+            overlap_grad_reduce=True,  # Mcore default is False
+        )
+
+        non_defaults = _get_non_default_values(ddp, MCoreDistributedDataParallelConfig)
+
+        assert "use_distributed_optimizer" in non_defaults
+        assert non_defaults["use_distributed_optimizer"] == (True, False)
+
+        assert "overlap_grad_reduce" in non_defaults
+        assert non_defaults["overlap_grad_reduce"] == (True, False)
+
+    def test_handles_model_config(self):
+        """Should detect non-default values in model config."""
+        model = GPTModelProvider(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            seq_length=512,
+            add_bias_linear=False,  # Mcore default is True
+            hidden_dropout=0.0,  # Mcore default is 0.1
+        )
+
+        non_defaults = _get_non_default_values(model, MCoreTransformerConfig)
+
+        assert "add_bias_linear" in non_defaults
+        assert non_defaults["add_bias_linear"] == (False, True)
+
+        assert "hidden_dropout" in non_defaults
+        assert non_defaults["hidden_dropout"] == (0.0, 0.1)
+
+    def test_skips_private_fields(self):
+        """Should skip fields that start with underscore."""
+        optimizer = OptimizerConfig(lr=0.001)
+
+        non_defaults = _get_non_default_values(optimizer, MCoreOptimizerConfig)
+
+        # No private fields should be in the result
+        for field_name in non_defaults:
+            assert not field_name.startswith("_")
+
+
+class TestGetKeyConfigValues:
+    """Tests for _get_key_config_values function."""
+
+    def test_extracts_training_config_values(self):
+        """Should extract key values from TrainingConfig."""
+        train = TrainingConfig(
+            global_batch_size=128,
+            train_iters=1000,
+            micro_batch_size=4,
+        )
+
+        values = _get_key_config_values(train)
+
+        assert values["global_batch_size"] == 128
+        assert values["train_iters"] == 1000
+        assert values["micro_batch_size"] == 4
+
+    def test_skips_none_values(self):
+        """Should skip fields that are None."""
+        train = TrainingConfig(
+            global_batch_size=128,
+            train_iters=1000,
+        )
+
+        values = _get_key_config_values(train)
+
+        # Check that no None values are included
+        for value in values.values():
+            assert value is not None
+
+    def test_skips_private_fields(self):
+        """Should skip fields that start with underscore."""
+        scheduler = SchedulerConfig(lr_decay_style="cosine")
+
+        values = _get_key_config_values(scheduler)
+
+        for field_name in values:
+            assert not field_name.startswith("_")
+
+    def test_extracts_checkpoint_config_values(self):
+        """Should extract key values from CheckpointConfig."""
+        checkpoint = CheckpointConfig(
+            ckpt_format="torch_dist",
+            save_interval=500,
+            async_save=True,
+        )
+
+        values = _get_key_config_values(checkpoint)
+
+        assert values["ckpt_format"] == "torch_dist"
+        assert values["save_interval"] == 500
+        assert values["async_save"] is True
+
+
+class TestLogNonDefaultValues:
+    """Tests for ConfigContainer.log_non_default_values method."""
+
+    def _create_minimal_config_container(self, model_provider=None) -> ConfigContainer:
+        """Create a minimal ConfigContainer for testing."""
+        if model_provider is None:
+            model_provider = GPTModelProvider(
+                num_layers=2,
+                hidden_size=128,
+                num_attention_heads=4,
+                seq_length=512,
+                add_bias_linear=False,  # Non-default
+            )
+
+        return ConfigContainer(
+            model=model_provider,
+            optimizer=OptimizerConfig(
+                lr=0.001,
+                adam_beta2=0.95,  # Non-default (Mcore default is 0.999)
+            ),
+            scheduler=SchedulerConfig(lr_decay_style="cosine"),
+            train=TrainingConfig(
+                global_batch_size=64,
+                train_iters=500,
+            ),
+            ddp=DistributedDataParallelConfig(
+                overlap_grad_reduce=True,  # Non-default
+            ),
+            checkpoint=CheckpointConfig(ckpt_format="torch_dist"),
+            logger=LoggerConfig(),
+            tokenizer=TokenizerConfig(),
+            rng=RNGConfig(),
+            dataset=GPTDatasetConfig(
+                random_seed=1234,
+                seq_length=512,
+                reset_position_ids=False,
+                reset_attention_mask=False,
+                eod_mask_loss=False,
+            ),
+        )
+
+    @patch("megatron.bridge.training.config.print_rank_0")
+    def test_logs_non_default_values(self, mock_print_rank_0):
+        """Should log non-default values compared to Mcore."""
+        cfg = self._create_minimal_config_container()
+
+        cfg.log_non_default_values()
+
+        # Verify print_rank_0 was called
+        mock_print_rank_0.assert_called_once()
+
+        # Get the logged output
+        log_output = mock_print_rank_0.call_args[0][0]
+
+        # Verify header is present
+        assert "Configuration Summary (Non-Default Values vs Megatron Core)" in log_output
+
+        # Verify optimizer non-defaults are logged
+        assert "[optimizer] Non-default values" in log_output
+        assert "adam_beta2: 0.95" in log_output
+        assert "Mcore default: 0.999" in log_output
+
+        # Verify ddp non-defaults are logged
+        assert "[ddp] Non-default values" in log_output
+        assert "overlap_grad_reduce: True" in log_output
+        assert "Mcore default: False" in log_output
+
+        # Verify model non-defaults are logged
+        assert "[model] Non-default values" in log_output
+        assert "add_bias_linear: False" in log_output
+        assert "Mcore default: True" in log_output
+
+    @patch("megatron.bridge.training.config.print_rank_0")
+    def test_logs_other_config_values(self, mock_print_rank_0):
+        """Should log key values from non-Mcore configs."""
+        cfg = self._create_minimal_config_container()
+
+        cfg.log_non_default_values()
+
+        log_output = mock_print_rank_0.call_args[0][0]
+
+        # Verify other configs section is present
+        assert "Other Configuration Values:" in log_output
+
+        # Verify train config values
+        assert "[train]:" in log_output
+        assert "global_batch_size: 64" in log_output
+        assert "train_iters: 500" in log_output
+
+        # Verify scheduler config values
+        assert "[scheduler]:" in log_output
+
+    @patch("megatron.bridge.training.config.print_rank_0")
+    def test_handles_deepseek_model_correctly(self, mock_print_rank_0):
+        """Should use MLATransformerConfig for DeepSeek models."""
+        deepseek_model = DeepSeekModelProvider(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            seq_length=512,
+        )
+
+        cfg = self._create_minimal_config_container(model_provider=deepseek_model)
+
+        cfg.log_non_default_values()
+
+        log_output = mock_print_rank_0.call_args[0][0]
+
+        # Should use MLATransformerConfig for comparison
+        assert "MLATransformerConfig" in log_output
+
+    @patch("megatron.bridge.training.config.print_rank_0")
+    def test_adam_eps_not_logged_when_default(self, mock_print_rank_0):
+        """adam_eps should not appear in logs when set to Mcore default (1e-8)."""
+        cfg = ConfigContainer(
+            model=GPTModelProvider(
+                num_layers=2,
+                hidden_size=128,
+                num_attention_heads=4,
+                seq_length=512,
+            ),
+            optimizer=OptimizerConfig(
+                lr=0.001,
+                adam_eps=1e-8,  # Mcore default
+            ),
+            scheduler=SchedulerConfig(lr_decay_style="cosine"),
+            train=TrainingConfig(global_batch_size=64, train_iters=500),
+            ddp=DistributedDataParallelConfig(),
+            checkpoint=CheckpointConfig(ckpt_format="torch_dist"),
+            logger=LoggerConfig(),
+            tokenizer=TokenizerConfig(),
+            rng=RNGConfig(),
+            dataset=GPTDatasetConfig(
+                random_seed=1234,
+                seq_length=512,
+                reset_position_ids=False,
+                reset_attention_mask=False,
+                eod_mask_loss=False,
+            ),
+        )
+
+        cfg.log_non_default_values()
+
+        log_output = mock_print_rank_0.call_args[0][0]
+
+        # adam_eps should NOT be in the log since it matches Mcore default
+        assert "adam_eps:" not in log_output
diff --git a/tests/unit_tests/training/test_setup.py b/tests/unit_tests/training/test_setup.py
index bd7952c03b..e3612062b5 100644
--- a/tests/unit_tests/training/test_setup.py
+++ b/tests/unit_tests/training/test_setup.py
@@ -63,35 +63,33 @@ class TestMaybeLogAndSaveConfig:
     """Tests for maybe_log_and_save_config."""
 
     @patch("megatron.bridge.training.setup.get_rank_safe", return_value=0)
-    def test_rank_zero_saves_and_logs(self, mock_get_rank, tmp_path, capsys):
+    def test_rank_zero_saves_and_logs(self, mock_get_rank, tmp_path):
         filepath = tmp_path / "config.yaml"
 
         cfg = Mock()
         cfg.logger.save_config_filepath = str(filepath)
         cfg.to_yaml = Mock()
-        cfg.print_yaml = Mock()
+        cfg.log_non_default_values = Mock()
 
         maybe_log_and_save_config(cfg)
 
         cfg.to_yaml.assert_called_once_with(str(filepath))
-        cfg.print_yaml.assert_called_once()
-        captured = capsys.readouterr()
-        assert "------- Task Configuration -------" in captured.out
-        assert "----------------------------------" in captured.out
+        cfg.log_non_default_values.assert_called_once()
 
     @patch("megatron.bridge.training.setup.get_rank_safe", return_value=1)
     def test_non_zero_rank_noop(self, mock_get_rank):
         cfg = Mock()
         cfg.logger.save_config_filepath = "unused"
         cfg.to_yaml = Mock()
-        cfg.print_yaml = Mock()
+        cfg.log_non_default_values = Mock()
 
         maybe_log_and_save_config(cfg)
 
         cfg.to_yaml.assert_not_called()
-        cfg.print_yaml.assert_not_called()
+        cfg.log_non_default_values.assert_not_called()
 
     @patch("megatron.bridge.training.setup.get_rank_safe", return_value=0)
-    def test_save_failure_is_logged(self, mock_get_rank, capsys):
+    @patch("megatron.bridge.training.setup.print_rank_0")
+    def test_save_failure_is_logged(self, mock_print, mock_get_rank):
         cfg = Mock()
         cfg.logger.save_config_filepath = "path"
 
@@ -99,10 +97,11 @@ def raise_io_error(_):
             raise IOError("boom")
 
         cfg.to_yaml.side_effect = raise_io_error
-        cfg.print_yaml = Mock()
+        cfg.log_non_default_values = Mock()
 
         maybe_log_and_save_config(cfg)
 
-        captured = capsys.readouterr()
-        assert "Error saving config" in captured.out
-        cfg.print_yaml.assert_called_once()
+        # Check that error was logged via print_rank_0
+        mock_print.assert_called_once()
+        assert "Error saving config" in mock_print.call_args[0][0]
+        cfg.log_non_default_values.assert_called_once()
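
Usage sketch (illustrative, not part of the patch): the comparison helper added in config.py can be exercised directly, mirroring the unit tests above. This is a minimal sketch that uses only names introduced in the diff; the printed dictionary is an assumed example, not verbatim output.

    from megatron.core.optimizer.optimizer_config import OptimizerConfig as MCoreOptimizerConfig
    from megatron.bridge.training.config import OptimizerConfig, _get_non_default_values

    # adam_beta2=0.95 deviates from the Mcore default of 0.999, so it is reported;
    # adam_eps=1e-8 now matches the Mcore default and is therefore omitted.
    optimizer = OptimizerConfig(lr=1e-4, adam_beta2=0.95, adam_eps=1e-8)
    print(_get_non_default_values(optimizer, MCoreOptimizerConfig))
    # e.g. {'adam_beta2': (0.95, 0.999), 'lr': (0.0001, None), ...}

The same comparison feeds ConfigContainer.log_non_default_values(), which maybe_log_and_save_config() now calls on rank 0 instead of printing the full YAML.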