diff --git a/examples/quantization/pretrain_quantized_llama3_8b.py b/examples/quantization/pretrain_quantized_llama3_8b.py index 4302786f20..330b6703e9 100644 --- a/examples/quantization/pretrain_quantized_llama3_8b.py +++ b/examples/quantization/pretrain_quantized_llama3_8b.py @@ -152,14 +152,17 @@ def main() -> None: logger.info("------------------------------------------------------------------") # Load base configuration from the recipe as a Python dataclass - # If --hf-path is provided, pass it to the recipe function - recipe_kwargs = {} + # Pretrain configs use parameterless API + cfg: ConfigContainer = pretrain_config() + logger.info("Loaded base configuration") + + # If --hf-path is provided, override the model's HuggingFace path if args.hf_path: logger.info(f"Using custom HuggingFace path: {args.hf_path}") - recipe_kwargs["hf_path"] = args.hf_path + # Import AutoBridge to create a new model provider with the custom HF path + from megatron.bridge.models import AutoBridge - cfg: ConfigContainer = pretrain_config(**recipe_kwargs) - logger.info("Loaded base configuration") + cfg.model = AutoBridge.from_hf_pretrained(args.hf_path).to_megatron_provider(load_weights=False) # Print configuration on rank 0 if get_rank_safe() == 0: diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py index 67db288f25..d30c9b2f14 100644 --- a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py +++ b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py @@ -18,7 +18,12 @@ from utils.precision import get_precision_config from utils.utils import get_workload_base_config -from megatron.bridge.recipes.deepseek.deepseek_v3 import deepseek_v3_pretrain_config as pretrain_config +from megatron.bridge.recipes.deepseek.deepseek_v3 import ( + deepseek_v3_pretrain_config as pretrain_config, +) +from megatron.bridge.recipes.deepseek.deepseek_v3 import ( + set_deepseek_v3_pipeline_model_parallel_layout, +) from megatron.bridge.training.config import ConfigContainer @@ -54,14 +59,19 @@ def deepseek_v3_pretrain_config_gb300( ) precision_config = get_precision_config(precision) - cfg = pretrain_config( - mock=mock, - precision_config=precision_config, - pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size, - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - layout=base_cfg.pp_layout, - ) + cfg = pretrain_config() + cfg.mixed_precision = precision_config + + # Apply model-specific settings that were previously passed as constructor args + cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size + cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + if base_cfg.pp_layout: + cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout + else: + # Recompute layout based on updated PP/VP sizes + set_deepseek_v3_pipeline_model_parallel_layout(cfg.model) + set_deepseek_v3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -89,14 +99,19 @@ def deepseek_v3_pretrain_config_gb200( ) precision_config = get_precision_config(precision) - cfg = pretrain_config( - mock=mock, - precision_config=precision_config, - pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size, - 
moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - layout=base_cfg.pp_layout, - ) + cfg = pretrain_config() + cfg.mixed_precision = precision_config + + # Apply model-specific settings that were previously passed as constructor args + cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size + cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + if base_cfg.pp_layout: + cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout + else: + # Recompute layout based on updated PP/VP sizes + set_deepseek_v3_pipeline_model_parallel_layout(cfg.model) + set_deepseek_v3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -124,14 +139,16 @@ def deepseek_v3_pretrain_config_b300( ) precision_config = get_precision_config(precision) - cfg = pretrain_config( - mock=mock, - precision_config=precision_config, - pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size, - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - layout=None, - ) + cfg = pretrain_config() + cfg.mixed_precision = precision_config + + # Apply model-specific settings that were previously passed as constructor args + cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size + cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + # Recompute layout based on updated PP/VP sizes + set_deepseek_v3_pipeline_model_parallel_layout(cfg.model) + set_deepseek_v3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -154,14 +171,16 @@ def deepseek_v3_pretrain_config_b200( ) precision_config = get_precision_config(precision) - cfg = pretrain_config( - mock=mock, - precision_config=precision_config, - pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size, - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - layout=None, - ) + cfg = pretrain_config() + cfg.mixed_precision = precision_config + + # Apply model-specific settings that were previously passed as constructor args + cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size + cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + # Recompute layout based on updated PP/VP sizes + set_deepseek_v3_pipeline_model_parallel_layout(cfg.model) + set_deepseek_v3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -184,14 +203,15 @@ def deepseek_v3_pretrain_config_h100( ) precision_config = get_precision_config(precision) - cfg = pretrain_config( - mock=mock, - precision_config=precision_config, - pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size, - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - layout="Et|(tt|)*30mL", - ) + cfg = pretrain_config() + cfg.mixed_precision = precision_config + + # Apply model-specific settings that were previously passed as constructor args + cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size + 
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.pipeline_model_parallel_layout = "Et|(tt|)*30mL" + set_deepseek_v3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py b/scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py index ebe4893dab..ce2aaeaea0 100644 --- a/scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py +++ b/scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py @@ -49,10 +49,8 @@ def gpt_oss_120b_pretrain_config_gb300( ) precision_config = get_precision_config(precision) - cfg = gpt_oss_120b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = gpt_oss_120b_pretrain_config() + cfg.mixed_precision = precision_config set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -73,10 +71,8 @@ def gpt_oss_120b_pretrain_config_gb200( ) precision_config = get_precision_config(precision) - cfg = gpt_oss_120b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = gpt_oss_120b_pretrain_config() + cfg.mixed_precision = precision_config set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -97,10 +93,8 @@ def gpt_oss_120b_pretrain_config_b300( ) precision_config = get_precision_config(precision) - cfg = gpt_oss_120b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = gpt_oss_120b_pretrain_config() + cfg.mixed_precision = precision_config set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -121,10 +115,8 @@ def gpt_oss_120b_pretrain_config_b200( ) precision_config = get_precision_config(precision) - cfg = gpt_oss_120b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = gpt_oss_120b_pretrain_config() + cfg.mixed_precision = precision_config set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -145,10 +137,8 @@ def gpt_oss_120b_pretrain_config_h100( ) precision_config = get_precision_config(precision) - cfg = gpt_oss_120b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = gpt_oss_120b_pretrain_config() + cfg.mixed_precision = precision_config set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/llama/llama31_llm_pretrain.py b/scripts/performance/configs/llama/llama31_llm_pretrain.py index fbc85ddc5c..bf682fde38 100644 --- a/scripts/performance/configs/llama/llama31_llm_pretrain.py +++ b/scripts/performance/configs/llama/llama31_llm_pretrain.py @@ -62,7 +62,8 @@ def llama31_405b_pretrain_config_gb300( else: comm_overlap_cfg = userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192 - cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama31_405b_pretrain_config() + cfg.mixed_precision = precision_config set_llama31_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -95,7 +96,8 @@ def llama31_405b_pretrain_config_gb200( else: comm_overlap_cfg = userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192 - cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama31_405b_pretrain_config() + cfg.mixed_precision = precision_config set_llama31_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -129,7 +131,8 @@ def llama31_405b_pretrain_config_b300( else: comm_overlap_cfg = 
userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192 - cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama31_405b_pretrain_config() + cfg.mixed_precision = precision_config set_llama31_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -158,7 +161,8 @@ def llama31_405b_pretrain_config_b200( else: comm_overlap_cfg = userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192 - cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama31_405b_pretrain_config() + cfg.mixed_precision = precision_config set_llama31_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -187,7 +191,8 @@ def llama31_405b_pretrain_config_h100( else: comm_overlap_cfg = userbuffers_fp8_h100_h16384_tp8_cp2_mbs1_seqlen8192 - cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama31_405b_pretrain_config() + cfg.mixed_precision = precision_config set_llama31_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/llama/llama3_llm_pretrain.py b/scripts/performance/configs/llama/llama3_llm_pretrain.py index 84c043ba7c..8e41e2af60 100644 --- a/scripts/performance/configs/llama/llama3_llm_pretrain.py +++ b/scripts/performance/configs/llama/llama3_llm_pretrain.py @@ -66,7 +66,8 @@ def llama3_70b_pretrain_config_gb300( else: comm_overlap_cfg = userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192 - cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_70b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -100,7 +101,8 @@ def llama3_70b_pretrain_config_gb200( else: comm_overlap_cfg = userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192 - cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_70b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -134,7 +136,8 @@ def llama3_70b_pretrain_config_b300( else: comm_overlap_cfg = userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192 - cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_70b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -168,7 +171,8 @@ def llama3_70b_pretrain_config_b200( else: comm_overlap_cfg = userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192 - cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_70b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -202,7 +206,8 @@ def llama3_70b_pretrain_config_h100( else: comm_overlap_cfg = userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192 - cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_70b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -228,7 +233,8 @@ def llama3_8b_pretrain_config_gb300( ) precision_config = get_precision_config(precision) - cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_8b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -252,7 +258,8 @@ def llama3_8b_pretrain_config_gb200( ) precision_config = 
get_precision_config(precision) - cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_8b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -276,7 +283,8 @@ def llama3_8b_pretrain_config_b300( ) precision_config = get_precision_config(precision) - cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_8b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -300,7 +308,8 @@ def llama3_8b_pretrain_config_b200( ) precision_config = get_precision_config(precision) - cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_8b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -324,7 +333,8 @@ def llama3_8b_pretrain_config_h100( ) precision_config = get_precision_config(precision) - cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config) + cfg = llama3_8b_pretrain_config() + cfg.mixed_precision = precision_config set_llama3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/nemotronh/nemotronh_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotronh_llm_pretrain.py index eb925862d0..2f90bb8aa6 100644 --- a/scripts/performance/configs/nemotronh/nemotronh_llm_pretrain.py +++ b/scripts/performance/configs/nemotronh/nemotronh_llm_pretrain.py @@ -46,10 +46,8 @@ def nemotronh_56b_pretrain_config_gb300( ) precision_config = get_precision_config(precision) - cfg = nemotronh_56b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = nemotronh_56b_pretrain_config() + cfg.mixed_precision = precision_config set_nemotronh_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -71,10 +69,8 @@ def nemotronh_56b_pretrain_config_gb200( ) precision_config = get_precision_config(precision) - cfg = nemotronh_56b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = nemotronh_56b_pretrain_config() + cfg.mixed_precision = precision_config set_nemotronh_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -96,10 +92,8 @@ def nemotronh_56b_pretrain_config_b300( ) precision_config = get_precision_config(precision) - cfg = nemotronh_56b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = nemotronh_56b_pretrain_config() + cfg.mixed_precision = precision_config set_nemotronh_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -121,10 +115,8 @@ def nemotronh_56b_pretrain_config_b200( ) precision_config = get_precision_config(precision) - cfg = nemotronh_56b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = nemotronh_56b_pretrain_config() + cfg.mixed_precision = precision_config set_nemotronh_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -146,10 +138,8 @@ def nemotronh_56b_pretrain_config_h100( ) precision_config = get_precision_config(precision) - cfg = nemotronh_56b_pretrain_config( - mock=mock, - precision_config=precision_config, - ) + cfg = nemotronh_56b_pretrain_config() + cfg.mixed_precision = precision_config set_nemotronh_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py index 
32fe4d53dd..1eee066c76 100644 --- a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py +++ b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py @@ -74,12 +74,11 @@ def qwen3_235b_a22b_pretrain_config_gb300( ) precision_config = get_precision_config(precision) - cfg = qwen3_235b_a22b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_235b_a22b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -100,12 +99,11 @@ def qwen3_235b_a22b_pretrain_config_gb200( ) precision_config = get_precision_config(precision) - cfg = qwen3_235b_a22b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_235b_a22b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -126,12 +124,11 @@ def qwen3_235b_a22b_pretrain_config_b300( ) precision_config = get_precision_config(precision) - cfg = qwen3_235b_a22b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_235b_a22b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -152,12 +149,11 @@ def qwen3_235b_a22b_pretrain_config_b200( ) precision_config = get_precision_config(precision) - cfg = qwen3_235b_a22b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_235b_a22b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -178,12 +174,11 @@ def qwen3_235b_a22b_pretrain_config_h100( ) precision_config = get_precision_config(precision) - cfg = qwen3_235b_a22b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=False), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_235b_a22b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -204,12 +199,11 @@ def qwen3_30b_a3b_pretrain_config_gb300( ) precision_config = get_precision_config(precision) - cfg = qwen3_30b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - 
comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_30b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -230,12 +224,11 @@ def qwen3_30b_a3b_pretrain_config_gb200( ) precision_config = get_precision_config(precision) - cfg = qwen3_30b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_30b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -256,12 +249,11 @@ def qwen3_30b_a3b_pretrain_config_b300( ) precision_config = get_precision_config(precision) - cfg = qwen3_30b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_30b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -282,12 +274,11 @@ def qwen3_30b_a3b_pretrain_config_b200( ) precision_config = get_precision_config(precision) - cfg = qwen3_30b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_30b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -308,12 +299,11 @@ def qwen3_30b_a3b_pretrain_config_h100( ) precision_config = get_precision_config(precision) - cfg = qwen3_30b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - ) + cfg = qwen3_30b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -334,11 +324,10 @@ def qwen3_next_80b_a3b_pretrain_config_gb200( ) precision_config = get_precision_config(precision) - cfg = qwen3_next_80b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - ) + cfg = qwen3_next_80b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + set_qwen3_next_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -359,11 +348,10 @@ def qwen3_next_80b_a3b_pretrain_config_b300( ) precision_config = 
get_precision_config(precision) - cfg = qwen3_next_80b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - ) + cfg = qwen3_next_80b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + set_qwen3_next_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -384,11 +372,10 @@ def qwen3_next_80b_a3b_pretrain_config_b200( ) precision_config = get_precision_config(precision) - cfg = qwen3_next_80b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - ) + cfg = qwen3_next_80b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + set_qwen3_next_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -409,11 +396,10 @@ def qwen3_next_80b_a3b_pretrain_config_gb300( ) precision_config = get_precision_config(precision) - cfg = qwen3_next_80b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - ) + cfg = qwen3_next_80b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + set_qwen3_next_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -434,11 +420,10 @@ def qwen3_next_80b_a3b_pretrain_config_h100( ) precision_config = get_precision_config(precision) - cfg = qwen3_next_80b_a3b_pretrain_config( - mock=mock, - precision_config=precision_config, - comm_overlap_config=CommOverlapConfig(tp_comm_overlap=True), - ) + cfg = qwen3_next_80b_a3b_pretrain_config() + cfg.mixed_precision = precision_config + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) + set_qwen3_next_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/utils/utils.py b/scripts/performance/utils/utils.py index 4c65ea1c39..7ab8a8dbbb 100644 --- a/scripts/performance/utils/utils.py +++ b/scripts/performance/utils/utils.py @@ -252,7 +252,19 @@ def get_perf_optimized_recipe( def get_library_recipe(model_family_name: str, model_recipe_name: str, train_task: str, wandb_experiment_name: str): - """Get the library recipe.""" + """Get the library recipe. + + Note: Library pretrain recipes no longer accept kwargs. This function calls the recipe + without arguments and then configures the output directories on the returned config. 
+ + The old API was: recipe_builder(dir="/nemo_run/", name=wandb_experiment_name) + This set: + - run_output_dir = "/nemo_run/{name}" + - checkpoint_dir = "/nemo_run/{name}/checkpoints" + - tensorboard_dir = "/nemo_run/{name}/tb_logs" + """ + import os + family_pkg_path = f"megatron.bridge.recipes.{model_family_name}" family_pkg = importlib.import_module(family_pkg_path) @@ -264,7 +276,27 @@ def get_library_recipe(model_family_name: str, model_recipe_name: str, train_tas model_recipe_name = f"{model_recipe_name}_finetune_config" recipe_builder = getattr(family_pkg, model_recipe_name) - return recipe_builder(dir="/nemo_run/", name=wandb_experiment_name) + + # Library pretrain recipes no longer accept kwargs - call without args + # and configure the returned ConfigContainer + cfg = recipe_builder() + + # Set output directories that were previously configured via dir="/nemo_run/" and name=wandb_experiment_name + base_output_dir = "/nemo_run" + run_output_dir = os.path.join(base_output_dir, wandb_experiment_name) + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + # Checkpoint paths + cfg.checkpoint.save = checkpoint_dir + cfg.checkpoint.load = checkpoint_dir + + # Logger paths + cfg.logger.tensorboard_dir = tensorboard_dir + cfg.logger.wandb_exp_name = wandb_experiment_name + cfg.logger.wandb_save_dir = os.path.join(run_output_dir, "wandb") + + return cfg class _Colors: diff --git a/src/megatron/bridge/recipes/common.py b/src/megatron/bridge/recipes/common.py new file mode 100644 index 0000000000..5322e96915 --- /dev/null +++ b/src/megatron/bridge/recipes/common.py @@ -0,0 +1,125 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from megatron.core.distributed import DistributedDataParallelConfig + +from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.training.config import ( + CheckpointConfig, + ConfigContainer, + DistributedInitConfig, + GPTDatasetConfig, + LoggerConfig, + RNGConfig, + TokenizerConfig, + TrainingConfig, +) + + +def _pretrain_common() -> ConfigContainer: + """Create a base pre-training ConfigContainer with common defaults for any language model. + + This function returns a ConfigContainer template with sensible defaults. + The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model` before use. + + Returns: + ConfigContainer: Base configuration template for pre-training. 
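+ + Example (illustrative sketch only; the HF model id is a placeholder, a real recipe supplies its own provider and tokenizer): + + cfg = _pretrain_common() + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.1-8B").to_megatron_provider(load_weights=False) + cfg.tokenizer.tokenizer_model = "meta-llama/Llama-3.1-8B" +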
+ """ + # Default output directories + base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, "default") + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + # Default optimizer and scheduler + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=500, + lr_decay_iters=None, # Defaults to train_iters during validation + max_lr=3e-4, + min_lr=3e-5, + ) + + cfg = ConfigContainer( + # Model - MUST be set by each recipe before use + model=None, # type: ignore[arg-type] + # Training config + train=TrainingConfig( + train_iters=300000, + eval_interval=500, + eval_iters=32, + global_batch_size=32, + micro_batch_size=2, + manual_gc=True, + manual_gc_interval=100, + manual_gc_eval=100, + ), + # Optimizer and scheduler + optimizer=opt_cfg, + scheduler=scheduler_cfg, + # DDP config - these are the commonly overridden settings + ddp=DistributedDataParallelConfig( + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + data_parallel_sharding_strategy="optim_grads_params", + use_distributed_optimizer=True, + ), + # Dataset config - uses mock data by default + dataset=GPTDatasetConfig( + random_seed=1234, + reset_attention_mask=False, + reset_position_ids=False, + eod_mask_loss=False, + seq_length=4096, + num_dataset_builder_threads=1, + blend=None, # Mock data mode + blend_per_split=None, + split="9999,8,2", + data_sharding=True, + dataloader_type="single", + skip_getting_attention_mask_from_dataset=True, + ), + # Logger config + logger=LoggerConfig( + log_interval=10, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ), + # Tokenizer - placeholder, each recipe should set tokenizer_model + tokenizer=TokenizerConfig( + tokenizer_type="HuggingFaceTokenizer", + tokenizer_model=None, # Must be set by each recipe + ), + # Checkpoint config + checkpoint=CheckpointConfig( + save_interval=500, + save=checkpoint_dir, + load=checkpoint_dir, + ckpt_format="torch_dist", + fully_parallel_save=True, + ), + # RNG config + rng=RNGConfig(seed=1234), + # Distributed init config + dist=DistributedInitConfig(), + comm_overlap=None, + # Mixed precision - bf16 by default + mixed_precision="bf16_mixed", + ) + + return cfg diff --git a/src/megatron/bridge/recipes/deepseek/deepseek_v2.py b/src/megatron/bridge/recipes/deepseek/deepseek_v2.py index 2cdb8676c2..7e042bb3e3 100644 --- a/src/megatron/bridge/recipes/deepseek/deepseek_v2.py +++ b/src/megatron/bridge/recipes/deepseek/deepseek_v2.py @@ -12,260 +12,246 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import List, Optional, Union - import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -class DeepSeekCommonKwargs(TypedDict, total=False): - """Typed options accepted by DeepSeek V2/V2-Lite recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - data_paths: Optional[List[str]] - data_args_path: Optional[str] - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - per_split_data_args_path: Optional[str] - mock: bool - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - expert_model_parallel_size: Optional[int] - sequence_parallel: bool - use_megatron_fsdp: bool - check_for_nan_in_grad: bool - # Recompute configuration - recompute_granularity: Optional[str] - recompute_method: Optional[str] - recompute_num_layers: Optional[int] - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - use_null_tokenizer: bool - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - - -def deepseek_v2_lite_pretrain_config(**user_kwargs: Unpack[DeepSeekCommonKwargs]) -> ConfigContainer: +from megatron.bridge.training.config import ConfigContainer + + +def deepseek_v2_lite_pretrain_config() -> ConfigContainer: """Return a pre-training config for DeepSeek-V2-Lite. - See `_deepseek_common` for the full list of parameters. + Recommended parallelism: TP=1, PP=1, EP=8. 
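+ + Example (illustrative; any field on the returned ConfigContainer can be overridden after construction): + + cfg = deepseek_v2_lite_pretrain_config() + cfg.train.global_batch_size = 256 + cfg.model.expert_model_parallel_size = 4 +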
""" - recommended_kwargs: DeepSeekCommonKwargs = { - "hf_path": "deepseek-ai/DeepSeek-V2-Lite", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 8, - # Match old defaults - "pipeline_dtype": None, - "recompute_granularity": "full", - "recompute_method": "uniform", - "recompute_num_layers": 1, - } - combined_kwargs: DeepSeekCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _deepseek_common(**combined_kwargs) - - -def deepseek_v2_pretrain_config(**user_kwargs: Unpack[DeepSeekCommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("deepseek-ai/DeepSeek-V2-Lite").to_megatron_provider(load_weights=False) + + # Tokenizer - uses NullTokenizer by default (no HF tokenizer download needed) + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Parallelism settings (MoE-specific: includes expert_model_parallel_size) + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None # Custom pipeline layout, None uses default + cfg.model.pipeline_dtype = None # None for PP=1 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.expert_model_parallel_size = 8 # MoE-specific: Expert parallelism + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.rotary_base = float(cfg.model.rotary_base) # Ensure rotary_base is float + + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" # Default from DeepSeekModelProvider + cfg.model.moe_flex_dispatcher_backend = "deepep" # Options: None, deepep, hybridep + cfg.model.moe_hybridep_num_sms = 16 # Number of SMs for hybridep backend + + # Training config (DIFFERENT from _pretrain_common: train_iters, global_batch_size, micro_batch_size, eval_interval) + cfg.train.train_iters = 1_000_000 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # Scheduler config (DIFFERENT from _pretrain_common: lr_warmup_iters=2000 vs 500) + cfg.scheduler.lr_warmup_iters = 2000 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections (includes MoE-specific kernels) + cfg.model.attention_backend = None # None means auto selection + cfg.model.moe_router_fusion = False # MoE-specific: Fuse router computation + cfg.model.moe_permute_fusion = True # MoE-specific: Fuse permute operations (default from DeepSeekModelProvider) + cfg.model.moe_grouped_gemm = ( + True # MoE-specific: Use grouped GEMM for experts (default from DeepSeekModelProvider) + ) + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" # Default from DeepSeekModelProvider + + # Memory saving (recompute & offloading) - ENABLED for V2-Lite + cfg.model.recompute_granularity = "full" + cfg.model.recompute_method = "uniform" + cfg.model.recompute_num_layers = 1 + 
cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + cfg.model.moe_router_padding_for_fp8 = False # Pad router for FP8 alignment, MoE FP8 setting + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) + cfg.comm_overlap.delay_wgrad_compute = False + cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = ( + True # Overlap shared expert computation (default from DeepSeekModelProvider) + ) + + # Checkpoint config (DIFFERENT from _pretrain_common: save_interval=2000 vs 500) + cfg.checkpoint.save_interval = 2000 + cfg.checkpoint.async_save = False + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here. Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False + + return cfg + + +def deepseek_v2_pretrain_config() -> ConfigContainer: """Return a pre-training config for DeepSeek-V2. - See `_deepseek_common` for the full list of parameters. 
- """ - recommended_kwargs: DeepSeekCommonKwargs = { - "hf_path": "deepseek-ai/DeepSeek-V2", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 4, - "expert_model_parallel_size": 32, - "recompute_granularity": "full", - "recompute_method": "uniform", - "recompute_num_layers": 1, - } - combined_kwargs: DeepSeekCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _deepseek_common(**combined_kwargs) - - -def _deepseek_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - expert_model_parallel_size: Optional[int] = None, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - check_for_nan_in_grad: bool = True, - # Recompute configuration - recompute_granularity: Optional[str] = None, - recompute_method: Optional[str] = None, - recompute_num_layers: Optional[int] = None, - # Training hyperparameters - train_iters: int = 1_000_000, - global_batch_size: int = 512, - micro_batch_size: int = 1, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 2000, - save_interval: int = 2000, - use_null_tokenizer: bool = True, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: + Recommended parallelism: TP=1, PP=4, EP=32. """ - Create a pre-training configuration for DeepSeek V2/V2-Lite models using a given HuggingFace path. - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + cfg = _pretrain_common() - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) + # Model config + cfg.model = AutoBridge.from_hf_pretrained("deepseek-ai/DeepSeek-V2").to_megatron_provider(load_weights=False) + + # Tokenizer - uses NullTokenizer by default (no HF tokenizer download needed) + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Parallelism settings (MoE-specific: includes expert_model_parallel_size) + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_layout = None # Custom pipeline layout, None uses default + cfg.model.pipeline_dtype = torch.bfloat16 # Required for PP > 1 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.expert_model_parallel_size = 32 # MoE-specific: Expert parallelism + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.rotary_base = float(cfg.model.rotary_base) # Ensure rotary_base is float - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - if expert_model_parallel_size is not None: - model_cfg.expert_model_parallel_size = expert_model_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.seq_length = seq_length - model_cfg.rotary_base = float(model_cfg.rotary_base) - - model_cfg.recompute_granularity = recompute_granularity - model_cfg.recompute_method = recompute_method - model_cfg.recompute_num_layers = recompute_num_layers - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - adam_beta1=0.9, - adam_beta2=0.95, - adam_eps=1e-8, - weight_decay=0.1, - max_lr=lr, - min_lr=min_lr, + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" # Default from DeepSeekModelProvider + cfg.model.moe_flex_dispatcher_backend = "deepep" # Options: None, deepep, hybridep + cfg.model.moe_hybridep_num_sms = 16 # Number of SMs for hybridep backend + + # Training config (DIFFERENT from _pretrain_common: train_iters, global_batch_size, micro_batch_size, eval_interval) + cfg.train.train_iters = 1_000_000 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # Scheduler config (DIFFERENT from _pretrain_common: lr_warmup_iters=2000 vs 500) + cfg.scheduler.lr_warmup_iters = 2000 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections (includes MoE-specific kernels) + cfg.model.attention_backend = None # None means auto selection + cfg.model.moe_router_fusion = False # MoE-specific: Fuse router computation + cfg.model.moe_permute_fusion = True # MoE-specific: Fuse permute operations (default from DeepSeekModelProvider) + cfg.model.moe_grouped_gemm = ( + True # MoE-specific: Use grouped GEMM for experts (default from DeepSeekModelProvider) ) + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" # Default from DeepSeekModelProvider - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - 
micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=check_for_nan_in_grad, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - async_save=False, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + # Memory saving (recompute & offloading) - ENABLED for V2 + cfg.model.recompute_granularity = "full" + cfg.model.recompute_method = "uniform" + cfg.model.recompute_num_layers = 1 + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + cfg.model.moe_router_padding_for_fp8 = False # Pad router for FP8 alignment, MoE FP8 setting + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) + cfg.comm_overlap.delay_wgrad_compute = False + cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = ( + True # Overlap shared expert computation (default from DeepSeekModelProvider) ) - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=False, - ) + # Checkpoint config (DIFFERENT from _pretrain_common: save_interval=2000 vs 500) + cfg.checkpoint.save_interval = 2000 + cfg.checkpoint.async_save = False + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. 
To override them, set them here. Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False return cfg diff --git a/src/megatron/bridge/recipes/deepseek/deepseek_v3.py b/src/megatron/bridge/recipes/deepseek/deepseek_v3.py index c797b0aa9e..c31ecbc0bc 100644 --- a/src/megatron/bridge/recipes/deepseek/deepseek_v3.py +++ b/src/megatron/bridge/recipes/deepseek/deepseek_v3.py @@ -12,28 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from typing import List, Optional, Union import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge from megatron.bridge.models import GPTModelProvider -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) +from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend from megatron.bridge.training.mixed_precision import MixedPrecisionConfig @@ -61,301 +49,286 @@ def set_deepseek_v3_pipeline_model_parallel_layout( model_cfg.pipeline_model_parallel_layout = layout_map[(pp_size, vp_size)] -class DeepSeekV3CommonKwargs(TypedDict, total=False): - """Typed options accepted by DeepSeek V3 recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - data_paths: Optional[List[str]] - data_args_path: Optional[str] - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - per_split_data_args_path: Optional[str] - mock: bool - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - expert_model_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - check_for_nan_in_grad: bool - # Recompute configuration - recompute_granularity: Optional[str] - recompute_modules: Optional[List[str]] - recompute_method: Optional[str] - recompute_num_layers: Optional[int] - # MTP support - mtp_num_layers: Optional[int] - mtp_loss_scaling_factor: Optional[float] - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - use_null_tokenizer: bool - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - 
     moe_flex_dispatcher_backend: str
-    apply_rope_fusion: bool
-    layout: Optional[Union[str, List[List[str]]]]
-
-
-def deepseek_v3_pretrain_config(**user_kwargs: Unpack[DeepSeekV3CommonKwargs]) -> ConfigContainer:
-    """Return a pre-training config for DeepSeek-V3.
-
-    See `_deepseek_v3_common` for the full list of parameters.
+def deepseek_v3_pretrain_config() -> ConfigContainer:
+    """Return a pre-training config for DeepSeek-V3 (671B).
+
+    Recommended parallelism: TP=2, PP=16, EP=64.
     """
-    recommended_kwargs: DeepSeekV3CommonKwargs = {
-        "hf_path": "deepseek-ai/DeepSeek-V3",
-        "tensor_model_parallel_size": 2,
-        "pipeline_model_parallel_size": 16,
-        "expert_model_parallel_size": 64,
-        "pipeline_dtype": torch.bfloat16,
-        # Old recipe-compatible defaults passed via wrapper
-        "recompute_granularity": "selective",
-        "precision_config": MixedPrecisionConfig(
-            bf16=True,
-            params_dtype=torch.bfloat16,
-            pipeline_dtype=torch.bfloat16,
-            autocast_enabled=False,
-            grad_reduce_in_fp32=False,
-        ),
-    }
-    combined_kwargs: DeepSeekV3CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _deepseek_v3_common(**combined_kwargs)
+    cfg = _pretrain_common()
+
+    # Model config
+    cfg.model = AutoBridge.from_hf_pretrained("deepseek-ai/DeepSeek-V3").to_megatron_provider(load_weights=False)

-def deepseek_v3_pretrain_config_32nodes(**user_kwargs: Unpack[DeepSeekV3CommonKwargs]) -> ConfigContainer:
-    """
-    Create a pre-training configuration for DeepSeek-V3 (671B) model with minimal number of nodes (32).
+    # Tokenizer - uses NullTokenizer by default (no HF tokenizer download needed)
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE

-    Returns:
-        ConfigContainer: Configuration for pre-training.
-    """
-    recommended_kwargs: DeepSeekV3CommonKwargs = {
-        "hf_path": "deepseek-ai/DeepSeek-V3",
-        "tensor_model_parallel_size": 2,
-        "pipeline_model_parallel_size": 8,
-        "expert_model_parallel_size": 32,
-        # Maintain old recipe defaults via wrapper overrides
-        "precision_config": MixedPrecisionConfig(
-            bf16=True,
-            params_dtype=torch.bfloat16,
-            pipeline_dtype=torch.bfloat16,
-            autocast_enabled=False,
-            grad_reduce_in_fp32=False,
-        ),
-        "recompute_granularity": "full",
-        "recompute_method": "uniform",
-        "recompute_num_layers": 1,
-    }
-    combined_kwargs: DeepSeekV3CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return deepseek_v3_pretrain_config(**combined_kwargs)
-
-
-def _deepseek_v3_common(
-    hf_path: str,
-    dir: Optional[str] = None,
-    name: str = "default",
-    # Dataset configuration
-    data_paths: Optional[List[str]] = None,
-    data_args_path: Optional[str] = None,
-    train_data_path: Optional[List[str]] = None,
-    valid_data_path: Optional[List[str]] = None,
-    test_data_path: Optional[List[str]] = None,
-    per_split_data_args_path: Optional[str] = None,
-    mock: bool = False,
-    # Model configuration
-    tensor_model_parallel_size: int = 2,
-    pipeline_model_parallel_size: int = 16,
-    pipeline_dtype: Optional[torch.dtype] = torch.bfloat16,
-    virtual_pipeline_model_parallel_size: Optional[int] = None,
-    context_parallel_size: int = 1,
-    expert_model_parallel_size: int = 64,
-    sequence_parallel: bool = True,
-    use_megatron_fsdp: bool = False,
-    check_for_nan_in_grad: bool = True,
-    # Recompute configuration
-    recompute_granularity: Optional[str] = "selective",
-    recompute_modules: Optional[List[str]] = None,
-    recompute_method: Optional[str] = None,
-    recompute_num_layers: Optional[int] = None,
-    # MTP support
-    mtp_num_layers: Optional[int] = 1,
-    mtp_loss_scaling_factor: Optional[float] = 0.1,
-    # Training hyperparameters
-    train_iters: int = 1_000_000,
-    global_batch_size: int = 4096,
-    micro_batch_size: int = 1,
-    seq_length: int = 4096,
-    lr: float = 3e-4,
-    min_lr: float = 3e-5,
-    lr_warmup_iters: int = 2000,
-    lr_decay_iters: Optional[int] = None,
-    eval_interval: int = 2000,
-    save_interval: int = 2000,
-    use_null_tokenizer: bool = True,
-    # Precision recipe
-    precision_config: Optional[Union[MixedPrecisionConfig, str]] = None,
-    comm_overlap_config: Optional[CommOverlapConfig] = None,
-    moe_flex_dispatcher_backend: str = None,
-    apply_rope_fusion: bool = False,
-    layout: Optional[Union[str, List[List[str]]]] = None,
-) -> ConfigContainer:
-    """
-    Create a pre-training configuration for DeepSeek-V3 models using a given HuggingFace path.
-    """
-    base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, name)
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths and weights here if not using mock data. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8

-    blend, blend_per_split, split = get_blend_fields_from_data_paths(
-        data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock
-    )
+    # Parallelism settings (MoE-specific: includes expert_model_parallel_size)
+    cfg.model.tensor_model_parallel_size = 2
+    cfg.model.pipeline_model_parallel_size = 16
+    cfg.model.pipeline_model_parallel_layout = None  # Will be set by set_deepseek_v3_pipeline_model_parallel_layout
+    cfg.model.pipeline_dtype = torch.bfloat16  # Required for PP > 1
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.expert_model_parallel_size = 64  # MoE-specific: Expert parallelism
+    cfg.model.expert_tensor_parallel_size = 1  # MoE-specific: Expert tensor parallelism
+    cfg.model.sequence_parallel = True
+    cfg.model.seq_length = 4096

-    bridge = AutoBridge.from_hf_pretrained(hf_path)
-    model_cfg = bridge.to_megatron_provider(load_weights=False)
-    model_cfg.tensor_model_parallel_size = tensor_model_parallel_size
-    model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size
-    model_cfg.pipeline_dtype = pipeline_dtype
-    model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size
-    model_cfg.context_parallel_size = context_parallel_size
-    model_cfg.expert_model_parallel_size = expert_model_parallel_size
-    model_cfg.sequence_parallel = sequence_parallel
-    model_cfg.seq_length = seq_length
-
-    model_cfg.expert_tensor_parallel_size = 1
-    # MTP configuration (allow None to disable by setting to 0)
-    model_cfg.mtp_num_layers = 0 if mtp_num_layers is None else mtp_num_layers
-    model_cfg.mtp_loss_scaling_factor = mtp_loss_scaling_factor
-    model_cfg.init_method_std = 0.006
-    model_cfg.rotary_base = 10000.0
-    model_cfg.rotary_scaling_factor = 40
-    model_cfg.rotary_base = float(model_cfg.rotary_base)
-    model_cfg.rotary_scaling_factor = int(model_cfg.rotary_scaling_factor)
-
-    model_cfg.recompute_granularity = recompute_granularity
-    model_cfg.recompute_modules = recompute_modules
-    model_cfg.recompute_method = recompute_method
-    model_cfg.recompute_num_layers = recompute_num_layers
-
-    set_deepseek_v3_pipeline_model_parallel_layout(model_cfg, layout)
-
-    # Pipeline split for asymmetric stages are specified with map_pp_vp_to_layout below
-    model_cfg.account_for_embedding_in_pipeline_split = False
-    model_cfg.account_for_loss_in_pipeline_split = False
-    model_cfg.num_layers_in_first_pipeline_stage = None
-    model_cfg.num_layers_in_last_pipeline_stage = None
-
-    # Performance optimization knobs
-    model_cfg.moe_permute_fusion = True
-    apply_flex_dispatcher_backend(model_cfg, moe_flex_dispatcher_backend)
-
-    opt_config, scheduler = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=lr_warmup_iters,
-        lr_decay_iters=lr_decay_iters,
-        adam_beta1=0.9,
-        adam_beta2=0.95,
-        adam_eps=1e-8,
-        weight_decay=0.1,
-        max_lr=lr,
-        min_lr=min_lr,
-    )
-    opt_config.use_precision_aware_optimizer = True
-    opt_config.main_params_dtype = torch.float32
-    opt_config.main_grads_dtype = torch.bfloat16
-    opt_config.exp_avg_dtype = torch.bfloat16
-    opt_config.exp_avg_sq_dtype = torch.bfloat16
-
-    if precision_config is None:
-        precision_config = MixedPrecisionConfig(
-            bf16=True,
-            params_dtype=torch.bfloat16,
-            pipeline_dtype=torch.bfloat16,
-            autocast_enabled=False,
-            grad_reduce_in_fp32=False,
-        )
-
-    cfg = ConfigContainer(
-        model=model_cfg,
-        train=TrainingConfig(
-            train_iters=train_iters,
-            eval_interval=eval_interval,
-            eval_iters=32,
-            global_batch_size=global_batch_size,
-            micro_batch_size=micro_batch_size,
-            manual_gc=True,
-            manual_gc_interval=5,
-            manual_gc_eval=5,
-        ),
-        optimizer=opt_config,
-        scheduler=scheduler,
-        ddp=DistributedDataParallelConfig(
-            check_for_nan_in_grad=check_for_nan_in_grad,
-            grad_reduce_in_fp32=False,  # V3 recipe sets this to False
-            overlap_grad_reduce=True,
-            overlap_param_gather=True,
-            average_in_collective=True,
-            use_distributed_optimizer=True,
-            use_megatron_fsdp=use_megatron_fsdp,  # need use_distributed_optimizer=True
-        ),
-        dataset=GPTDatasetConfig(
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            seq_length=seq_length,
-            num_dataset_builder_threads=1,
-            blend=blend,
-            blend_per_split=blend_per_split,
-            split=split,
-            data_sharding=True,
-            dataloader_type="single",
-            num_workers=8,
-            skip_getting_attention_mask_from_dataset=True,
-        ),
-        logger=LoggerConfig(
-            log_interval=10,
-            tensorboard_dir=tensorboard_dir,
-            log_timers_to_tensorboard=True,
-        ),
-        tokenizer=TokenizerConfig(
-            tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer",
-            tokenizer_model=hf_path if not use_null_tokenizer else None,
-            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None,
-        ),
-        checkpoint=CheckpointConfig(
-            save_interval=save_interval,
-            save=checkpoint_dir,
-            load=checkpoint_dir,
-            ckpt_format="torch_dist",
-            fully_parallel_save=True,
-            async_save=False,
-        ),
-        rng=RNGConfig(seed=1234),
-        comm_overlap=comm_overlap_config,
-        mixed_precision=precision_config,
+    # MTP (Multi-Token Prediction) configuration
+    cfg.model.mtp_num_layers = 1  # Set to 0 or None to disable MTP
+    cfg.model.mtp_loss_scaling_factor = 0.1
+
+    # Model-specific settings
+    cfg.model.init_method_std = 0.006
+    cfg.model.rotary_base = 10000.0
+    cfg.model.rotary_scaling_factor = 40
+    cfg.model.rotary_base = float(cfg.model.rotary_base)  # Ensure rotary_base is a float
+    cfg.model.rotary_scaling_factor = int(cfg.model.rotary_scaling_factor)
+
+    # Pipeline split settings (asymmetric stages handled by layout)
+    cfg.model.account_for_embedding_in_pipeline_split = False
+    cfg.model.account_for_loss_in_pipeline_split = False
+    cfg.model.num_layers_in_first_pipeline_stage = None
+    cfg.model.num_layers_in_last_pipeline_stage = None
+
+    # Set pipeline layout
+    set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)
+
+    # MoE Token Dispatcher settings
+    cfg.model.moe_token_dispatcher_type = "alltoall"  # Default from DeepSeekModelProvider
+    apply_flex_dispatcher_backend(cfg.model, None)
+    cfg.model.moe_flex_dispatcher_backend = "deepep"  # Options: None, deepep, hybridep
+    cfg.model.moe_hybridep_num_sms = 16  # Number of SMs for hybridep backend
+
+    # Training config (DIFFERENT from _pretrain_common)
+    cfg.train.train_iters = 1_000_000
+    cfg.train.global_batch_size = 4096
+    cfg.train.micro_batch_size = 1
+    cfg.train.eval_interval = 2000
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 5  # Different from default 100
+    cfg.train.manual_gc_eval = 5
+
+    # Scheduler config (DIFFERENT from _pretrain_common: lr_warmup_iters=2000 vs 500)
+    cfg.scheduler.lr_warmup_iters = 2000
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections (includes MoE-specific kernels)
+    cfg.model.attention_backend = None  # None means auto selection
+    cfg.model.moe_router_fusion = False  # MoE-specific: Fuse router computation
+    cfg.model.moe_permute_fusion = True  # MoE-specific: Fuse permute operations
+    cfg.model.moe_grouped_gemm = True  # MoE-specific: Use grouped GEMM for experts
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "te"  # Default from DeepSeekModelProvider
+
+    # Memory saving (recompute & offloading) - selective recompute for V3
+    cfg.model.recompute_granularity = "selective"
+    cfg.model.recompute_modules = None
+    cfg.model.recompute_method = None
+    cfg.model.recompute_num_layers = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # Mixed precision - DeepSeek V3 uses a custom MixedPrecisionConfig (NOT the "bf16_mixed" string)
+    cfg.mixed_precision = MixedPrecisionConfig(
+        bf16=True,
+        params_dtype=torch.bfloat16,
+        pipeline_dtype=torch.bfloat16,
+        autocast_enabled=False,
+        grad_reduce_in_fp32=False,
     )

-    if apply_rope_fusion:
+    # FP8 settings (commented - enable if using FP8)
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"
+    # cfg.mixed_precision.fp8 = None
+    # cfg.mixed_precision.fp8_param_gather = False
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False
+    cfg.model.moe_router_padding_for_fp8 = False  # Pad router for FP8 alignment
+
+    # Optimizer settings - DeepSeek V3 uses precision-aware optimizer with bf16 moments
+    cfg.optimizer.use_precision_aware_optimizer = True
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.main_grads_dtype = torch.bfloat16  # Different from default float32
+    cfg.optimizer.exp_avg_dtype = torch.bfloat16  # Different from default float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.bfloat16  # Different from default float32
+
+    # Communication overlap
+    cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)
+    cfg.comm_overlap.delay_wgrad_compute = False
+    cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    cfg.model.moe_shared_expert_overlap = True  # Default from DeepSeekModelProvider
+
+    # Checkpoint config (DIFFERENT from _pretrain_common: save_interval=2000 vs 500)
+    cfg.checkpoint.save_interval = 2000
+    cfg.checkpoint.async_save = False
+    # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override:
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config (DIFFERENT: grad_reduce_in_fp32=False)
+    cfg.ddp.overlap_grad_reduce = True
+    cfg.ddp.overlap_param_gather = True
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.grad_reduce_in_fp32 = False  # Different from default True
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+
+    # MoE Force Load Balancing
+    cfg.model.moe_router_force_load_balancing = False
+
+    if cfg.model.apply_rope_fusion:
         cfg.dist.enable_megatron_core_experimental = True  # mla rope fusion is experimental

-    # Ensure comm_overlap exists with old default tp_comm_overlap=False when not provided
-    if cfg.comm_overlap is None:
-        cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)
+    return cfg
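Since the refactored recipes take no arguments, overrides that used to be constructor kwargs become plain field assignments on the returned container. A minimal sketch of the new calling pattern (the override values are illustrative only); if the PP/VP sizes change, the pipeline layout should be re-derived, because the recipe computed it for the defaults:

from megatron.bridge.recipes.deepseek.deepseek_v3 import (
    deepseek_v3_pretrain_config,
    set_deepseek_v3_pipeline_model_parallel_layout,
)

cfg = deepseek_v3_pretrain_config()
cfg.train.global_batch_size = 2048  # previously a recipe kwarg; illustrative value
cfg.model.pipeline_model_parallel_size = 8  # illustrative override
set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)  # recompute layout for the new PP/VP sizes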
+ """ + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("deepseek-ai/DeepSeek-V3").to_megatron_provider(load_weights=False) + + # Tokenizer - uses NullTokenizer by default (no HF tokenizer download needed) + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Parallelism settings (32 nodes configuration) + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.expert_model_parallel_size = 32 # Reduced for 32 nodes + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.sequence_parallel = True + cfg.model.seq_length = 4096 + + # MTP (Multi-Token Prediction) configuration + cfg.model.mtp_num_layers = 1 + cfg.model.mtp_loss_scaling_factor = 0.1 + + # Model-specific settings + cfg.model.init_method_std = 0.006 + cfg.model.rotary_base = 10000.0 + cfg.model.rotary_scaling_factor = 40 + cfg.model.rotary_base = float(cfg.model.rotary_base) + cfg.model.rotary_scaling_factor = int(cfg.model.rotary_scaling_factor) + + # Pipeline split settings + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + cfg.model.num_layers_in_first_pipeline_stage = None + cfg.model.num_layers_in_last_pipeline_stage = None + + # Set pipeline layout + set_deepseek_v3_pipeline_model_parallel_layout(cfg.model) + + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" + apply_flex_dispatcher_backend(cfg.model, None) + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # Training config + cfg.train.train_iters = 1_000_000 + cfg.train.global_batch_size = 4096 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 5 + cfg.train.manual_gc_eval = 5 + + # Scheduler config + cfg.scheduler.lr_warmup_iters = 2000 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving - FULL recompute for 32 nodes (memory efficiency) + cfg.model.recompute_granularity = "full" + cfg.model.recompute_method = "uniform" + cfg.model.recompute_num_layers = 1 + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - DeepSeek V3 uses custom MixedPrecisionConfig + cfg.mixed_precision = MixedPrecisionConfig( + bf16=True, + params_dtype=torch.bfloat16, + pipeline_dtype=torch.bfloat16, + autocast_enabled=False, + grad_reduce_in_fp32=False, + ) + cfg.model.moe_router_padding_for_fp8 = False + + # Optimizer settings - precision-aware optimizer 
with bf16 moments + cfg.optimizer.use_precision_aware_optimizer = True + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.main_grads_dtype = torch.bfloat16 + cfg.optimizer.exp_avg_dtype = torch.bfloat16 + cfg.optimizer.exp_avg_sq_dtype = torch.bfloat16 + + # Communication overlap + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) + cfg.comm_overlap.delay_wgrad_compute = False + cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = True + + # Checkpoint config + cfg.checkpoint.save_interval = 2000 + cfg.checkpoint.async_save = False + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False + + if cfg.model.apply_rope_fusion: + cfg.dist.enable_megatron_core_experimental = True # mla rope fusion is experimental return cfg diff --git a/src/megatron/bridge/recipes/gemma/gemma2.py b/src/megatron/bridge/recipes/gemma/gemma2.py index e51c417934..8aa9bb7988 100644 --- a/src/megatron/bridge/recipes/gemma/gemma2.py +++ b/src/megatron/bridge/recipes/gemma/gemma2.py @@ -22,15 +22,13 @@ from megatron.bridge import AutoBridge from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, - GPTDatasetConfig, LoggerConfig, RNGConfig, TokenizerConfig, @@ -116,167 +114,250 @@ class Gemma2FinetuneKwargs(TypedDict, total=False): # Pretrain Configs -def gemma2_2b_pretrain_config(**user_kwargs: Unpack[Gemma2CommonKwargs]) -> ConfigContainer: +def gemma2_2b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Gemma2 2B. Default parallelism: TP=2, PP=1 """ - recommended_kwargs: Gemma2CommonKwargs = { - "hf_path": "google/gemma-2-2b", - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 1, - } - combined_kwargs: Gemma2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma2_common(**combined_kwargs) + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("google/gemma-2-2b").to_megatron_provider(load_weights=False) + + # Tokenizer - uses HuggingFaceTokenizer + cfg.tokenizer.tokenizer_type = "HuggingFaceTokenizer" + cfg.tokenizer.tokenizer_model = "google/gemma-2-2b" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
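To train on real data rather than the default mock setup, a caller can fill in the dataset blend and swap the NullTokenizer for the model's HF tokenizer, mirroring what the removed `use_null_tokenizer=False` path did. A sketch under assumed, hypothetical data paths (the `split` string is an assumed train/valid/test weighting, not a required value):

cfg = deepseek_v3_pretrain_config_32nodes()
cfg.dataset.blend = (["/data/part1_text_document", "/data/part2_text_document"], [0.3, 0.7])  # hypothetical paths
cfg.dataset.split = "99,1,0"  # assumed train/valid/test split weighting
cfg.tokenizer.tokenizer_type = "HuggingFaceTokenizer"
cfg.tokenizer.tokenizer_model = "deepseek-ai/DeepSeek-V3"
cfg.tokenizer.vocab_size = None  # let the HF tokenizer determine the vocab size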
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 # --num-workers for dataloader + + # Parallelism settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + + # Training config - all match _pretrain_common defaults + # Note: train_iters=300000, global_batch_size=32, micro_batch_size=2, eval_interval=500 are defaults + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None # None means auto selection + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" # Gemma2 uses native + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - uses "bf16_mixed" from _pretrain_common + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Optimizer settings (commented - enable for precision-aware optimizer) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config - matches _pretrain_common defaults (save_interval=500) + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config (Gemma2 doesn't set grad_reduce_in_fp32/average_in_collective, uses megatron-core defaults) + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = False + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg -def gemma2_9b_pretrain_config(**user_kwargs: Unpack[Gemma2CommonKwargs]) -> ConfigContainer: +def gemma2_9b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Gemma2 9B. 
Default parallelism: TP=8, PP=1 """ - recommended_kwargs: Gemma2CommonKwargs = { - "hf_path": "google/gemma-2-9b", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - } - combined_kwargs: Gemma2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma2_common(**combined_kwargs) + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("google/gemma-2-9b").to_megatron_provider(load_weights=False) + + # Tokenizer - uses HuggingFaceTokenizer + cfg.tokenizer.tokenizer_type = "HuggingFaceTokenizer" + cfg.tokenizer.tokenizer_model = "google/gemma-2-9b" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Parallelism settings + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 # Required for larger models + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" # Gemma2 uses native + + # Memory saving + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - uses "bf16_mixed" from _pretrain_common + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Optimizer settings (commented - enable for precision-aware optimizer) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = False + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg -def gemma2_27b_pretrain_config(**user_kwargs: Unpack[Gemma2CommonKwargs]) -> ConfigContainer: +def gemma2_27b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Gemma2 27B. 
Default parallelism: TP=8, PP=2 """ - recommended_kwargs: Gemma2CommonKwargs = { - "hf_path": "google/gemma-2-27b", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 2, - "pipeline_dtype": torch.bfloat16, - } - combined_kwargs: Gemma2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _gemma2_common(**combined_kwargs) - - -def _gemma2_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - use_null_tokenizer: bool = False, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """Create a pre-training configuration for Gemma2 models.""" - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("google/gemma-2-27b").to_megatron_provider(load_weights=False) + + # Tokenizer - uses HuggingFaceTokenizer + cfg.tokenizer.tokenizer_type = "HuggingFaceTokenizer" + cfg.tokenizer.tokenizer_model = "google/gemma-2-27b" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Parallelism settings + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 # Required for PP > 1 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" # Gemma2 uses native + + # Memory saving + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - uses "bf16_mixed" from _pretrain_common + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + # Optimizer settings (commented - enable for precision-aware optimizer) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - 
tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) + # DDP config + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = False + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" return cfg diff --git a/src/megatron/bridge/recipes/gemma/gemma3.py b/src/megatron/bridge/recipes/gemma/gemma3.py index 224a429fcc..89adc6fb42 100644 --- a/src/megatron/bridge/recipes/gemma/gemma3.py +++ b/src/megatron/bridge/recipes/gemma/gemma3.py @@ -20,7 +20,7 @@ from megatron.bridge import AutoBridge from megatron.bridge.models.gemma.gemma3_provider import Gemma3ModelProvider1B from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE @@ -29,7 +29,6 @@ CheckpointConfig, ConfigContainer, DistributedDataParallelConfig, - GPTDatasetConfig, LoggerConfig, RNGConfig, TokenizerConfig, @@ -126,196 +125,95 @@ class Gemma3FinetuneKwargs(TypedDict, total=False): # Gemma3 models -def gemma3_1b_pretrain_config(**user_kwargs: Unpack[Gemma3CommonKwargs]) -> ConfigContainer: +def gemma3_1b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Gemma3 1B. - See `_gemma3_common` for the full list of parameters. 
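Callers migrating from the removed `_gemma2_common` keyword arguments can set the equivalent fields directly on the returned container. A hedged sketch of the mapping (all values illustrative):

cfg = gemma2_9b_pretrain_config()
cfg.ddp.use_megatron_fsdp = True  # was use_megatron_fsdp=True; needs use_distributed_optimizer=True (already set)
cfg.optimizer.lr = 1e-4           # was lr=...
cfg.optimizer.min_lr = 1e-5       # was min_lr=...
cfg.model.seq_length = 8192       # was seq_length=...
cfg.dataset.seq_length = 8192     # keep the dataset seq_length in sync with the model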
diff --git a/src/megatron/bridge/recipes/gemma/gemma3.py b/src/megatron/bridge/recipes/gemma/gemma3.py
index 224a429fcc..89adc6fb42 100644
--- a/src/megatron/bridge/recipes/gemma/gemma3.py
+++ b/src/megatron/bridge/recipes/gemma/gemma3.py
@@ -20,7 +20,7 @@
 from megatron.bridge import AutoBridge
 from megatron.bridge.models.gemma.gemma3_provider import Gemma3ModelProvider1B
 from megatron.bridge.peft.base import PEFT
-from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths
+from megatron.bridge.recipes.common import _pretrain_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
@@ -29,7 +29,6 @@
     CheckpointConfig,
     ConfigContainer,
     DistributedDataParallelConfig,
-    GPTDatasetConfig,
     LoggerConfig,
     RNGConfig,
     TokenizerConfig,
@@ -126,196 +125,95 @@ class Gemma3FinetuneKwargs(TypedDict, total=False):

 # Gemma3 models

-def gemma3_1b_pretrain_config(**user_kwargs: Unpack[Gemma3CommonKwargs]) -> ConfigContainer:
+def gemma3_1b_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Gemma3 1B.

-    See `_gemma3_common` for the full list of parameters.
+    Default parallelism: TP=1, PP=1, seq_length=32K
     """
-    recommended_kwargs: Gemma3CommonKwargs = {
-        "provider_class": Gemma3ModelProvider1B,
-        "hf_path": "google/gemma-3-1b-pt",
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 1,
-        "context_parallel_size": 1,
-        "sequence_parallel": False,
-        "seq_length": SEQUENCE_LENGTH_32K,
-    }
-    combined_kwargs: Gemma3CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _gemma3_common(**combined_kwargs)
-
-
-def _gemma3_common(
-    provider_class: type,
-    hf_path: str | None = None,
-    dir: str | None = None,
-    name: str = "default",
-    # Dataset configuration
-    data_paths: list[str] | None = None,
-    data_args_path: str | None = None,
-    train_data_path: list[str] | None = None,
-    valid_data_path: list[str] | None = None,
-    test_data_path: list[str] | None = None,
-    per_split_data_args_path: str | None = None,
-    mock: bool = False,
-    # Model configuration
-    tensor_model_parallel_size: int = 1,
-    pipeline_model_parallel_size: int = 1,
-    pipeline_dtype: torch.dtype | None = None,
-    virtual_pipeline_model_parallel_size: int | None = None,
-    context_parallel_size: int = 1,
-    sequence_parallel: bool = False,
-    use_megatron_fsdp: bool = False,
-    account_for_embedding_in_pipeline_split: bool = False,
-    account_for_loss_in_pipeline_split: bool = False,
-    # Training hyperparameters
-    train_iters: int = 1168251,
-    global_batch_size: int = 512,
-    micro_batch_size: int = 1,
-    seq_length: int = 131072,
-    lr: float = 3e-4,
-    min_lr: float = 3e-5,
-    lr_warmup_iters: int = 2000,
-    lr_decay_iters: int | None = None,
-    eval_interval: int = 2000,
-    save_interval: int = 500,
-    use_null_tokenizer: bool = True,
-    # Precision recipe
-    precision_config: MixedPrecisionConfig | str | None = "bf16_mixed",
-    comm_overlap_config: CommOverlapConfig | None = None,
-) -> ConfigContainer:
-    """
-    Create a pre-training configuration for Gemma3 family models.
-
-    Args:
-        provider_class (type): Gemma3 model provider class (e.g., Gemma3ModelProvider1B).
-        hf_path (str | None): HuggingFace model path (e.g., "google/gemma-3-1b-pt").
-        dir (str | None): Base directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        data_paths (list[str] | None): List of paths to dataset files. If None, mock data will be used.
-        data_args_path (str | None): Path to file containing data arguments.
-        train_data_path (list[str] | None): List of training data paths.
-        valid_data_path (list[str] | None): List of validation data paths.
-        test_data_path (list[str] | None): List of test data paths.
-        per_split_data_args_path (str | None): Path to JSON file with per-split data configuration.
-        mock (bool): Whether to use mock data. If True, ignores data_paths.
-        tensor_model_parallel_size (int): Degree of tensor model parallelism.
-        pipeline_model_parallel_size (int): Degree of pipeline model parallelism.
-        pipeline_dtype (torch.dtype | None): Data type for pipeline parallelism.
-        virtual_pipeline_model_parallel_size (int | None): Size of virtual pipeline parallelism.
-        context_parallel_size (int): Degree of context parallelism.
-        sequence_parallel (bool): Whether to use sequence parallelism.
-        use_megatron_fsdp (bool): Whether to use Megatron FSDP.
-        account_for_embedding_in_pipeline_split (bool): Whether to account for embedding in pipeline split.
-        account_for_loss_in_pipeline_split (bool): Whether to account for loss in pipeline split.
-        train_iters (int): Total number of training iterations.
-        global_batch_size (int): Global batch size for training.
-        micro_batch_size (int): Micro batch size for training.
-        seq_length (int): Sequence length for training data.
-        lr (float): Learning rate.
-        min_lr (float): Minimum learning rate for cosine decay.
-        lr_warmup_iters (int): Number of warmup iterations for the learning rate.
-        lr_decay_iters (int | None): Number of iterations over which to decay the LR.
-        eval_interval (int): Evaluation interval.
-        save_interval (int): Checkpoint save interval.
-        use_null_tokenizer (bool): Whether to use null tokenizer for synthetic data.
-        precision_config (MixedPrecisionConfig | str | None): Precision configuration for the model.
-        comm_overlap_config (CommOverlapConfig | None): Communication overlap configuration.
-
-    Returns:
-        ConfigContainer: Configuration for pre-training.
-    """
-    base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, name)
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    blend, blend_per_split, split = get_blend_fields_from_data_paths(
-        data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock
-    )
-
-    # Instantiate the model provider
-    model_cfg = provider_class()
-    model_cfg.tensor_model_parallel_size = tensor_model_parallel_size
-    model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size
-    model_cfg.pipeline_dtype = pipeline_dtype
-    model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size
-    model_cfg.context_parallel_size = context_parallel_size
-    model_cfg.sequence_parallel = sequence_parallel
-    model_cfg.seq_length = seq_length
-
-    # Large model specific pipeline split configurations
-    if account_for_embedding_in_pipeline_split:
-        model_cfg.account_for_embedding_in_pipeline_split = True
-    if account_for_loss_in_pipeline_split:
-        model_cfg.account_for_loss_in_pipeline_split = True
-
-    opt_config, scheduler = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=lr_warmup_iters,
-        lr_decay_iters=lr_decay_iters,
-        max_lr=lr,
-        min_lr=min_lr,
-    )
-
-    # Config Container
-    cfg = ConfigContainer(
-        model=model_cfg,
-        train=TrainingConfig(
-            train_iters=train_iters,
-            eval_interval=eval_interval,
-            eval_iters=32,
-            global_batch_size=global_batch_size,
-            micro_batch_size=micro_batch_size,
-            manual_gc=True,
-            manual_gc_interval=100,
-            manual_gc_eval=100,
-        ),
-        optimizer=opt_config,
-        scheduler=scheduler,
-        ddp=DistributedDataParallelConfig(
-            check_for_nan_in_grad=True,
-            grad_reduce_in_fp32=True,
-            overlap_grad_reduce=True,
-            overlap_param_gather=True,
-            average_in_collective=True,
-            use_distributed_optimizer=True,
-            use_megatron_fsdp=use_megatron_fsdp,
-        ),
-        dataset=GPTDatasetConfig(
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            seq_length=seq_length,
-            num_dataset_builder_threads=1,
-            blend=blend,
-            blend_per_split=blend_per_split,
-            split=split,
-            # Dataloader config parameters
-            data_sharding=True,
-            dataloader_type="single",
-            skip_getting_attention_mask_from_dataset=True,
-        ),
-        logger=LoggerConfig(
-            log_interval=10,
-            tensorboard_dir=tensorboard_dir,
-            log_timers_to_tensorboard=True,
-        ),
-        tokenizer=TokenizerConfig(
-            tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer",
-            tokenizer_model=hf_path if not use_null_tokenizer else None,
-            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None,
-        ),
-        checkpoint=CheckpointConfig(
-            save_interval=save_interval,
-            save=checkpoint_dir,
-            load=checkpoint_dir,
-            ckpt_format="torch_dist",
-            fully_parallel_save=True,
-        ),
-        rng=RNGConfig(seed=1234),
-        comm_overlap=comm_overlap_config,
-        mixed_precision=precision_config,
-    )
+    cfg = _pretrain_common()
+
+    # Model config - uses provider class instead of AutoBridge
+    cfg.model = Gemma3ModelProvider1B()
+
+    # Tokenizer - uses NullTokenizer by default
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths and weights here if not using mock data. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.seq_length = SEQUENCE_LENGTH_32K
+
+    # Parallelism settings
+    cfg.model.tensor_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = False
+    cfg.model.seq_length = SEQUENCE_LENGTH_32K  # 32768
+
+    # Pipeline split settings (for larger models with PP > 1)
+    cfg.model.account_for_embedding_in_pipeline_split = False
+    cfg.model.account_for_loss_in_pipeline_split = False
+
+    # Training config (DIFFERENT from _pretrain_common)
+    cfg.train.train_iters = 1168251
+    cfg.train.global_batch_size = 512
+    cfg.train.micro_batch_size = 1
+    cfg.train.eval_interval = 2000
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    cfg.scheduler.lr_warmup_iters = 2000
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None  # None means auto selection
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"  # Gemma3 uses native
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = None
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # Mixed precision - uses "bf16_mixed" from _pretrain_common
+    # FP8 settings (commented - enable if using FP8)
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"
+    # cfg.mixed_precision.fp8 = None
+    # cfg.mixed_precision.fp8_param_gather = False
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False
+
+    # Optimizer settings (precision-aware optimizer disabled by default)
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # Checkpoint config
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config
+    cfg.ddp.overlap_grad_reduce = True
+    cfg.ddp.overlap_param_gather = True
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.grad_reduce_in_fp32 = True
+    cfg.ddp.average_in_collective = True
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"

     return cfg
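The removed `_gemma3_common` derived checkpoint and TensorBoard paths from its `dir`/`name` arguments; with the parameterless API those locations are ordinary fields. A sketch that reproduces the old directory layout (directory names taken from the deleted code):

import os

cfg = gemma3_1b_pretrain_config()
run_output_dir = os.path.join(os.getcwd(), "nemo_experiments", "default")
cfg.checkpoint.save = os.path.join(run_output_dir, "checkpoints")
cfg.checkpoint.load = cfg.checkpoint.save
cfg.logger.tensorboard_dir = os.path.join(run_output_dir, "tb_logs")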
diff --git a/src/megatron/bridge/recipes/glm/glm45.py b/src/megatron/bridge/recipes/glm/glm45.py
index ad811a93b5..0056eed6a5 100644
--- a/src/megatron/bridge/recipes/glm/glm45.py
+++ b/src/megatron/bridge/recipes/glm/glm45.py
@@ -20,7 +20,7 @@
 from megatron.bridge import AutoBridge
 from megatron.bridge.peft.base import PEFT
-from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths
+from megatron.bridge.recipes.common import _pretrain_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
@@ -137,217 +137,247 @@ class GLM45FinetuneKwargs(TypedDict, total=False):
     wandb_exp_name: Optional[str]

-def glm45_355b_pretrain_config(**user_kwargs: Unpack[GLM45CommonKwargs]) -> ConfigContainer:
-    """Return a pre-training config for GLM 4.5 355B-A32B variant."""
-    recommended: GLM45CommonKwargs = {
-        "hf_path": "zai-org/GLM-4.5",
-        "tensor_model_parallel_size": 2,
-        "pipeline_model_parallel_size": 8,
-        "expert_model_parallel_size": 16,
-        "sequence_parallel": True,
-        "use_null_tokenizer": True,
-        "recompute_granularity": "selective",
-    }
-    kwargs: GLM45CommonKwargs = {**recommended, **user_kwargs}
-    return _glm45_common(**kwargs)
+def glm45_355b_pretrain_config() -> ConfigContainer:
+    """Return a pre-training config for GLM 4.5 355B-A32B variant.

-
-def glm45_air_106b_pretrain_config(**user_kwargs: Unpack[GLM45CommonKwargs]) -> ConfigContainer:
-    """Return a pre-training config for GLM 4.5 Air 106B-A12B variant."""
-    recommended: GLM45CommonKwargs = {
-        "hf_path": "zai-org/GLM-4.5-Air",
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 4,
-        "expert_model_parallel_size": 8,
-        "sequence_parallel": True,
-        "use_null_tokenizer": True,
-        "recompute_granularity": "selective",
-    }
-    kwargs: GLM45CommonKwargs = {**recommended, **user_kwargs}
-    return _glm45_common(**kwargs)
-
-
-def _glm45_common(
-    hf_path: str,
-    dir: Optional[str] = None,
-    name: str = "default",
-    # Dataset configuration
-    data_paths: Optional[List[str]] = None,
-    data_args_path: Optional[str] = None,
-    train_data_path: Optional[List[str]] = None,
-    valid_data_path: Optional[List[str]] = None,
-    test_data_path: Optional[List[str]] = None,
-    per_split_data_args_path: Optional[str] = None,
-    mock: bool = False,
-    # Dataset override option
-    dataset: Optional[Union[GPTDatasetConfig, FinetuningDatasetConfig, DatasetProvider]] = None,
-    # Model configuration
-    num_layers: int = None,  # for ci testing
-    tensor_model_parallel_size: int = 1,
-    pipeline_model_parallel_size: int = 1,
-    pipeline_dtype: Optional[torch.dtype] = None,
-    virtual_pipeline_model_parallel_size: Optional[int] = None,
-    context_parallel_size: int = 1,
-    expert_model_parallel_size: int = 1,
-    sequence_parallel: bool = False,
-    use_megatron_fsdp: bool = False,
-    account_for_embedding_in_pipeline_split: bool = False,
-    account_for_loss_in_pipeline_split: bool = False,
-    cp_comm_type: Optional[str] = None,
-    # Recompute configuration
-    recompute_granularity: Optional[str] = None,
-    recompute_modules: Optional[List[str]] = None,
-    recompute_method: Optional[str] = None,
-    recompute_num_layers: Optional[int] = None,
-    # MTP support (GLM models use MTP)
-    mtp_num_layers: Optional[int] = 1,
-    mtp_loss_scaling_factor: Optional[float] = 0.3,
-    # Training hyperparameters
-    train_iters: int = 1000000,
-    global_batch_size: int = 2048,
-    micro_batch_size: int = 1,
-    seq_length: int = 4096,
-    lr: float = 1e-4,
-    min_lr: float = 1e-5,
-    lr_warmup_iters: int = 2000,
-    lr_decay_iters: Optional[int] = None,
-    eval_interval: int = 2000,
-    save_interval: int = 500,
-    use_null_tokenizer: bool = True,
-    # Precision recipe
-    precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed",
-    comm_overlap_config: Optional[CommOverlapConfig] = None,
-    # Checkpointing
-    pretrained_checkpoint: Optional[str] = None,
-) -> ConfigContainer:
-    """
-    Create a pre-training configuration for GLM 4.5 family models using a given HuggingFace path.
-    Mirrors the structure used in gpt_oss recipes for consistency.
+    Recommended parallelism: TP=2, PP=8, EP=16
     """
+    cfg = _pretrain_common()
+
+    # Model config
+    cfg.model = AutoBridge.from_hf_pretrained("zai-org/GLM-4.5").to_megatron_provider(load_weights=False)
+
+    # Tokenizer - uses NullTokenizer by default
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths and weights here if not using mock data. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.seq_length = 4096
+    cfg.dataset.num_workers = 8
+
+    # Parallelism settings
+    cfg.model.tensor_model_parallel_size = 2
+    cfg.model.pipeline_model_parallel_size = 8
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.expert_model_parallel_size = 16
+    cfg.model.expert_tensor_parallel_size = 1
+    cfg.model.sequence_parallel = True
+    cfg.model.seq_length = 4096
+
+    # Pipeline split settings
+    cfg.model.account_for_embedding_in_pipeline_split = False
+    cfg.model.account_for_loss_in_pipeline_split = False
+
+    # MTP (Multi-Token Prediction) configuration
+    cfg.model.mtp_num_layers = 1  # Set to 0 or None to disable MTP
+    cfg.model.mtp_loss_scaling_factor = 0.3
+
+    # MoE Token Dispatcher settings
+    cfg.model.moe_token_dispatcher_type = "alltoall"  # Default
+    cfg.model.moe_flex_dispatcher_backend = "deepep"  # Options: None, deepep, hybridep
+    cfg.model.moe_hybridep_num_sms = 16  # Number of SMs for hybridep backend
+
+    # Training config
+    cfg.train.train_iters = 1000000
+    cfg.train.global_batch_size = 2048
+    cfg.train.micro_batch_size = 1
+    cfg.train.eval_interval = 2000
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # Scheduler config
+    cfg.scheduler.lr_warmup_iters = 2000
+
+    # Optimizer config
+    cfg.optimizer.lr = 1e-4
+    cfg.optimizer.min_lr = 1e-5
+    cfg.optimizer.adam_beta2 = 0.95
+    cfg.optimizer.adam_eps = 1e-8
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None  # None means auto selection
+    cfg.model.moe_router_fusion = False  # MoE-specific
+    cfg.model.moe_permute_fusion = True  # MoE-specific: Fuse permute operations
+    cfg.model.moe_grouped_gemm = True  # MoE-specific: Use grouped GEMM
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"  # GLM uses native
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = "selective"
+    cfg.model.recompute_modules = None
+    cfg.model.recompute_method = None
+    cfg.model.recompute_num_layers = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # Mixed precision - uses "bf16_mixed" from _pretrain_common
+    # FP8 settings (commented - enable if using FP8)
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"
+    # cfg.mixed_precision.fp8 = None
+    # cfg.mixed_precision.fp8_param_gather = False
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False
+    cfg.model.moe_router_padding_for_fp8 = False  # Pad router for FP8 alignment
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # Communication overlap (default None, can pass CommOverlapConfig for advanced overlap)
+    # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)  # Uncomment to enable
+    # cfg.comm_overlap.delay_wgrad_compute = False
+    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    cfg.model.moe_shared_expert_overlap = True  # Default from GLM model provider
+
+    # Checkpoint config
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config (matches _pretrain_common)
+    cfg.ddp.overlap_grad_reduce = True
+    cfg.ddp.overlap_param_gather = True
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.grad_reduce_in_fp32 = True
+    cfg.ddp.average_in_collective = True
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+
+    # MoE Force Load Balancing
+    cfg.model.moe_router_force_load_balancing = False

-    base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, name)
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    bridge = AutoBridge.from_hf_pretrained(hf_path)
-    model_cfg = bridge.to_megatron_provider(load_weights=False)
-    if num_layers is not None:
-        model_cfg.num_layers = num_layers
-    model_cfg.tensor_model_parallel_size = tensor_model_parallel_size
-    model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size
-    model_cfg.pipeline_dtype = pipeline_dtype
-    model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size
-    model_cfg.context_parallel_size = context_parallel_size
-    model_cfg.expert_model_parallel_size = expert_model_parallel_size
-    model_cfg.expert_tensor_parallel_size = 1
-    model_cfg.sequence_parallel = sequence_parallel
-    model_cfg.seq_length = seq_length
-
-    if account_for_embedding_in_pipeline_split:
-        model_cfg.account_for_embedding_in_pipeline_split = True
-    if account_for_loss_in_pipeline_split:
-        model_cfg.account_for_loss_in_pipeline_split = True
-    model_cfg.cp_comm_type = cp_comm_type
-
-    # Recompute configuration
-    model_cfg.recompute_granularity = recompute_granularity
-    model_cfg.recompute_modules = recompute_modules
-    model_cfg.recompute_method = recompute_method
-    model_cfg.recompute_num_layers = recompute_num_layers
+    return cfg

-    # MTP configuration (GLM models support MTP)
-    model_cfg.mtp_num_layers = 0 if mtp_num_layers is None else mtp_num_layers
-    model_cfg.mtp_loss_scaling_factor = mtp_loss_scaling_factor

-    # Performance optimization knobs
-    model_cfg.moe_permute_fusion = True
+def glm45_air_106b_pretrain_config() -> ConfigContainer:
+    """Return a pre-training config for GLM 4.5 Air 106B-A12B variant.

-    opt_config, scheduler = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=lr_warmup_iters,
-        lr_decay_iters=lr_decay_iters,
-        max_lr=lr,
-        min_lr=min_lr,
-        adam_beta1=0.9,
-        adam_beta2=0.95,
-        adam_eps=1e-8,
-        weight_decay=0.1,
-    )
-
-    # Build dataset config if not supplied directly
-    if dataset is None:
-        blend, blend_per_split, split = get_blend_fields_from_data_paths(
-            data_paths,
-            data_args_path,
-            train_data_path,
-            valid_data_path,
-            test_data_path,
-            per_split_data_args_path,
-            mock,
-        )
-        dataset_cfg = GPTDatasetConfig(
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            seq_length=seq_length,
-            num_dataset_builder_threads=1,
-            blend=blend,
-            blend_per_split=blend_per_split,
-            split=split,
-            data_sharding=True,
-            dataloader_type="single",
-            skip_getting_attention_mask_from_dataset=True,
-        )
-    else:
-        dataset_cfg = dataset
-
-    cfg = ConfigContainer(
-        model=model_cfg,
-        train=TrainingConfig(
-            train_iters=train_iters,
-            eval_interval=eval_interval,
-            eval_iters=32,
-            global_batch_size=global_batch_size,
-            micro_batch_size=micro_batch_size,
-            manual_gc=True,
-            manual_gc_interval=100,
-            manual_gc_eval=100,
-        ),
-        optimizer=opt_config,
-        scheduler=scheduler,
-        ddp=DistributedDataParallelConfig(
-            check_for_nan_in_grad=True,
-            grad_reduce_in_fp32=True,
-            overlap_grad_reduce=True,
-            overlap_param_gather=True,
-            average_in_collective=True,
-            use_distributed_optimizer=True,
-            use_megatron_fsdp=use_megatron_fsdp,
-        ),
-        dataset=dataset_cfg,
-        logger=LoggerConfig(
-            log_interval=10,
-            tensorboard_dir=tensorboard_dir,
-            log_timers_to_tensorboard=True,
-        ),
-        tokenizer=TokenizerConfig(
-            tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer",
-            tokenizer_model=hf_path if not use_null_tokenizer else None,
-            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None,
-        ),
-        checkpoint=CheckpointConfig(
-            save_interval=save_interval,
-            save=checkpoint_dir,
-            load=checkpoint_dir,
-            pretrained_checkpoint=pretrained_checkpoint,
-            ckpt_format="torch_dist",
-            fully_parallel_save=True,
-        ),
-        rng=RNGConfig(seed=1234),
-        comm_overlap=comm_overlap_config,
-        mixed_precision=precision_config,
-    )
+    Recommended parallelism: TP=1, PP=4, EP=8
+    """
+    cfg = _pretrain_common()
+
+    # Model config
+    cfg.model = AutoBridge.from_hf_pretrained("zai-org/GLM-4.5-Air").to_megatron_provider(load_weights=False)
+
+    # Tokenizer - uses NullTokenizer by default
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths and weights here if not using mock data. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.seq_length = 4096
+    cfg.dataset.num_workers = 8
+
+    # Parallelism settings
+    cfg.model.tensor_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_size = 4
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.expert_model_parallel_size = 8
+    cfg.model.expert_tensor_parallel_size = 1
+    cfg.model.sequence_parallel = True
+    cfg.model.seq_length = 4096
+
+    # Pipeline split settings
+    cfg.model.account_for_embedding_in_pipeline_split = False
+    cfg.model.account_for_loss_in_pipeline_split = False
+
+    # MTP (Multi-Token Prediction) configuration
+    cfg.model.mtp_num_layers = 1
+    cfg.model.mtp_loss_scaling_factor = 0.3
+
+    # MoE Token Dispatcher settings
+    cfg.model.moe_token_dispatcher_type = "alltoall"
+    cfg.model.moe_flex_dispatcher_backend = "deepep"
+    cfg.model.moe_hybridep_num_sms = 16
+
+    # Training config
+    cfg.train.train_iters = 1000000
+    cfg.train.global_batch_size = 2048
+    cfg.train.micro_batch_size = 1
+    cfg.train.eval_interval = 2000
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # Scheduler config
+    cfg.scheduler.lr_warmup_iters = 2000
+
+    # Optimizer config
+    cfg.optimizer.lr = 1e-4
+    cfg.optimizer.min_lr = 1e-5
+    cfg.optimizer.adam_beta2 = 0.95
+    cfg.optimizer.adam_eps = 1e-8
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None
+    cfg.model.moe_router_fusion = False
+    cfg.model.moe_permute_fusion = True
+    cfg.model.moe_grouped_gemm = True
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"  # GLM uses native
+
+    # Memory saving
+    cfg.model.recompute_granularity = "selective"
+    cfg.model.recompute_modules = None
+    cfg.model.recompute_method = None
+    cfg.model.recompute_num_layers = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # Mixed precision
+    cfg.model.moe_router_padding_for_fp8 = False
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # Communication overlap (default None, can pass CommOverlapConfig for advanced overlap)
+    # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)  # Uncomment to enable
+    # cfg.comm_overlap.delay_wgrad_compute = False
+    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    cfg.model.moe_shared_expert_overlap = True
+
+    # DDP config
+    cfg.ddp.overlap_grad_reduce = True
+    cfg.ddp.overlap_param_gather = True
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.grad_reduce_in_fp32 = True
+    cfg.ddp.average_in_collective = True
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+
+    # MoE Force Load Balancing
+    cfg.model.moe_router_force_load_balancing = False

     return cfg
b/src/megatron/bridge/recipes/gpt/gpt3_175b.py @@ -12,222 +12,115 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import List, Optional, Union - import torch from megatron.bridge.models.gpt_provider import GPTProvider175B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig, userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048 -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, get_mixed_precision_config - - -def model_config( - tensor_model_parallel_size: int = 4, - pipeline_model_parallel_size: int = 8, - pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_model_parallel_size: Optional[int] = 6, - context_parallel_size: int = 1, - sequence_parallel: bool = True, -) -> GPTProvider175B: - """ - Configure the GPT3 175B model. - - Args: - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism. - sequence_parallel (bool): Whether to use sequence parallelism. - - Returns: - GPTProvider175B: Configuration for the GPT3 175B model. 
- """ - return GPTProvider175B( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - pipeline_dtype=pipeline_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size, - context_parallel_size=context_parallel_size, - sequence_parallel=sequence_parallel, - ) +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.mixed_precision import get_mixed_precision_config -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 4, - pipeline_model_parallel_size: int = 8, - pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_model_parallel_size: Optional[int] = 6, - context_parallel_size: int = 1, - sequence_parallel: bool = True, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 2048, - micro_batch_size: int = 2, - seq_length: int = 2048, - lr: float = 0.9e-4, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for GPT3 175B model. +def gpt3_175b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for GPT3-175B. The default configuration is expected to run on 64 nodes with 8 GPUs each. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism to be passed to model_config. - sequence_parallel (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. 
- precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. + Default parallelism: TP=4, PP=8, VP=6, SP=True. """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock + cfg = _pretrain_common() + + # Model config - uses GPTProvider175B + cfg.model = GPTProvider175B( + tensor_model_parallel_size=4, + pipeline_model_parallel_size=8, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=6, + context_parallel_size=1, + sequence_parallel=True, ) - model_cfg = model_config( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - pipeline_dtype=pipeline_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size, - context_parallel_size=context_parallel_size, - sequence_parallel=sequence_parallel, + # Parallel settings + cfg.model.pipeline_model_parallel_layout = None + + # Tokenizer - uses NullTokenizer + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 2048 + cfg.dataset.num_workers = 8 + + # Training config + cfg.train.train_iters = 1_168_251 + cfg.train.global_batch_size = 2048 + cfg.train.micro_batch_size = 2 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Optimizer + cfg.scheduler.lr_warmup_iters = 2000 + cfg.optimizer.lr = 0.9e-4 + cfg.optimizer.min_lr = 0.9e-5 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" # GPT uses native + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - bf16_mixed with grad_reduce_in_fp32=False + cfg.mixed_precision = get_mixed_precision_config("bf16_mixed") + cfg.mixed_precision.grad_reduce_in_fp32 = False + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap - enabled with userbuffers config + cfg.comm_overlap = CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=False, # Currently disabled due to issue with async checkpointing ) - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - ) - opt_config.use_precision_aware_optimizer = False - - if isinstance(precision_config, str): - precision_config = get_mixed_precision_config(precision_config) - - precision_config.grad_reduce_in_fp32 = False - - if comm_overlap_config is None: - comm_overlap_config = CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - overlap_param_gather_with_optimizer_step=False, # Currently disabled to an issue with async checkpointing - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - 
random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) + # Checkpoint config + cfg.checkpoint.save_interval = 2000 + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" return cfg diff --git a/src/megatron/bridge/recipes/gpt_oss/gpt_oss.py b/src/megatron/bridge/recipes/gpt_oss/gpt_oss.py index 7afa301d4e..ae986c1eee 100644 --- a/src/megatron/bridge/recipes/gpt_oss/gpt_oss.py +++ b/src/megatron/bridge/recipes/gpt_oss/gpt_oss.py @@ -20,7 +20,7 @@ from megatron.bridge import AutoBridge from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE @@ -129,193 +129,230 @@ class GPTOSSFinetuneKwargs(TypedDict, total=False): wandb_exp_name: Optional[str] -def gpt_oss_20b_pretrain_config(**user_kwargs: Unpack[GPTOSSCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for GPT-OSS 20B variant.""" - recommended: GPTOSSCommonKwargs = { - "hf_path": "openai/gpt-oss-20b", - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 4, - "expert_model_parallel_size": 4, - "sequence_parallel": True, - "use_null_tokenizer": True, - } - kwargs: GPTOSSCommonKwargs = {**recommended, **user_kwargs} - return _gpt_oss_common(**kwargs) - - -def gpt_oss_120b_pretrain_config(**user_kwargs: Unpack[GPTOSSCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for GPT-OSS 120B variant.""" - recommended: GPTOSSCommonKwargs = { - "hf_path": "openai/gpt-oss-120b", - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 4, - "expert_model_parallel_size": 16, - "sequence_parallel": True, - "use_null_tokenizer": True, - } - kwargs: GPTOSSCommonKwargs = {**recommended, **user_kwargs} - return _gpt_oss_common(**kwargs) - +def gpt_oss_20b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for GPT-OSS 20B variant. 
-def _gpt_oss_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Dataset override option - dataset: Optional[Union[GPTDatasetConfig, FinetuningDatasetConfig, DatasetProvider]] = None, - # Model configuration - num_layers: int = None, # for ci testing - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - expert_model_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - account_for_embedding_in_pipeline_split: bool = False, - account_for_loss_in_pipeline_split: bool = False, - cp_comm_type: Optional[str] = None, - # Training hyperparameters - train_iters: int = 1000000, - global_batch_size: int = 512, - micro_batch_size: int = 1, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 2000, - save_interval: int = 500, - use_null_tokenizer: bool = True, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Checkpointing - pretrained_checkpoint: Optional[str] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for GPT-OSS family models using a given HuggingFace path. - Mirrors the structure used in llama recipes for consistency. + Recommended parallelism: TP=2, PP=4, EP=4 """ + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("openai/gpt-oss-20b").to_megatron_provider(load_weights=False) + + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8]) + cfg.dataset.seq_length = 4096 + cfg.dataset.num_workers = 8 + + # Parallelism settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.expert_model_parallel_size = 4 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.sequence_parallel = True + cfg.model.seq_length = 4096 + + # Pipeline split settings + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + + if cfg.model.context_parallel_size > 1: + cfg.model.calculate_per_token_loss = True + cfg.model.cp_comm_type = "a2a" # only a2a cp is supported for sink attention. 
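+ # NOTE: the gating above runs only while this recipe builds the config. If a
+ # caller later raises context_parallel_size on the returned config, the
+ # calculate_per_token_loss and cp_comm_type = "a2a" overrides must be reapplied.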
+ + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" # Default + cfg.model.moe_flex_dispatcher_backend = "deepep" # Options: None, deepep, hybridep + cfg.model.moe_hybridep_num_sms = 16 # Number of SMs for hybridep backend + + # Training config + cfg.train.train_iters = 1000000 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # Scheduler config + cfg.scheduler.lr_warmup_iters = 2000 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - uses "bf16_mixed" from _pretrain_common + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.model.moe_router_padding_for_fp8 = False # Pad router for FP8 alignment + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap (default None, can pass CommOverlapConfig for advanced overlap) + # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) # Uncomment to enable + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = False # GPT-OSS default + + # Checkpoint config + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config (matches _pretrain_common) + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = cfg.model.context_parallel_size == 1 + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - if num_layers is not None: - model_cfg.num_layers = num_layers - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = 
virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.expert_model_parallel_size = expert_model_parallel_size - model_cfg.expert_tensor_parallel_size = 1 - model_cfg.sequence_parallel = sequence_parallel - model_cfg.seq_length = seq_length + return cfg - if account_for_embedding_in_pipeline_split: - model_cfg.account_for_embedding_in_pipeline_split = True - if account_for_loss_in_pipeline_split: - model_cfg.account_for_loss_in_pipeline_split = True - model_cfg.cp_comm_type = cp_comm_type - if context_parallel_size > 1: - model_cfg.calculate_per_token_loss = True - model_cfg.cp_comm_type = "a2a" # only a2a cp is supported for sink attention. - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) +def gpt_oss_120b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for GPT-OSS 120B variant. - # Build dataset config if not supplied directly - if dataset is None: - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, - data_args_path, - train_data_path, - valid_data_path, - test_data_path, - per_split_data_args_path, - mock, - ) - dataset_cfg = GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ) - else: - dataset_cfg = dataset - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=context_parallel_size == 1, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - pretrained_checkpoint=pretrained_checkpoint, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) + Recommended parallelism: TP=2, PP=4, EP=16 + """ + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("openai/gpt-oss-120b").to_megatron_provider(load_weights=False) + + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 4096 + cfg.dataset.num_workers = 8 + + # Parallelism settings (MoE-specific) + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.expert_model_parallel_size = 16 # Larger EP for 120B + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.sequence_parallel = True + cfg.model.seq_length = 4096 + + # Pipeline split settings + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + if cfg.model.context_parallel_size > 1: + cfg.model.calculate_per_token_loss = True + cfg.model.cp_comm_type = "a2a" # only a2a cp is supported for sink attention. + + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # Training config (DIFFERENT from _pretrain_common) + cfg.train.train_iters = 1000000 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # Scheduler config + cfg.scheduler.lr_warmup_iters = 2000 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" # GPT-OSS uses native + + # Memory saving + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision + cfg.model.moe_router_padding_for_fp8 = False + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap (default None, can pass CommOverlapConfig for advanced overlap) + # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) # Uncomment to enable + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = False + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False return cfg diff --git a/src/megatron/bridge/recipes/kimi/kimi_k2.py b/src/megatron/bridge/recipes/kimi/kimi_k2.py index 67982591f7..57daf41416 100644 --- a/src/megatron/bridge/recipes/kimi/kimi_k2.py +++ b/src/megatron/bridge/recipes/kimi/kimi_k2.py @@ -12,161 +12,18 @@ # See the License for the specific language governing permissions and # 
limitations under the License. -import logging -import os - import torch -from megatron.core.distributed import DistributedDataParallelConfig -from typing_extensions import TypedDict, Unpack from megatron.bridge.models.kimi import KimiK2Provider -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import ( - distributed_fused_adam_with_cosine_annealing, - distributed_muon_with_cosine_annealing, -) +from megatron.bridge.recipes.common import _pretrain_common +from megatron.bridge.recipes.utils.optimizer_utils import distributed_muon_with_cosine_annealing from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) +from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.mixed_precision import MixedPrecisionConfig -logger = logging.getLogger(__name__) - - -class KimiK2CommonKwargs(TypedDict, total=False): - """Typed options accepted by Kimi-K2 recipe helper functions.""" - - # Core identifiers - dir: str | None - name: str - # Dataset configuration - data_paths: list[str] | None - data_args_path: str | None - train_data_path: list[str] | None - valid_data_path: list[str] | None - test_data_path: list[str] | None - per_split_data_args_path: str | None - mock: bool - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: torch.dtype | None - virtual_pipeline_model_parallel_size: int | None - context_parallel_size: int - expert_model_parallel_size: int - sequence_parallel: bool - # Recomputation - recompute_granularity: str - recompute_modules: list[str] | None - recompute_method: str | None - recompute_num_layers: int | None - # DeePEP and RoPE - enable_deepep: bool - apply_rope_fusion: bool - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - optimizer_type: str - # Precision / overlap configs - precision_config: MixedPrecisionConfig | str | None - comm_overlap_config: CommOverlapConfig | None - - -def kimi_k2_pretrain_config(**user_kwargs: Unpack[KimiK2CommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Kimi-K2 (1T). - - See `_kimi_k2_common` for the full list of parameters. - """ - recommended_kwargs: KimiK2CommonKwargs = { - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 16, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 32, - "sequence_parallel": True, - } - # Combine defaults with user kwargs; user values take precedence. 
- combined_kwargs: KimiK2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _kimi_k2_common(**combined_kwargs) - - -def _kimi_k2_model_config( - tensor_model_parallel_size: int = 2, - pipeline_model_parallel_size: int = 16, - pipeline_dtype: torch.dtype | None = None, - virtual_pipeline_model_parallel_size: int | None = None, - context_parallel_size: int = 1, - expert_model_parallel_size: int = 32, - sequence_parallel: bool = True, - # Recomputation - recompute_granularity: str = "selective", - recompute_modules: list[str] | None = None, - recompute_method: str | None = None, - recompute_num_layers: int | None = None, - enable_deepep: bool = False, - apply_rope_fusion: bool = False, -) -> KimiK2Provider: - """ - Configure the Kimi-K2 (1T) model. - - Args: - tensor_model_parallel_size: Degree of tensor model parallelism. - pipeline_model_parallel_size: Degree of pipeline model parallelism. - pipeline_dtype: Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size: Size of virtual pipeline parallelism. - context_parallel_size: Degree of context parallelism. - expert_model_parallel_size: Degree of expert model parallelism. - sequence_parallel: Whether to use sequence parallelism. - recompute_granularity: Granularity of recomputation. - recompute_modules: List of modules to recompute. - recompute_method: Method of recomputation. - recompute_num_layers: Number of layers to recompute. - enable_deepep: Whether to use DeePEP. - apply_rope_fusion: Whether to apply RoPE fusion. - - Returns: - KimiK2Provider: Configuration for the Kimi-K2 model. - """ - cfg = KimiK2Provider( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - pipeline_dtype=pipeline_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size, - context_parallel_size=context_parallel_size, - expert_model_parallel_size=expert_model_parallel_size, - sequence_parallel=sequence_parallel, - expert_tensor_parallel_size=1, # Do not use ETP - # Recomputation - recompute_granularity=recompute_granularity, - recompute_modules=recompute_modules, - recompute_method=recompute_method, - recompute_num_layers=recompute_num_layers, - ) - - # Pipeline split for asymmetric stages as used in NeMo recipe - cfg.account_for_embedding_in_pipeline_split = False - cfg.account_for_loss_in_pipeline_split = False - cfg.num_layers_in_first_pipeline_stage = None - cfg.num_layers_in_last_pipeline_stage = None - - # Performance optimization knobs - cfg.moe_permute_fusion = True - if apply_rope_fusion: - cfg.apply_rope_fusion = True - - # Pipeline parallelism configs. We infer PP layout from the provided PP and VP size +def _get_kimi_k2_pipeline_layout(pp_size: int, vp_size: int): + """Get pipeline layout for Kimi-K2 based on PP and VP size.""" map_pp_vp_to_layout = { (1, 1): None, (4, 1): [["embedding"] + ["decoder"] * 16, ["decoder"] * 16, ["decoder"] * 16, ["decoder"] * 13 + ["loss"]], @@ -176,221 +33,154 @@ def _kimi_k2_model_config( (8, 2): [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 14 + [["decoder", "loss"]], (4, 4): [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 14 + [["decoder", "loss"]], } - pp_size = pipeline_model_parallel_size or 1 - vp_size = virtual_pipeline_model_parallel_size or 1 if (pp_size, vp_size) not in map_pp_vp_to_layout: raise ValueError( f"Invalid PP and VP size: {pp_size} and {vp_size} to infer PP layout " f"for Kimi-K2. 
Known PP and VP combinations: {map_pp_vp_to_layout.keys()}" ) - layout = map_pp_vp_to_layout[(pp_size, vp_size)] - if layout is not None: - layout = list([list(x) for x in layout]) # yield all the elements - cfg.pipeline_model_parallel_layout = layout - - if enable_deepep: - cfg.moe_token_dispatcher_type = "flex" - cfg.moe_enable_deepep = True - cfg.moe_shared_expert_overlap = False - - return cfg + layout = map_pp_vp_to_layout[(pp_size, vp_size)] + if layout is not None: + layout = [list(x) for x in layout] + return layout -def _kimi_k2_common( - dir: str | None = None, - name: str = "default", - # Dataset configuration - data_paths: list[str] | None = None, - data_args_path: str | None = None, - train_data_path: list[str] | None = None, - valid_data_path: list[str] | None = None, - test_data_path: list[str] | None = None, - per_split_data_args_path: str | None = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 2, - pipeline_model_parallel_size: int = 16, - pipeline_dtype: torch.dtype | None = torch.bfloat16, - virtual_pipeline_model_parallel_size: int | None = None, - context_parallel_size: int = 1, - expert_model_parallel_size: int = 32, - sequence_parallel: bool = True, - # Recomputation - recompute_granularity: str = "selective", - recompute_modules: list[str] | None = None, - recompute_method: str | None = None, - recompute_num_layers: int | None = None, - enable_deepep: bool = False, - apply_rope_fusion: bool = False, - # Training hyperparameters - train_iters: int = 1_000_000, - global_batch_size: int = 4096, - micro_batch_size: int = 1, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - optimizer_type: str = "muon", - # Precision / overlap configs - precision_config: MixedPrecisionConfig | str | None = None, - comm_overlap_config: CommOverlapConfig | None = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Kimi-K2 (1T) model. - - Args: - dir (str | None): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (list[str] | None): List of paths to dataset files. If None, mock data will be used. - data_args_path (str | None): Path to file containing data arguments. - train_data_path (list[str] | None): List of training data paths. - valid_data_path (list[str] | None): List of validation data paths. - test_data_path (list[str] | None): List of test data paths. - per_split_data_args_path (str | None): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (torch.dtype | None): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (int | None): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism. - expert_model_parallel_size (int): Degree of expert model parallelism. - sequence_parallel (bool): Whether to use sequence parallelism. - recompute_granularity (str): Granularity of recomputation. - recompute_modules (list[str] | None): List of modules to recompute. - recompute_method (str | None): Method of recomputation. - recompute_num_layers (int | None): Number of layers to recompute. - enable_deepep (bool): Whether to use DeePEP. - apply_rope_fusion (bool): Whether to apply RoPE fusion. - train_iters (int): Total number of training iterations. 
- global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - optimizer_type (str): Type of optimizer ("adam" or "muon"). - precision_config (MixedPrecisionConfig | str | None): Precision configuration for the model. - comm_overlap_config (CommOverlapConfig | None): Communication overlap configuration. +def kimi_k2_pretrain_config() -> ConfigContainer: + """Return a pre-training config for Kimi-K2 (1T). - Returns: - ConfigContainer: Configuration for pre-training. + Recommended parallelism: TP=2, PP=16, EP=32 + Uses Muon optimizer by default. """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock + cfg = _pretrain_common() + + # Model config - uses KimiK2Provider instead of AutoBridge + cfg.model = KimiK2Provider( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=16, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + expert_model_parallel_size=32, + sequence_parallel=True, + expert_tensor_parallel_size=1, + recompute_granularity="selective", + recompute_modules=None, + recompute_method=None, + recompute_num_layers=None, ) - model_cfg = _kimi_k2_model_config( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - pipeline_dtype=pipeline_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size, - context_parallel_size=context_parallel_size, - expert_model_parallel_size=expert_model_parallel_size, - sequence_parallel=sequence_parallel, - recompute_granularity=recompute_granularity, - recompute_modules=recompute_modules, - recompute_method=recompute_method, - recompute_num_layers=recompute_num_layers, - enable_deepep=enable_deepep, - apply_rope_fusion=apply_rope_fusion, + # Pipeline split settings (asymmetric stages) + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + cfg.model.num_layers_in_first_pipeline_stage = None + cfg.model.num_layers_in_last_pipeline_stage = None + + # Set pipeline layout + cfg.model.pipeline_model_parallel_layout = _get_kimi_k2_pipeline_layout(16, 1) + + # Tokenizer - uses NullTokenizer with model vocab_size + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = cfg.model.vocab_size + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.sequence_length = 4096 + cfg.dataset.num_workers = 8 + + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # Training config + cfg.train.train_iters = 1_000_000 + cfg.train.global_batch_size = 4096 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 5 + cfg.train.manual_gc_eval = 5 + + # Optimizer + opt_cfg, scheduler_cfg = distributed_muon_with_cosine_annealing( + lr_warmup_iters=2000, + lr_decay_iters=cfg.train.train_iters, + max_lr=3e-4, + min_lr=3e-5, ) - - if optimizer_type == "adam": - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=train_iters, - max_lr=lr, - min_lr=min_lr, - ) - - elif optimizer_type == "muon": - opt_cfg, scheduler_cfg = distributed_muon_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=train_iters, - max_lr=lr, - min_lr=min_lr, - ) - else: - raise ValueError(f"Invalid optimizer type: {optimizer_type}") - - if precision_config is None: - precision_config = MixedPrecisionConfig( - bf16=True, - params_dtype=torch.bfloat16, - pipeline_dtype=torch.bfloat16, - autocast_enabled=False, - grad_reduce_in_fp32=True, - ) - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=5, - manual_gc_eval=5, - ), - optimizer=opt_cfg, - scheduler=scheduler_cfg, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=False, # Muon needs this to be False - average_in_collective=True, - use_distributed_optimizer=False, # Muon needs this to be False - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=model_cfg.vocab_size), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - async_save=False, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) - already set in KimiK2Provider + # cfg.model.recompute_granularity = 
"selective" + # cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - Kimi-K2 uses custom MixedPrecisionConfig (NOT "bf16_mixed" string) + cfg.mixed_precision = MixedPrecisionConfig( + bf16=True, + params_dtype=torch.bfloat16, + pipeline_dtype=torch.bfloat16, + autocast_enabled=False, + grad_reduce_in_fp32=True, ) - - if apply_rope_fusion: - cfg.dist.enable_megatron_core_experimental = True # for mla rope fusion - - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=False, - ) + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.model.moe_router_padding_for_fp8 = False # Pad router for FP8 alignment + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) + cfg.comm_overlap.delay_wgrad_compute = False + cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = True + + # Checkpoint config + cfg.checkpoint.save_interval = 2000 + cfg.checkpoint.async_save = False + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config (DIFFERENT for Muon optimizer) + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = False # Muon needs this to be False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = False # Muon needs this to be False + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + if cfg.model.apply_rope_fusion: + cfg.dist.enable_megatron_core_experimental = True + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False return cfg diff --git a/src/megatron/bridge/recipes/llama/llama2.py b/src/megatron/bridge/recipes/llama/llama2.py index 8b4529af8a..fe657ecd2d 100644 --- a/src/megatron/bridge/recipes/llama/llama2.py +++ b/src/megatron/bridge/recipes/llama/llama2.py @@ -12,258 +12,109 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import List, Optional, Union - import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -class Llama2CommonKwargs(TypedDict, total=False): - """Typed options accepted by Llama2 recipe helper functions.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - data_paths: Optional[List[str]] - data_args_path: Optional[str] - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - per_split_data_args_path: Optional[str] - mock: bool - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - use_null_tokenizer: bool - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] +from megatron.bridge.training.config import ConfigContainer -def llama2_7b_pretrain_config(**user_kwargs: Unpack[Llama2CommonKwargs]) -> ConfigContainer: +def llama2_7b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama-2 7B. - See `_llama2_common` for the full list of parameters. + Recommended parallelism: TP=2, PP=1. """ - recommended_kwargs: Llama2CommonKwargs = { - "hf_path": "meta-llama/Llama-2-7b-hf", - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 1, - "train_iters": 1_168_251, - "global_batch_size": 512, - "micro_batch_size": 1, - "lr_warmup_iters": 2000, - "eval_interval": 2000, - "save_interval": 2000, - } - # Combine defaults with user kwargs; user values take precedence. 
- combined_kwargs: Llama2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama2_common(**combined_kwargs) - - -def _llama2_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 2, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 2000, - save_interval: int = 2000, - use_null_tokenizer: bool = True, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama2 models using a given HuggingFace path. - - Args: - hf_path (str): HuggingFace model path (e.g., "meta-llama/Llama-2-7b-hf"). - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism to be passed to model_config. - sequence_parallel (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. - eval_interval (int): Evaluation interval. - save_interval (int): Save interval. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. 
- - Returns: - ConfigContainer: Configuration for pre-training. - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - adam_beta1=0.9, - adam_beta2=0.95, - adam_eps=1e-8, - weight_decay=0.1, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=False, - ) + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Llama-2-7b-hf").to_megatron_provider(load_weights=False) + + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here 
if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Parallelism settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + + # Training config + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # Scheduler config + cfg.scheduler.lr_warmup_iters = 2000 + + # Logger config + cfg.logger.log_timers_to_tensorboard = False + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + cfg.checkpoint.save_interval = 2000 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. 
To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # Communication overlap + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) return cfg diff --git a/src/megatron/bridge/recipes/llama/llama3.py b/src/megatron/bridge/recipes/llama/llama3.py index 59d6a8631a..9125b30e3d 100644 --- a/src/megatron/bridge/recipes/llama/llama3.py +++ b/src/megatron/bridge/recipes/llama/llama3.py @@ -19,7 +19,7 @@ from megatron.bridge import AutoBridge from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE @@ -32,7 +32,6 @@ CheckpointConfig, ConfigContainer, DistributedDataParallelConfig, - GPTDatasetConfig, LoggerConfig, RNGConfig, TokenizerConfig, @@ -133,113 +132,519 @@ class Llama3FinetuneKwargs(TypedDict, total=False): SEQUENCE_LENGTH_128K: int = 131072 -# Llama3.2 models -def llama32_1b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: +# ============================================================================= +# Llama 3.2 Pretrain Configs +# ============================================================================= + + +def llama32_1b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3.2 1B. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=1, PP=1, CP=1. """ - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Llama-3.2-1B", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - "sequence_parallel": False, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) - - -def llama32_3b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B").to_megatron_provider(load_weights=False) + + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = 8192 + + # Parallelism settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 8192 + + # Training config + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # Scheduler config + cfg.scheduler.lr_warmup_iters = 2000 + + # Logger config + cfg.logger.log_timers_to_tensorboard = True + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def llama32_3b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3.2 3B. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=1, PP=1, CP=1. 
""" - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Llama-3.2-3B", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - "sequence_parallel": False, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) - - -# Llama3 8B models -def llama3_8b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-3B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = 8192 + + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 8192 + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +# ============================================================================= +# Llama 3 8B Pretrain Configs +# ============================================================================= + + +def llama3_8b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3 8B. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=1, PP=1, CP=2. 
""" - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3-8B", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 2, - "sequence_parallel": False, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) - - -def llama3_8b_16k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-8B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = 8192 + + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 2 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 8192 + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def llama3_8b_16k_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3 8B 16K. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=4, PP=2, CP=2, SP=True. 
""" - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3-8B", - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 2, - "pipeline_dtype": torch.bfloat16, - "context_parallel_size": 2, - "sequence_parallel": True, - "seq_length": SEQUENCE_LENGTH_16K, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) - - -def llama3_8b_64k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-8B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = SEQUENCE_LENGTH_16K + + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 2 + cfg.model.sequence_parallel = True + cfg.model.seq_length = SEQUENCE_LENGTH_16K + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def llama3_8b_64k_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3 8B 64K. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=4, PP=2, CP=4, SP=True. 
""" - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3-8B", - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 2, - "pipeline_dtype": torch.bfloat16, - "context_parallel_size": 4, - "sequence_parallel": True, - "seq_length": SEQUENCE_LENGTH_64K, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) - - -def llama3_8b_128k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-8B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = SEQUENCE_LENGTH_64K + + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 4 + cfg.model.sequence_parallel = True + cfg.model.seq_length = SEQUENCE_LENGTH_64K + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def llama3_8b_128k_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3 8B 128K. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=4, PP=2, CP=8, SP=True. 
""" - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3-8B", - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 2, - "pipeline_dtype": torch.bfloat16, - "context_parallel_size": 8, - "sequence_parallel": True, - "seq_length": SEQUENCE_LENGTH_128K, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) - - -def llama3_8b_low_precision_pretrain_config( - mixed_precision_recipe: str, **user_kwargs: Unpack[Llama3CommonKwargs] -) -> ConfigContainer: + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-8B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = SEQUENCE_LENGTH_128K + + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 8 + cfg.model.sequence_parallel = True + cfg.model.seq_length = SEQUENCE_LENGTH_128K + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def llama3_8b_low_precision_pretrain_config(mixed_precision_recipe: str) -> ConfigContainer: """Return a low precision (FP8 Current Scaling/MXFP8/NVFP4) pre-training config for Llama 3 8B. 
Args: @@ -247,356 +652,642 @@ def llama3_8b_low_precision_pretrain_config( - "bf16_with_mxfp8_mixed" - "bf16_with_fp8_current_scaling_mixed" - "bf16_with_nvfp4_mixed" - user_kwargs (Unpack[Llama3CommonKwargs]): Additional user-specified configuration options. Returns: ConfigContainer: The pre-training configuration for Llama 3 8B. - - See `_llama3_common` for the full list of parameters. """ assert mixed_precision_recipe in [ "bf16_with_mxfp8_mixed", "bf16_with_fp8_current_scaling_mixed", "bf16_with_nvfp4_mixed", ], f"Invalid low precision recipe: {mixed_precision_recipe}. This recipe has not been tested yet." + precision_config = get_mixed_precision_config(mixed_precision_recipe) - if ( - mixed_precision_recipe == "bf16_with_nvfp4_mixed" - ): # for llama3-8B nvfp4 recipe, we use BF16 for the last 4 layers + if mixed_precision_recipe == "bf16_with_nvfp4_mixed": + # For llama3-8B nvfp4 recipe, we use BF16 for the last 4 layers precision_config.first_last_layers_bf16 = True precision_config.num_layers_at_start_in_bf16 = 0 precision_config.num_layers_at_end_in_bf16 = 4 - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3-8B", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 2, - "sequence_parallel": False, - "precision_config": precision_config, - "lr": 6e-4, - "min_lr": 6e-6, - "adam_eps": 1e-8, - "micro_batch_size": 1, - "global_batch_size": 768, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) - - -# Llama3 70B models -def llama3_70b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-8B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = 8192 + + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 2 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 8192 + + # Low precision specific training params + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 768 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # Low precision specific optimizer params + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Low precision specific optimizer params + cfg.optimizer.lr = 6e-4 + cfg.optimizer.min_lr = 6e-6 + cfg.optimizer.adam_eps = 1e-8 + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # Set the precision config + cfg.mixed_precision = precision_config + + return cfg + + +# ============================================================================= +# Llama 3 70B Pretrain Configs +# ============================================================================= + + +def llama3_70b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3 70B. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=4, PP=4, VPP=5, CP=2, SP=True with CommOverlap. 
""" - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3-70B", - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 4, - "pipeline_dtype": torch.bfloat16, - "virtual_pipeline_model_parallel_size": 5, - "context_parallel_size": 2, - "sequence_parallel": True, - "comm_overlap_config": CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, - ), - "precision_config": bf16_mixed(), - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-70B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = 8192 + + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = 5 + cfg.model.context_parallel_size = 2 + cfg.model.sequence_parallel = True + cfg.model.seq_length = 8192 + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. 
To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # Communication overlap for 70B + cfg.comm_overlap = CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + ) + + # Mixed precision - explicitly use bf16_mixed + cfg.mixed_precision = bf16_mixed() + + return cfg -def llama3_70b_16k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: +def llama3_70b_16k_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3 70B 16K. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=8, PP=2, CP=2, SP=True with CommOverlap. """ - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3-70B", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 2, - "pipeline_dtype": torch.bfloat16, - "virtual_pipeline_model_parallel_size": None, - "context_parallel_size": 2, - "sequence_parallel": True, - "seq_length": SEQUENCE_LENGTH_16K, - "comm_overlap_config": CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, - ), - "precision_config": bf16_mixed(), - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-70B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = SEQUENCE_LENGTH_16K + + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 2 + cfg.model.sequence_parallel = True + cfg.model.seq_length = SEQUENCE_LENGTH_16K + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # Communication overlap for 70B + cfg.comm_overlap = CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + ) + + cfg.mixed_precision = bf16_mixed() + + return cfg -def llama3_70b_64k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: +def llama3_70b_64k_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3 70B 64K. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=8, PP=4, CP=8, SP=True with CommOverlap. 
""" - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3-70B", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 4, - "pipeline_dtype": torch.bfloat16, - "virtual_pipeline_model_parallel_size": None, - "context_parallel_size": 8, - "sequence_parallel": True, - "seq_length": SEQUENCE_LENGTH_64K, - "comm_overlap_config": CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, - ), - "precision_config": bf16_mixed(), - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-70B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = SEQUENCE_LENGTH_64K + + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 8 + cfg.model.sequence_parallel = True + cfg.model.seq_length = SEQUENCE_LENGTH_64K + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. 
To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # Communication overlap for 70B + cfg.comm_overlap = CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + ) + cfg.mixed_precision = bf16_mixed() -# Llama3.1 models -def llama31_8b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + return cfg + + +# ============================================================================= +# Llama 3.1 Pretrain Configs +# ============================================================================= + + +def llama31_8b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Llama 3.1 8B. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=1, PP=1, CP=2. """ - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3.1-8B", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 2, - "sequence_parallel": False, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) - - -def llama31_70b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Llama 3.1 70B. + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3.1-8B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = 8192 + + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 2 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 8192 + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" - See `_llama3_common` for the full list of parameters. - """ - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3.1-70B", - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 4, - "pipeline_dtype": torch.bfloat16, - "virtual_pipeline_model_parallel_size": 5, - "context_parallel_size": 2, - "sequence_parallel": True, - "comm_overlap_config": CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, - ), - "precision_config": bf16_mixed(), - "seq_length": SEQUENCE_LENGTH_128K, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) + return cfg -def llama31_405b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Llama 3.1 405B. +def llama31_70b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for Llama 3.1 70B. - See `_llama3_common` for the full list of parameters. + Recommended parallelism: TP=4, PP=4, VPP=5, CP=2, SP=True with CommOverlap, seq=128K. 
""" - recommended_kwargs: Llama3CommonKwargs = { - "hf_path": "meta-llama/Meta-Llama-3.1-405B", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 8, - "pipeline_dtype": torch.bfloat16, - "virtual_pipeline_model_parallel_size": 2, - "context_parallel_size": 4, - "sequence_parallel": True, - "account_for_embedding_in_pipeline_split": True, - "account_for_loss_in_pipeline_split": True, - "comm_overlap_config": CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, - ), - "precision_config": bf16_mixed(), - "micro_batch_size": 1, - "seq_length": SEQUENCE_LENGTH_128K, - } - combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _llama3_common(**combined_kwargs) + cfg = _pretrain_common() + + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3.1-70B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = SEQUENCE_LENGTH_128K + + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = 5 + cfg.model.context_parallel_size = 2 + cfg.model.sequence_parallel = True + cfg.model.seq_length = SEQUENCE_LENGTH_128K + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + cfg.checkpoint.save_interval = 500 + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. 
To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + # Communication overlap for 70B + cfg.comm_overlap = CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + ) + cfg.mixed_precision = bf16_mixed() -def _llama3_common( - hf_path: str, - dir: str | None = None, - name: str = "default", - load_weights: bool = False, - # Dataset configuration - data_paths: list[str] | None = None, - data_args_path: str | None = None, - train_data_path: list[str] | None = None, - valid_data_path: list[str] | None = None, - test_data_path: list[str] | None = None, - per_split_data_args_path: str | None = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: torch.dtype | None = None, - virtual_pipeline_model_parallel_size: int | None = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - account_for_embedding_in_pipeline_split: bool = False, - account_for_loss_in_pipeline_split: bool = False, - # Training hyperparameters - train_iters: int = 1168251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - seq_length: int = 8192, - lr: float = 3e-4, - min_lr: float = 3e-5, - adam_eps: float = 1e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: int | None = None, - eval_interval: int = 2000, - save_interval: int = 500, - use_null_tokenizer: bool = True, - # Precision recipe - precision_config: MixedPrecisionConfig | str | None = "bf16_mixed", - comm_overlap_config: CommOverlapConfig | None = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3 family models using a given HuggingFace path. + return cfg - Args: - hf_path (str): HuggingFace model path (e.g., "meta-llama/Meta-Llama-3-8B"). - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism. - sequence_parallel (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - account_for_embedding_in_pipeline_split (bool): Whether to account for embedding in pipeline split. 
- account_for_loss_in_pipeline_split (bool): Whether to account for loss in pipeline split. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - adam_eps (float): AdamW epsilon. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. - Returns: - ConfigContainer: Configuration for pre-training. +def llama31_405b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for Llama 3.1 405B. + + Recommended parallelism: TP=8, PP=8, VPP=2, CP=4, SP=True with CommOverlap, seq=128K. """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + cfg = _pretrain_common() - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock + cfg.model = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3.1-405B").to_megatron_provider( + load_weights=False ) - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=load_weights) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.seq_length = seq_length - model_cfg.cross_entropy_fusion_impl = "te" + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - # Large model specific pipeline split configurations - if account_for_embedding_in_pipeline_split: - model_cfg.account_for_embedding_in_pipeline_split = True - if account_for_loss_in_pipeline_split: - model_cfg.account_for_loss_in_pipeline_split = True + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.seq_length = SEQUENCE_LENGTH_128K - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - adam_eps=adam_eps, - ) + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = 2 + cfg.model.context_parallel_size = 4 + cfg.model.sequence_parallel = True + cfg.model.seq_length = SEQUENCE_LENGTH_128K - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + # Large model specific pipeline split configurations + cfg.model.account_for_embedding_in_pipeline_split = True + cfg.model.account_for_loss_in_pipeline_split = True + + cfg.train.train_iters = 1168251 + cfg.train.global_batch_size = 512 + cfg.train.micro_batch_size = 1 # 405B uses micro_batch_size=1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = True + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + + 
cfg.optimizer.use_precision_aware_optimizer = False
+ cfg.optimizer.main_grads_dtype = torch.float32
+ cfg.optimizer.main_params_dtype = torch.float32
+ cfg.optimizer.exp_avg_dtype = torch.float32
+ cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+ cfg.checkpoint.save_interval = 500
+ # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here. Ex:
+ # cfg.checkpoint.save = "path/to/save"
+ # cfg.checkpoint.load = "path/to/load"
+
+ cfg.ddp.overlap_grad_reduce = True
+ cfg.ddp.overlap_param_gather = True
+ cfg.ddp.check_for_nan_in_grad = True
+ cfg.ddp.use_distributed_optimizer = True
+ cfg.ddp.use_megatron_fsdp = False
+ cfg.ddp.grad_reduce_in_fp32 = True
+ cfg.ddp.average_in_collective = True
+ cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+
+ # Communication overlap for 405B
+ cfg.comm_overlap = CommOverlapConfig(
+ tp_comm_overlap=True,
+ tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192,
+ )
+ cfg.mixed_precision = bf16_mixed()
+ return cfg
diff --git a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
index 30609c18b5..1159c5e3e9 100644
--- a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
+++ b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import logging
 import os
 from typing import List, Optional, Union
@@ -22,14 +21,13 @@
 from megatron.bridge.models.deepseek import MoonlightModelProvider16B
 from megatron.bridge.peft.base import PEFT
-from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths
+from megatron.bridge.recipes.common import _pretrain_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.training.comm_overlap import CommOverlapConfig
 from megatron.bridge.training.config import (
 CheckpointConfig,
 ConfigContainer,
- GPTDatasetConfig,
 LoggerConfig,
 RNGConfig,
 TokenizerConfig,
@@ -38,9 +36,6 @@
 from megatron.bridge.training.mixed_precision import MixedPrecisionConfig
-logger = logging.getLogger(__name__)
-
-
 class MoonlightCommonKwargs(TypedDict, total=False):
 """Typed options accepted by Moonlight family recipe helpers."""
@@ -132,238 +127,160 @@ class MoonlightFinetuneKwargs(TypedDict, total=False):
 wandb_exp_name: Optional[str]
-def moonlight_16b_pretrain_config(**user_kwargs: Unpack[MoonlightCommonKwargs]) -> ConfigContainer:
- """Return a pre-training config for Moonlight-16B.
-
- See `_moonlight_common` for the full list of parameters.
- """ - recommended_kwargs: MoonlightCommonKwargs = { - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "virtual_pipeline_model_parallel_size": None, - "context_parallel_size": 1, - "expert_model_parallel_size": 8, - "sequence_parallel": True, - "recompute_granularity": "selective", - "enable_deepep": False, - "apply_rope_fusion": False, - "train_iters": 500_000, - "global_batch_size": 2048, - "micro_batch_size": 1, - "seq_length": 4096, - "lr": 3e-4, - "min_lr": 3e-5, - "lr_warmup_iters": 2000, - "optimizer_type": "adam", - "eval_interval": 2000, - "save_interval": 2000, +def _get_moonlight_pipeline_layout(pp_size: int, vp_size: int): + """Get pipeline layout for Moonlight-16B based on PP and VP size.""" + map_pp_vp_to_layout = { + (1, 1): None, + (2, 1): [["embedding"] + ["decoder"] * 14, ["decoder"] * 13 + ["loss"]], + (4, 1): [["embedding"] + ["decoder"] * 7] + [["decoder"] * 7] * 2 + [["decoder"] * 6 + ["loss"]], + (8, 1): [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 6 + [["decoder"] * 3 + ["loss"]], + (2, 2): [["embedding"] + ["decoder"] * 7] + [["decoder"] * 7] * 2 + [["decoder"] * 6 + ["loss"]], + (4, 2): [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 6 + [["decoder"] * 3 + ["loss"]], } - combined_kwargs: MoonlightCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _moonlight_common(**combined_kwargs) + if (pp_size, vp_size) not in map_pp_vp_to_layout: + raise ValueError( + f"Invalid PP and VP size: {pp_size} and {vp_size} to infer PP layout " + f"for Moonlight-16B. Known PP and VP combinations: {map_pp_vp_to_layout.keys()}" + ) + layout = map_pp_vp_to_layout[(pp_size, vp_size)] + if layout is not None: + layout = list([list(x) for x in layout]) + return layout -def _moonlight_common( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 2, - pipeline_model_parallel_size: int = 2, - pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - expert_model_parallel_size: int = 4, - sequence_parallel: bool = True, - # Recomputation - recompute_granularity: str = "selective", - recompute_modules: Optional[List[str]] = None, - recompute_method: Optional[str] = None, - recompute_num_layers: Optional[int] = None, - enable_deepep: bool = False, - apply_rope_fusion: bool = False, - # Training hyperparameters - train_iters: int = 500_000, - global_batch_size: int = 2048, - micro_batch_size: int = 1, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - optimizer_type: str = "adam", - eval_interval: int = 2000, - save_interval: int = 2000, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Moonlight-16B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. 
If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism. - expert_model_parallel_size (int): Degree of expert model parallelism. - sequence_parallel (bool): Whether to use sequence parallelism. - recompute_granularity (str): Recomputation granularity. - recompute_modules (Optional[List[str]]): Modules to recompute. - recompute_method (Optional[str]): Recomputation method. - recompute_num_layers (Optional[int]): Number of layers to recompute. - enable_deepep (bool): Whether to use DeePEP. - apply_rope_fusion (bool): Whether to apply RoPE fusion. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - optimizer_type (str): Type of optimizer to use. - eval_interval (int): Interval for evaluation. - save_interval (int): Interval for saving checkpoints. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. +def moonlight_16b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for Moonlight-16B. - Returns: - ConfigContainer: Configuration for pre-training. + Recommended parallelism: TP=2, PP=1, EP=8 + Uses precision-aware optimizer with bf16 gradients/moments. 
""" - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock + cfg = _pretrain_common() + + # Model config - uses MoonlightModelProvider16B instead of AutoBridge + cfg.model = MoonlightModelProvider16B( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + expert_model_parallel_size=8, + sequence_parallel=True, + expert_tensor_parallel_size=1, + recompute_granularity="selective", + recompute_modules=None, + recompute_method=None, + recompute_num_layers=None, ) - model_cfg = _model_config( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - pipeline_dtype=pipeline_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size, - context_parallel_size=context_parallel_size, - expert_model_parallel_size=expert_model_parallel_size, - sequence_parallel=sequence_parallel, - recompute_granularity=recompute_granularity, - recompute_modules=recompute_modules, - recompute_method=recompute_method, - recompute_num_layers=recompute_num_layers, - enable_deepep=enable_deepep, - apply_rope_fusion=apply_rope_fusion, - ) - - if optimizer_type == "adam": - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=train_iters, - adam_beta1=0.9, - adam_beta2=0.95, - adam_eps=1e-8, - weight_decay=0.1, - max_lr=lr, - min_lr=min_lr, - ) + # Pipeline split settings (asymmetric stages) + cfg.model.account_for_embedding_in_pipeline_split = False + cfg.model.account_for_loss_in_pipeline_split = False + cfg.model.num_layers_in_first_pipeline_stage = None + cfg.model.num_layers_in_last_pipeline_stage = None + + # Set pipeline layout + cfg.model.pipeline_model_parallel_layout = _get_moonlight_pipeline_layout(1, 1) + + # Tokenizer - uses NullTokenizer with model vocab_size + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = cfg.model.vocab_size + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 4096 + cfg.dataset.num_workers = 8 + cfg.dataset.split = "99990,8,2" + + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # Training config + cfg.train.train_iters = 500_000 + cfg.train.global_batch_size = 2048 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 2000 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 5 + cfg.train.manual_gc_eval = 5 - opt_config.use_precision_aware_optimizer = True - opt_config.main_params_dtype = torch.float32 - opt_config.main_grads_dtype = torch.bfloat16 - opt_config.exp_avg_dtype = torch.bfloat16 - opt_config.exp_avg_sq_dtype = torch.bfloat16 - else: - # TODO: Add support for muon optimizer once mcore supports it - raise ValueError(f"Invalid optimizer type: {optimizer_type}") - - if precision_config is None: - precision_config = MixedPrecisionConfig( - bf16=True, - params_dtype=torch.bfloat16, - pipeline_dtype=torch.bfloat16, - autocast_enabled=False, - grad_reduce_in_fp32=False, - ) - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=5, - manual_gc_eval=5, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=False, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split or "99990,8,2", - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=model_cfg.vocab_size), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - async_save=False, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + # Optimizer + cfg.scheduler.lr_warmup_iters = 2000 + cfg.scheduler.lr_decay_iters = cfg.train.train_iters + + # Precision-aware optimizer settings + cfg.optimizer.use_precision_aware_optimizer = True + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.main_grads_dtype = torch.bfloat16 + cfg.optimizer.exp_avg_dtype = torch.bfloat16 + cfg.optimizer.exp_avg_sq_dtype = torch.bfloat16 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) - already set in 
MoonlightModelProvider16B + # cfg.model.recompute_granularity = "selective" + # cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - Moonlight uses custom MixedPrecisionConfig (NOT "bf16_mixed" string) + cfg.mixed_precision = MixedPrecisionConfig( + bf16=True, + params_dtype=torch.bfloat16, + pipeline_dtype=torch.bfloat16, + autocast_enabled=False, + grad_reduce_in_fp32=False, # Different from _pretrain_common ) - - if apply_rope_fusion: + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.model.moe_router_padding_for_fp8 = False + + # Communication overlap + cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) + cfg.comm_overlap.delay_wgrad_compute = False + cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = True + + # Checkpoint config + cfg.checkpoint.save_interval = 2000 + cfg.checkpoint.async_save = False + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config (DIFFERENT: grad_reduce_in_fp32=False) + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.grad_reduce_in_fp32 = False # Different from _pretrain_common + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + if cfg.model.apply_rope_fusion: cfg.dist.enable_megatron_core_experimental = True # for mla rope fusion - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=False, - ) + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False return cfg @@ -433,26 +350,9 @@ def _model_config( cfg.apply_rope_fusion = True # Pipeline parallelism configs. We infer PP layout from the provided PP and VP size - map_pp_vp_to_layout = { - (1, 1): None, - (2, 1): [["embedding"] + ["decoder"] * 14, ["decoder"] * 13 + ["loss"]], - (4, 1): [["embedding"] + ["decoder"] * 7] + [["decoder"] * 7] * 2 + [["decoder"] * 6 + ["loss"]], - (8, 1): [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 6 + [["decoder"] * 3 + ["loss"]], - (2, 2): [["embedding"] + ["decoder"] * 7] + [["decoder"] * 7] * 2 + [["decoder"] * 6 + ["loss"]], - (4, 2): [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 6 + [["decoder"] * 3 + ["loss"]], - } pp_size = pipeline_model_parallel_size or 1 vp_size = virtual_pipeline_model_parallel_size or 1 - if (pp_size, vp_size) not in map_pp_vp_to_layout: - raise ValueError( - f"Invalid PP and VP size: {pp_size} and {vp_size} to infer PP layout " - f"for Moonlight-16B. 
Known PP and VP combinations: {map_pp_vp_to_layout.keys()}" - ) - - layout = map_pp_vp_to_layout[(pp_size, vp_size)] - - if layout is not None: - layout = list([list(x) for x in layout]) # yield all the elements + layout = _get_moonlight_pipeline_layout(pp_size, vp_size) cfg.pipeline_model_parallel_layout = layout if enable_deepep: @@ -616,7 +516,7 @@ def _moonlight_finetune_common( min_lr=min_lr, adam_beta1=0.9, adam_beta2=0.98, - adam_eps=1e-8, + adam_eps=1e-5, weight_decay=0.1, ) diff --git a/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py b/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py index b70a6f09d3..8077c63935 100644 --- a/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py +++ b/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py @@ -20,16 +20,14 @@ from megatron.bridge.models.nemotronh import Nemotron3NanoProvider from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, DistributedDataParallelConfig, - GPTDatasetConfig, LoggerConfig, RNGConfig, TokenizerConfig, @@ -38,277 +36,132 @@ from megatron.bridge.training.mixed_precision import MixedPrecisionConfig -class Nemotron3NanoCommonKwargs(TypedDict, total=False): - """Typed options accepted by Nemotron 3 Nano recipe helper functions.""" +def nemotron_3_nano_pretrain_config() -> ConfigContainer: + """Return a pre-training config for Nemotron 3 Nano (30B-A3B MoE). - # Core identifiers - model_provider: Nemotron3NanoProvider - dir: Optional[str] - name: str - # Dataset configuration - data_paths: Optional[list[str]] - data_args_path: Optional[str] - train_data_path: Optional[list[str]] - valid_data_path: Optional[list[str]] - test_data_path: Optional[list[str]] - per_split_data_args_path: Optional[str] - path_to_cache: Optional[str] - mock: bool - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_parallelism_dtype: Optional[torch.dtype] - virtual_pipeline_parallelism: Optional[int] - context_parallel_size: int - sequence_parallelism: bool - expert_tensor_parallelism: int - expert_model_parallelism: int - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - use_null_tokenizer: bool - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # MoE - enable_deepep: bool - - -def nemotron_3_nano_pretrain_config(**user_kwargs: Unpack[Nemotron3NanoCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Nemotron 3 Nano. - - This recipe is designed for multi-node training. - Default parallelism: TP=4, PP=1, SP=True, with DeepEP enabled. - - See `_nemotron_3_nano_common` for the full list of parameters. 
- """ - recommended_kwargs: Nemotron3NanoCommonKwargs = { - "model_provider": Nemotron3NanoProvider, - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 1, - "pipeline_parallelism_dtype": torch.bfloat16, - "context_parallel_size": 1, - "sequence_parallelism": True, - "enable_deepep": True, - "expert_tensor_parallelism": 1, - "expert_model_parallelism": 8, - "precision_config": "bf16_mixed", - } - combined_kwargs: Nemotron3NanoCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _nemotron_3_nano_common(**combined_kwargs) - - -def _nemotron_3_nano_common( - model_provider: type[Nemotron3NanoProvider], - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[list[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[list[str]] = None, - valid_data_path: Optional[list[str]] = None, - test_data_path: Optional[list[str]] = None, - per_split_data_args_path: Optional[str] = None, - path_to_cache: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 4, - pipeline_model_parallel_size: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallelism: bool = True, - expert_tensor_parallelism: int = 1, - expert_model_parallelism: int = 8, - # Training hyperparameters - train_iters: int = 39735, - global_batch_size: int = 3072, - micro_batch_size: int = 2, - seq_length: int = 8192, - eval_interval: int = 1000, - save_interval: int = 200, - # Optimizer - lr: float = 1.6e-3, - min_lr: float = 1.6e-5, - lr_warmup_iters: int = 333, - lr_decay_iters: Optional[int] = None, - use_null_tokenizer: bool = False, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, - # MoE - enable_deepep: bool = True, - # W&B - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Nemotron 3 Nano model. - - Args: - model_provider: The model provider class for the Nemotron 3 Nano variant. - dir: Base directory for saving logs and checkpoints. - name: Name of the pre-training run. - data_paths: List of paths to dataset files. If None, mock data will be used. - data_args_path: Path to file containing data arguments. - train_data_path: List of training data paths. - valid_data_path: List of validation data paths. - test_data_path: List of test data paths. - per_split_data_args_path: Path to JSON file with per-split data configuration. - path_to_cache: Path to cache directory. - mock: Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size: Degree of tensor model parallelism. - pipeline_model_parallel_size: Degree of pipeline model parallelism. - pipeline_parallelism_dtype: Data type for pipeline parallelism. - virtual_pipeline_parallelism: Size of virtual pipeline parallelism. - context_parallel_size: Degree of context parallelism to be passed to model_config. - sequence_parallelism: Whether to use sequence parallelism. - expert_tensor_parallelism: Degree of expert tensor parallelism. - expert_model_parallelism: Degree of expert model parallelism. - train_iters: Total number of training iterations. - global_batch_size: Global batch size for training. - micro_batch_size: Micro batch size for training. 
- seq_length: Sequence length for training data. - eval_interval: Interval (in iterations) between evaluations. - save_interval: Interval (in iterations) between checkpoints. - lr: Learning rate. - min_lr: Minimum learning rate for cosine decay. - lr_warmup_iters: Number of warmup iterations for the learning rate. - lr_decay_iters: Number of iterations for learning rate decay. - use_null_tokenizer: Whether to use NullTokenizer instead of HuggingFaceTokenizer. - precision_config: Precision configuration for the model. - comm_overlap_config: Communication overlap configuration for the model. - enable_deepep: Whether to enable DeepEP for MoE. - wandb_project: Weights & Biases project name. - wandb_entity: Weights & Biases entity name. - wandb_exp_name: Weights & Biases experiment name. + This is a MoE (Mixture of Experts) model with the following default parallelism: + - TP=4, PP=1, EP=8, SP=True + - DeepEP enabled for MoE token dispatch Returns: - ConfigContainer: Configuration for pre-training. + ConfigContainer: Pre-training configuration for Nemotron 3 Nano. """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock + cfg = _pretrain_common() + + # Model Configuration (MoE) + cfg.model = Nemotron3NanoProvider( + tensor_model_parallel_size=4, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + sequence_parallel=True, + expert_tensor_parallel_size=1, + expert_model_parallel_size=8, + seq_length=8192, ) - # Configure the model (integrating the old model_config functionality) - model_cfg = model_provider( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallel_size, - sequence_parallel=sequence_parallelism, - expert_tensor_parallel_size=expert_tensor_parallelism, - expert_model_parallel_size=expert_model_parallelism, - apply_rope_fusion=False, - async_tensor_model_parallel_allreduce=True, - attention_backend="fused", - gradient_accumulation_fusion=True, - init_method_std=0.0173, - use_fused_weighted_squared_relu=True, - seq_length=seq_length, - ) - - if enable_deepep: - model_cfg.moe_token_dispatcher_type = "flex" - model_cfg.moe_shared_expert_overlap = False - model_cfg.moe_flex_dispatcher_backend = "deepep" - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - adam_beta1=0.9, - adam_beta2=0.95, - adam_eps=1e-8, - weight_decay=0.1, - max_lr=lr, - min_lr=min_lr, - start_weight_decay=0.1, - end_weight_decay=0.1, - lr_decay_style="cosine", + # Tokenizer (--tokenizer-model) + cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" + + # Dataset Configuration + cfg.dataset.seq_length = 8192 + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.mmap_bin_files = False + + # Parallelism Settings (MoE-specific) + cfg.model.pipeline_model_parallel_layout = None + + # MoE Token Dispatcher Settings + cfg.model.moe_token_dispatcher_type = "flex" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # Training Configuration + cfg.train.train_iters = 39735 + cfg.train.global_batch_size = 3072 + cfg.train.micro_batch_size = 2 + cfg.train.manual_gc = False + cfg.train.manual_gc_interval = 0 + + # Transformer Engine (TE) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel Selections + cfg.model.attention_backend = "fused" + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory Saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # ========================================================================= + # FP8 & MXFP8 (Mixed Precision Settings) + # ========================================================================= + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # FP8 settings (disabled by default, uncomment to enable) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.model.moe_router_padding_for_fp8 = False + + # Optimizer Precision Settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Optimizer hyperparameters + cfg.optimizer.lr = 1.6e-3 + cfg.optimizer.weight_decay = 0.1 + cfg.scheduler.min_lr = 1.6e-5 + cfg.scheduler.warmup_iters = 333 + + # Communication Overlap + cfg.comm_overlap = CommOverlapConfig( + tp_comm_bootstrap_backend="nccl", + tp_comm_overlap=True, ) - - tokenizer_config = TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - use_distributed_optimizer=True, - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - path_to_cache=path_to_cache, - # Dataloader config parameters - 
data_sharding=True, - dataloader_type="single", - num_workers=1, - skip_getting_attention_mask_from_dataset=True, - mmap_bin_files=False, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - wandb_project=wandb_project, - wandb_entity=wandb_entity, - wandb_exp_name=wandb_exp_name, - ), - tokenizer=tokenizer_config, - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - dist_ckpt_strictness="log_all", - ckpt_assume_constant_structure=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_bootstrap_backend="nccl", - tp_comm_overlap=True, - ) + cfg.comm_overlap.delay_wgrad_compute = False + cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = False + + # Checkpoint Configuration + # Paths are set in _pretrain_common by default. Override here if needed: + # cfg.checkpoint.load = "path/to/load" + # cfg.checkpoint.save = "path/to/save" + cfg.checkpoint.save_interval = 200 + cfg.checkpoint.ckpt_assume_constant_structure = True + cfg.checkpoint.dist_ckpt_strictness = "log_all" + + # DDP Configuration + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False + + cfg.model.init_method_std = 0.0173 + cfg.model.apply_rope_fusion = False + cfg.model.async_tensor_model_parallel_allreduce = True + cfg.model.gradient_accumulation_fusion = True + cfg.model.use_fused_weighted_squared_relu = True return cfg diff --git a/src/megatron/bridge/recipes/nemotronh/nemotron_nano_v2.py b/src/megatron/bridge/recipes/nemotronh/nemotron_nano_v2.py index 7aba3cfe1c..facfdf613b 100644 --- a/src/megatron/bridge/recipes/nemotronh/nemotron_nano_v2.py +++ b/src/megatron/bridge/recipes/nemotronh/nemotron_nano_v2.py @@ -22,6 +22,8 @@ NemotronNanoModelProvider12Bv2, ) from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.common import _pretrain_common +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ( ConfigContainer, @@ -86,50 +88,205 @@ class NemotronNanoV2FinetuneKwargs(NemotronNanoV2CommonKwargs, total=False): wandb_exp_name: str | None -def nemotron_nano_9b_v2_pretrain_config(**user_kwargs: Unpack[NemotronNanoV2CommonKwargs]) -> ConfigContainer: +def nemotron_nano_9b_v2_pretrain_config() -> ConfigContainer: """Return a pre-training config for Nemotron Nano 9B v2. This recipe is designed for single-node training (1 node). Default parallelism: TP=2, PP=1, SP=True. - - See `_nemotronh_common` for the full list of parameters. 
""" - from megatron.bridge.recipes.nemotronh.nemotronh import _nemotronh_common - - recommended_kwargs: NemotronNanoV2CommonKwargs = { - "model_provider": NemotronNanoModelProvider9Bv2, - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 1, - "sequence_parallel": True, - "precision_config": "bf16_mixed", - "enable_default_comm_overlap": True, - } - combined_kwargs: NemotronNanoV2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _nemotronh_common(tokenizer_model="nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base", **combined_kwargs) - - -def nemotron_nano_12b_v2_pretrain_config(**user_kwargs: Unpack[NemotronNanoV2CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + # Model config - uses NemotronNanoModelProvider9Bv2 + cfg.model = NemotronNanoModelProvider9Bv2( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + sequence_parallel=True, + ) + + # Parallel settings (already set in model provider above) + cfg.model.pipeline_model_parallel_layout = None + + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 8192 + cfg.dataset.num_workers = 8 + + # Training config + cfg.train.train_iters = 1_168_251 + cfg.train.global_batch_size = 768 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 10 + + cfg.train.manual_gc = False + cfg.train.manual_gc_interval = 0 + cfg.train.manual_gc_eval = True + + # Optimizer + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = False + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - bf16_mixed + cfg.mixed_precision = "bf16_mixed" + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap + cfg.comm_overlap = CommOverlapConfig( + tp_comm_bootstrap_backend="nccl", + tp_comm_overlap=True, + ) + + # Checkpoint config + cfg.checkpoint.save_interval = 10 + cfg.checkpoint.dist_ckpt_strictness = "log_all" + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + 
cfg.ddp.use_distributed_optimizer = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def nemotron_nano_12b_v2_pretrain_config() -> ConfigContainer: """Return a pre-training config for Nemotron Nano 12B v2. This recipe is designed for single-node training (1 node). Default parallelism: TP=4, PP=1, SP=True. Note: Uses FP8 precision by default. Communication overlap is disabled by default. - - See `_nemotronh_common` for the full list of parameters. """ - from megatron.bridge.recipes.nemotronh.nemotronh import _nemotronh_common - - recommended_kwargs: NemotronNanoV2CommonKwargs = { - "model_provider": NemotronNanoModelProvider12Bv2, - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 1, - "sequence_parallel": True, - "precision_config": "nanov2_bf16_with_fp8_current_scaling_mixed", - "enable_default_comm_overlap": False, - } - combined_kwargs: NemotronNanoV2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _nemotronh_common(tokenizer_model="nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base", **combined_kwargs) + cfg = _pretrain_common() + + # Model config - uses NemotronNanoModelProvider12Bv2 + cfg.model = NemotronNanoModelProvider12Bv2( + tensor_model_parallel_size=4, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + sequence_parallel=True, + ) + + # Parallel settings (already set in model provider above) + cfg.model.pipeline_model_parallel_layout = None + + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 8192 + cfg.dataset.num_workers = 8 + + # Training config + cfg.train.train_iters = 1_168_251 + cfg.train.global_batch_size = 768 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 10 + + cfg.train.manual_gc = False + cfg.train.manual_gc_interval = 0 + cfg.train.manual_gc_eval = True + + # Optimizer + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = False + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - FP8 with current scaling + cfg.mixed_precision = "nanov2_bf16_with_fp8_current_scaling_mixed" + # FP8 settings (commented - already enabled via precision string above) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap - disabled by default for 12B (FP8 compatibility) + cfg.comm_overlap = None + + # Checkpoint config + cfg.checkpoint.save_interval = 10 + cfg.checkpoint.dist_ckpt_strictness = "log_all" + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg def nemotron_nano_9b_v2_finetune_config(**user_kwargs: Unpack[NemotronNanoV2FinetuneKwargs]) -> ConfigContainer: diff --git a/src/megatron/bridge/recipes/nemotronh/nemotronh.py b/src/megatron/bridge/recipes/nemotronh/nemotronh.py index 04d380567c..089dab74a3 100644 --- a/src/megatron/bridge/recipes/nemotronh/nemotronh.py +++ b/src/megatron/bridge/recipes/nemotronh/nemotronh.py @@ -25,7 +25,7 @@ NemotronHModelProvider56B, ) from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE @@ -34,7 +34,6 @@ CheckpointConfig, ConfigContainer, DistributedDataParallelConfig, - GPTDatasetConfig, LoggerConfig, RNGConfig, TokenizerConfig, @@ -100,255 +99,410 @@ class NemotronHFinetuneKwargs(NemotronHCommonKwargs, total=False): wandb_exp_name: str | None -def nemotronh_4b_pretrain_config(**user_kwargs: Unpack[NemotronHCommonKwargs]) -> 
ConfigContainer: +def nemotronh_4b_pretrain_config() -> ConfigContainer: """Return a pre-training config for NemotronH 4B. This recipe is designed for single-node training (1 node). Default parallelism: TP=1, PP=1, SP=False. - - See `_nemotronh_common` for the full list of parameters. """ - recommended_kwargs: NemotronHCommonKwargs = { - "model_provider": NemotronHModelProvider4B, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "sequence_parallel": False, - "precision_config": "bf16_mixed", - "enable_default_comm_overlap": True, - } - combined_kwargs: NemotronHCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _nemotronh_common(tokenizer_model="nvidia/Nemotron-H-4B-Base-8K", **combined_kwargs) + cfg = _pretrain_common() + + # Model config + cfg.model = NemotronHModelProvider4B( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + sequence_parallel=False, + ) + + # Parallel settings + cfg.model.pipeline_model_parallel_layout = None + + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 8192 + cfg.dataset.num_workers = 8 + # Training config + cfg.train.train_iters = 1_168_251 + cfg.train.global_batch_size = 768 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 10 -def nemotronh_8b_pretrain_config(**user_kwargs: Unpack[NemotronHCommonKwargs]) -> ConfigContainer: + cfg.train.manual_gc = False + cfg.train.manual_gc_interval = 0 + cfg.train.manual_gc_eval = True + + # Optimizer + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = False + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - bf16_mixed + cfg.mixed_precision = "bf16_mixed" + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap - enabled by default + cfg.comm_overlap = CommOverlapConfig( + tp_comm_bootstrap_backend="nccl", + tp_comm_overlap=True, + ) + + # Checkpoint config + cfg.checkpoint.save_interval = 10 + cfg.checkpoint.dist_ckpt_strictness = "log_all" + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + 
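# grad_reduce_in_fp32 is not overridden here, so the _pretrain_common default
+ # applies (True, per the "Different from _pretrain_common" note in the
+ # Moonlight recipe above).
+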
cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def nemotronh_8b_pretrain_config() -> ConfigContainer: """Return a pre-training config for NemotronH 8B. This recipe is designed for single-node training (1 node). Default parallelism: TP=2, PP=1, SP=True. - - See `_nemotronh_common` for the full list of parameters. """ - recommended_kwargs: NemotronHCommonKwargs = { - "model_provider": NemotronHModelProvider8B, - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 1, - "sequence_parallel": True, - "precision_config": "bf16_mixed", - "enable_default_comm_overlap": True, - } - combined_kwargs: NemotronHCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _nemotronh_common(tokenizer_model="nvidia/Nemotron-H-8B-Base-8K", **combined_kwargs) + cfg = _pretrain_common() + + # Model config + cfg.model = NemotronHModelProvider8B( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + sequence_parallel=True, + ) + # Parallel settings + cfg.model.pipeline_model_parallel_layout = None -def nemotronh_47b_pretrain_config(**user_kwargs: Unpack[NemotronHCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for NemotronH 47B. + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - This recipe is designed for single-node training (1 node with 8 GPUs). - Default parallelism: TP=8, PP=1, SP=True. + # Dataset config + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 8192 + cfg.dataset.num_workers = 8 - Note: Uses FP8 precision by default. Communication overlap is disabled by default - due to known issues with FP8 current scaling. + # Training config + cfg.train.train_iters = 1_168_251 + cfg.train.global_batch_size = 768 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 10 - See `_nemotronh_common` for the full list of parameters. 
- """ - recommended_kwargs: NemotronHCommonKwargs = { - "model_provider": NemotronHModelProvider47B, - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 1, - "sequence_parallel": True, - "precision_config": "nemotron_h_bf16_with_fp8_current_scaling_mixed", - "enable_default_comm_overlap": True, - } - combined_kwargs: NemotronHCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _nemotronh_common(tokenizer_model="nvidia/Nemotron-H-47B-Base-8K", **combined_kwargs) + cfg.train.manual_gc = False + cfg.train.manual_gc_interval = 0 + cfg.train.manual_gc_eval = True + # Optimizer + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = False + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - bf16_mixed + cfg.mixed_precision = "bf16_mixed" + # FP8 settings (commented - enable if using FP8) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap - enabled by default + cfg.comm_overlap = CommOverlapConfig( + tp_comm_bootstrap_backend="nccl", + tp_comm_overlap=True, + ) -def nemotronh_56b_pretrain_config(**user_kwargs: Unpack[NemotronHCommonKwargs]) -> ConfigContainer: - """Return a pre-training config for NemotronH 56B. + # Checkpoint config + cfg.checkpoint.save_interval = 10 + cfg.checkpoint.dist_ckpt_strictness = "log_all" + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def nemotronh_47b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for NemotronH 47B. This recipe is designed for single-node training (1 node with 8 GPUs). Default parallelism: TP=8, PP=1, SP=True. - Note: Uses FP8 precision by default. Communication overlap is disabled by default - due to known issues with FP8 current scaling. - - See `_nemotronh_common` for the full list of parameters. + Note: Uses FP8 precision by default. 
""" - recommended_kwargs: NemotronHCommonKwargs = { - "model_provider": NemotronHModelProvider56B, - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 1, - "sequence_parallel": True, - "precision_config": "nemotron_h_bf16_with_fp8_current_scaling_mixed", - "enable_default_comm_overlap": True, - } - combined_kwargs: NemotronHCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _nemotronh_common(tokenizer_model="nvidia/Nemotron-H-8B-Base-8K", **combined_kwargs) + cfg = _pretrain_common() + + # Model config + cfg.model = NemotronHModelProvider47B( + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + sequence_parallel=True, + ) + # Parallel settings + cfg.model.pipeline_model_parallel_layout = None -def _nemotronh_common( - model_provider: type[NemotronHModelProvider], - tokenizer_model: str | None = None, - dir: str | None = None, - name: str = "default", - # Dataset configuration - data_paths: list[str] | None = None, - data_args_path: str | None = None, - train_data_path: list[str] | None = None, - valid_data_path: list[str] | None = None, - test_data_path: list[str] | None = None, - per_split_data_args_path: str | None = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: torch.dtype | None = torch.bfloat16, - virtual_pipeline_model_parallel_size: int | None = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 768, - micro_batch_size: int = 1, - seq_length: int = 8192, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: int | None = None, - use_null_tokenizer: bool = True, - # Precision recipe - precision_config: MixedPrecisionConfig | str | None = "bf16_mixed", - comm_overlap_config: CommOverlapConfig | None = None, - # CommOverlap setting - enable_default_comm_overlap: bool = True, -) -> ConfigContainer: - """ - Create a pre-training configuration for NemotronH and Nemotron Nano v2 models. + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - Args: - model_provider: The model provider class for the specific NemotronH or Nemotron Nano v2 variant. - tokenizer_model: HuggingFace tokenizer model name (only used when use_null_tokenizer=False). - dir: Base directory for saving logs and checkpoints. - name: Name of the pre-training run. - data_paths: List of paths to dataset files. If None, mock data will be used. - data_args_path: Path to file containing data arguments. - train_data_path: List of training data paths. - valid_data_path: List of validation data paths. - test_data_path: List of test data paths. - per_split_data_args_path: Path to JSON file with per-split data configuration. - mock: Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size: Degree of tensor model parallelism. - pipeline_model_parallel_size: Degree of pipeline model parallelism. - pipeline_dtype: Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size: Size of virtual pipeline parallelism. - context_parallel_size: Degree of context parallelism to be passed to model_config. - sequence_parallel: Whether to use sequence parallelism. 
- train_iters: Total number of training iterations. - global_batch_size: Global batch size for training. - micro_batch_size: Micro batch size for training. - seq_length: Sequence length for training data. - lr: Learning rate. - min_lr: Minimum learning rate for cosine decay. - lr_warmup_iters: Number of warmup iterations for the learning rate. - lr_decay_iters: Number of iterations for learning rate decay. - use_null_tokenizer: Whether to use NullTokenizer instead of HuggingFaceTokenizer. - precision_config: Precision configuration for the model. - comm_overlap_config: Communication overlap configuration for the model. - enable_default_comm_overlap: Whether to enable default comm overlap config if none is provided. + # Dataset config + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 8192 + cfg.dataset.num_workers = 8 - Returns: - ConfigContainer: Configuration for pre-training. - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + # Training config + cfg.train.train_iters = 1_168_251 + cfg.train.global_batch_size = 768 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 10 - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) + cfg.train.manual_gc = False + cfg.train.manual_gc_interval = 0 + cfg.train.manual_gc_eval = True - model_cfg = model_provider( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - pipeline_dtype=pipeline_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size, - context_parallel_size=context_parallel_size, - sequence_parallel=sequence_parallel, + # Optimizer + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = False + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - FP8 with current scaling + cfg.mixed_precision = "nemotron_h_bf16_with_fp8_current_scaling_mixed" + # FP8 settings (commented - already enabled via precision string above) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap - enabled by default + cfg.comm_overlap = CommOverlapConfig( + tp_comm_bootstrap_backend="nccl", + tp_comm_overlap=True, ) - 
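+ # Caller-side note (hypothetical, not part of this patch): to run the 47B in
+ # plain bf16 instead of FP8 current scaling, rebind the precision string after
+ # building the config, e.g. cfg.mixed_precision = "bf16_mixed".
-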
opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - adam_beta1=0.9, - adam_beta2=0.95, - adam_eps=1e-8, - weight_decay=0.1, - max_lr=lr, - min_lr=min_lr, + # Checkpoint config + cfg.checkpoint.save_interval = 10 + cfg.checkpoint.dist_ckpt_strictness = "log_all" + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config (DIFFERENT from _pretrain_common) + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + + return cfg + + +def nemotronh_56b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for NemotronH 56B. + + This recipe is designed for single-node training (1 node with 8 GPUs). + Default parallelism: TP=8, PP=1, SP=True. + + Note: Uses FP8 precision by default. + """ + cfg = _pretrain_common() + + # Model config + cfg.model = NemotronHModelProvider56B( + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + pipeline_dtype=torch.bfloat16, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + sequence_parallel=True, ) - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=10, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=False, - use_distributed_optimizer=True, - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=tokenizer_model if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=10, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - dist_ckpt_strictness="log_all", - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + # Parallel settings + cfg.model.pipeline_model_parallel_layout = None + + # Tokenizer - uses NullTokenizer by default + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.seq_length = 8192 + cfg.dataset.num_workers = 8 + + # Training config + cfg.train.train_iters = 1_168_251 + cfg.train.global_batch_size = 768 + cfg.train.micro_batch_size = 1 + cfg.train.eval_interval = 10 + + cfg.train.manual_gc = False + cfg.train.manual_gc_interval = 0 + cfg.train.manual_gc_eval = True + + # Optimizer + cfg.scheduler.lr_warmup_iters = 2000 + + cfg.logger.log_timers_to_tensorboard = False + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Mixed precision - FP8 with current scaling + cfg.mixed_precision = "nemotron_h_bf16_with_fp8_current_scaling_mixed" + # FP8 settings (commented - already enabled via precision string above) + # cfg.mixed_precision.fp8_recipe = "tensorwise" + # cfg.mixed_precision.fp8 = None + # cfg.mixed_precision.fp8_param_gather = False + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap - enabled by default + cfg.comm_overlap = CommOverlapConfig( + tp_comm_bootstrap_backend="nccl", + tp_comm_overlap=True, ) - if cfg.comm_overlap is None and enable_default_comm_overlap: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_bootstrap_backend="nccl", - tp_comm_overlap=True, - ) + # Checkpoint config + cfg.checkpoint.save_interval = 10 + cfg.checkpoint.dist_ckpt_strictness = "log_all" + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.average_in_collective = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" return cfg diff --git a/src/megatron/bridge/recipes/olmoe/olmoe_7b.py b/src/megatron/bridge/recipes/olmoe/olmoe_7b.py index 52fac6f2b4..87e9f3232f 100644 --- a/src/megatron/bridge/recipes/olmoe/olmoe_7b.py +++ b/src/megatron/bridge/recipes/olmoe/olmoe_7b.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
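With `_nemotronh_common` and its kwargs surface removed, per-run customization happens by mutating the returned `ConfigContainer`. A minimal usage sketch of the new call pattern (the recipe module path and the override values are illustrative assumptions, not taken from this patch):

    # Hypothetical usage; the exact recipe module path is assumed.
    from megatron.bridge.recipes.nemotronh import nemotronh_56b_pretrain_config

    cfg = nemotronh_56b_pretrain_config()

    # What used to be constructor kwargs are now plain field assignments.
    cfg.train.global_batch_size = 512
    cfg.dataset.blend = (["path/to/data1", "path/to/data2"], [0.2, 0.8])
    cfg.checkpoint.save = "path/to/save"
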
diff --git a/src/megatron/bridge/recipes/olmoe/olmoe_7b.py b/src/megatron/bridge/recipes/olmoe/olmoe_7b.py
index 52fac6f2b4..87e9f3232f 100644
--- a/src/megatron/bridge/recipes/olmoe/olmoe_7b.py
+++ b/src/megatron/bridge/recipes/olmoe/olmoe_7b.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import logging
 import os
 from typing import List, Optional, Union

@@ -22,14 +21,13 @@
 from megatron.bridge.models.olmoe import OlMoEModelProvider
 from megatron.bridge.peft.base import PEFT
-from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths
+from megatron.bridge.recipes.common import _pretrain_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.training.comm_overlap import CommOverlapConfig
 from megatron.bridge.training.config import (
     CheckpointConfig,
     ConfigContainer,
-    GPTDatasetConfig,
     LoggerConfig,
     RNGConfig,
     TokenizerConfig,
@@ -38,9 +36,6 @@
 from megatron.bridge.training.mixed_precision import MixedPrecisionConfig


-logger = logging.getLogger(__name__)
-
-
 class OLMoECommonKwargs(TypedDict, total=False):
     """Typed options accepted by OLMoE family recipe helpers."""

@@ -130,233 +125,155 @@ class OLMoEFinetuneKwargs(TypedDict, total=False):
     wandb_exp_name: Optional[str]


-def olmoe_7b_pretrain_config(**user_kwargs: Unpack[OLMoECommonKwargs]) -> ConfigContainer:
-    """Return a pre-training config for OLMoE-7B (7B total, ~1B active).
-
-    See `_olmoe_common` for the full list of parameters.
-    """
-    recommended_kwargs: OLMoECommonKwargs = {
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 1,
-        "pipeline_dtype": torch.bfloat16,
-        "virtual_pipeline_model_parallel_size": None,
-        "context_parallel_size": 1,
-        "expert_model_parallel_size": 8,
-        "sequence_parallel": False,
-        "recompute_granularity": "selective",
-        "apply_rope_fusion": False,
-        "train_iters": 500_000,
-        "global_batch_size": 2048,
-        "micro_batch_size": 1,
-        "seq_length": 4096,
-        "lr": 3e-4,
-        "min_lr": 3e-5,
-        "lr_warmup_iters": 2000,
-        "optimizer_type": "adam",
-        "eval_interval": 2000,
-        "save_interval": 2000,
-    }
-    combined_kwargs: OLMoECommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _olmoe_common(**combined_kwargs)
+def _get_olmoe_pipeline_layout(pp_size: int, vp_size: int):
+    """Get pipeline layout for OLMoE-7B based on PP and VP size."""
+    # OLMoE has 16 layers
+    map_pp_vp_to_layout = {
+        (1, 1): None,
+        (2, 1): [["embedding"] + ["decoder"] * 8, ["decoder"] * 8 + ["loss"]],
+        (4, 1): [["embedding"] + ["decoder"] * 4, ["decoder"] * 4, ["decoder"] * 4, ["decoder"] * 4 + ["loss"]],
+        (2, 2): [["embedding"] + ["decoder"] * 4, ["decoder"] * 4, ["decoder"] * 4, ["decoder"] * 4 + ["loss"]],
+    }
+    if (pp_size, vp_size) not in map_pp_vp_to_layout:
+        raise ValueError(
+            f"Invalid PP and VP size: {pp_size} and {vp_size} to infer PP layout "
+            f"for OLMoE (7B). Known PP and VP combinations: {map_pp_vp_to_layout.keys()}"
+        )
+    layout = map_pp_vp_to_layout[(pp_size, vp_size)]
+    if layout is not None:
+        layout = [list(x) for x in layout]
+    return layout


-def _olmoe_common(
-    dir: Optional[str] = None,
-    name: str = "default",
-    # Dataset configuration
-    data_paths: Optional[List[str]] = None,
-    data_args_path: Optional[str] = None,
-    train_data_path: Optional[List[str]] = None,
-    valid_data_path: Optional[List[str]] = None,
-    test_data_path: Optional[List[str]] = None,
-    per_split_data_args_path: Optional[str] = None,
-    mock: bool = False,
-    # Model configuration
-    tensor_model_parallel_size: int = 1,
-    pipeline_model_parallel_size: int = 1,
-    pipeline_dtype: Optional[torch.dtype] = torch.bfloat16,
-    virtual_pipeline_model_parallel_size: Optional[int] = None,
-    context_parallel_size: int = 1,
-    expert_model_parallel_size: int = 8,
-    sequence_parallel: bool = False,
-    # Recomputation
-    recompute_granularity: str = "selective",
-    recompute_modules: Optional[List[str]] = None,
-    recompute_method: Optional[str] = None,
-    recompute_num_layers: Optional[int] = None,
-    apply_rope_fusion: bool = False,
-    # Training hyperparameters
-    train_iters: int = 500_000,
-    global_batch_size: int = 2048,
-    micro_batch_size: int = 1,
-    seq_length: int = 4096,
-    lr: float = 3e-4,
-    min_lr: float = 3e-5,
-    lr_warmup_iters: int = 2000,
-    optimizer_type: str = "adam",
-    eval_interval: int = 2000,
-    save_interval: int = 2000,
-    # Precision recipe
-    precision_config: Optional[Union[MixedPrecisionConfig, str]] = None,
-    comm_overlap_config: Optional[CommOverlapConfig] = None,
-) -> ConfigContainer:
-    """
-    Create a pre-training configuration for OLMoE-7B model (7B total, ~1B active).
-
-    Args:
-        dir (Optional[str]): Base directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used.
-        data_args_path (Optional[str]): Path to file containing data arguments.
-        train_data_path (Optional[List[str]]): List of training data paths.
-        valid_data_path (Optional[List[str]]): List of validation data paths.
-        test_data_path (Optional[List[str]]): List of test data paths.
-        per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration.
-        mock (bool): Whether to use mock data. If True, ignores data_paths.
-        tensor_model_parallel_size (int): Degree of tensor model parallelism.
-        pipeline_model_parallel_size (int): Degree of pipeline model parallelism.
-        pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism.
-        virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism.
-        context_parallel_size (int): Degree of context parallelism.
-        expert_model_parallel_size (int): Degree of expert model parallelism.
-        sequence_parallel (bool): Whether to use sequence parallelism.
-        recompute_granularity (str): Recomputation granularity.
-        recompute_modules (Optional[List[str]]): Modules to recompute.
-        recompute_method (Optional[str]): Recomputation method.
-        recompute_num_layers (Optional[int]): Number of layers to recompute.
-        apply_rope_fusion (bool): Whether to apply RoPE fusion.
-        train_iters (int): Total number of training iterations.
-        global_batch_size (int): Global batch size for training.
-        micro_batch_size (int): Micro batch size for training.
-        seq_length (int): Sequence length for training data.
-        lr (float): Learning rate.
-        min_lr (float): Minimum learning rate for cosine decay.
-        lr_warmup_iters (int): Number of warmup iterations for the learning rate.
-        optimizer_type (str): Type of optimizer to use.
-        eval_interval (int): Interval for evaluation.
-        save_interval (int): Interval for saving checkpoints.
-        precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model.
-        comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration.
-
-    Returns:
-        ConfigContainer: Configuration for pre-training.
+def olmoe_7b_pretrain_config() -> ConfigContainer:
+    """Return a pre-training config for OLMoE-7B (7B total, ~1B active).
+
+    Recommended parallelism: TP=1, PP=1, EP=8.
+    Uses precision-aware optimizer with bf16 gradients/moments.
     """
-    base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, name)
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    blend, blend_per_split, split = get_blend_fields_from_data_paths(
-        data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock
-    )
+    cfg = _pretrain_common()
+
+    # Model config - uses OlMoEModelProvider
+    cfg.model = OlMoEModelProvider(
+        tensor_model_parallel_size=1,
+        pipeline_model_parallel_size=1,
+        pipeline_dtype=torch.bfloat16,
+        virtual_pipeline_model_parallel_size=None,
+        context_parallel_size=1,
+        expert_model_parallel_size=8,
+        sequence_parallel=False,
+        recompute_granularity="selective",
+        recompute_modules=None,
+        recompute_method=None,
+        recompute_num_layers=None,
+    )

-    model_cfg = _model_config(
-        tensor_model_parallel_size=tensor_model_parallel_size,
-        pipeline_model_parallel_size=pipeline_model_parallel_size,
-        pipeline_dtype=pipeline_dtype,
-        virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
-        context_parallel_size=context_parallel_size,
-        expert_model_parallel_size=expert_model_parallel_size,
-        sequence_parallel=sequence_parallel,
-        recompute_granularity=recompute_granularity,
-        recompute_modules=recompute_modules,
-        recompute_method=recompute_method,
-        recompute_num_layers=recompute_num_layers,
-        apply_rope_fusion=apply_rope_fusion,
-    )
-
-    if optimizer_type == "adam":
-        opt_config, scheduler = distributed_fused_adam_with_cosine_annealing(
-            lr_warmup_iters=lr_warmup_iters,
-            lr_decay_iters=train_iters,
-            adam_beta1=0.9,
-            adam_beta2=0.95,
-            adam_eps=1e-8,
-            weight_decay=0.1,
-            max_lr=lr,
-            min_lr=min_lr,
-        )
-
-        opt_config.use_precision_aware_optimizer = True
-        opt_config.main_params_dtype = torch.float32
-        opt_config.main_grads_dtype = torch.bfloat16
-        opt_config.exp_avg_dtype = torch.bfloat16
-        opt_config.exp_avg_sq_dtype = torch.bfloat16
-    else:
-        raise ValueError(f"Invalid optimizer type: {optimizer_type}")
+    # Pipeline layout
+    cfg.model.pipeline_model_parallel_layout = _get_olmoe_pipeline_layout(1, 1)

-    if precision_config is None:
-        precision_config = MixedPrecisionConfig(
-            bf16=True,
-            params_dtype=torch.bfloat16,
-            pipeline_dtype=torch.bfloat16,
-            autocast_enabled=False,
-            grad_reduce_in_fp32=False,
-        )
+    # Performance optimization knobs
+    cfg.model.moe_permute_fusion = True
+
+    # Tokenizer - uses NullTokenizer with model vocab_size
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = cfg.model.vocab_size
+
+    # Dataset config
+    cfg.dataset.blend = None  # Pass the dataset paths here if not using mock data, along with weights. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.seq_length = 4096
+    cfg.dataset.num_workers = 8
+
+    # MoE Token Dispatcher settings
+    cfg.model.moe_token_dispatcher_type = "alltoall"
+    cfg.model.moe_flex_dispatcher_backend = "deepep"
+    cfg.model.moe_hybridep_num_sms = 16
+
+    # Training config
+    cfg.train.train_iters = 500_000
+    cfg.train.global_batch_size = 2048
+    cfg.train.micro_batch_size = 1
+    cfg.train.eval_interval = 2000
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 5
+    cfg.train.manual_gc_eval = 5

-    cfg = ConfigContainer(
-        model=model_cfg,
-        train=TrainingConfig(
-            train_iters=train_iters,
-            eval_interval=eval_interval,
-            eval_iters=32,
-            global_batch_size=global_batch_size,
-            micro_batch_size=micro_batch_size,
-            manual_gc=True,
-            manual_gc_interval=5,
-            manual_gc_eval=5,
-        ),
-        optimizer=opt_config,
-        scheduler=scheduler,
-        ddp=DistributedDataParallelConfig(
-            check_for_nan_in_grad=True,
-            grad_reduce_in_fp32=False,
-            overlap_grad_reduce=True,
-            overlap_param_gather=True,
-            average_in_collective=True,
-            use_distributed_optimizer=True,
-        ),
-        dataset=GPTDatasetConfig(
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            seq_length=seq_length,
-            num_dataset_builder_threads=1,
-            blend=blend,
-            blend_per_split=blend_per_split,
-            split=split or "99990,8,2",
-            data_sharding=True,
-            dataloader_type="single",
-            num_workers=8,
-            skip_getting_attention_mask_from_dataset=True,
-        ),
-        logger=LoggerConfig(
-            log_interval=10,
-            tensorboard_dir=tensorboard_dir,
-            log_timers_to_tensorboard=True,
-        ),
-        tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=model_cfg.vocab_size),
-        checkpoint=CheckpointConfig(
-            save_interval=save_interval,
-            save=checkpoint_dir,
-            load=checkpoint_dir,
-            ckpt_format="torch_dist",
-            fully_parallel_save=True,
-            async_save=False,
-        ),
-        rng=RNGConfig(seed=1234),
-        comm_overlap=comm_overlap_config,
-        mixed_precision=precision_config,
-    )
-
-    if apply_rope_fusion:
-        cfg.dist.enable_megatron_core_experimental = True  # for rope fusion
-
-    if cfg.comm_overlap is None:
-        cfg.comm_overlap = CommOverlapConfig(
-            tp_comm_overlap=False,
-        )
+    # Optimizer
+    cfg.scheduler.lr_warmup_iters = 2000
+    cfg.scheduler.lr_decay_iters = cfg.train.train_iters
+    cfg.optimizer.adam_eps = 1e-8
+
+    # Precision-aware optimizer settings
+    cfg.optimizer.use_precision_aware_optimizer = True
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.main_grads_dtype = torch.bfloat16
+    cfg.optimizer.exp_avg_dtype = torch.bfloat16
+    cfg.optimizer.exp_avg_sq_dtype = torch.bfloat16
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections (includes MoE-specific kernels)
+    cfg.model.attention_backend = None
+    cfg.model.moe_router_fusion = False
+    cfg.model.moe_grouped_gemm = True
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # Memory saving (recompute & offloading) - already set in OlMoEModelProvider
+    # cfg.model.recompute_granularity = "selective"
+    # cfg.model.recompute_modules = None
+    cfg.model.apply_rope_fusion = False  # Set to True for RoPE fusion (requires experimental flag)
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # Mixed precision - OLMoE uses a custom MixedPrecisionConfig (NOT the "bf16_mixed" string)
+    cfg.mixed_precision = MixedPrecisionConfig(
+        bf16=True,
+        params_dtype=torch.bfloat16,
+        pipeline_dtype=torch.bfloat16,
+        autocast_enabled=False,
+        grad_reduce_in_fp32=False,
+    )
+    # FP8 settings (commented - enable if using FP8)
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"
+    # cfg.mixed_precision.fp8 = None
+    # cfg.mixed_precision.fp8_param_gather = False
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False
+    cfg.model.moe_router_padding_for_fp8 = False
+
+    # Communication overlap
+    cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)
+    # cfg.comm_overlap.delay_wgrad_compute = False
+    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    cfg.model.moe_shared_expert_overlap = False
+
+    # Checkpoint config
+    cfg.checkpoint.save_interval = 2000
+    cfg.checkpoint.async_save = False
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config
+    cfg.ddp.overlap_grad_reduce = True
+    cfg.ddp.overlap_param_gather = True
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.average_in_collective = True
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+
+    if cfg.model.apply_rope_fusion:
+        cfg.dist.enable_megatron_core_experimental = True
+
+    # MoE Force Load Balancing
+    cfg.model.moe_router_force_load_balancing = False

     return cfg
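The precision-aware optimizer above keeps master weights in fp32 while storing both Adam moments in bf16, cutting roughly a third of the optimizer-state footprint. A back-of-envelope sketch (illustrative only; it ignores distributed-optimizer sharding and the parameters/activations themselves):

    # Rough per-parameter optimizer-state arithmetic for the dtypes set above.
    params = 7e9  # OLMoE-7B total parameter count

    fp32_state = params * (4 + 4 + 4)  # fp32 main params + exp_avg + exp_avg_sq
    bf16_state = params * (4 + 2 + 2)  # fp32 main params, bf16 exp_avg / exp_avg_sq

    print(f"all-fp32 state: {fp32_state / 2**30:.0f} GiB")  # ~78 GiB
    print(f"bf16 moments:   {bf16_state / 2**30:.0f} GiB")  # ~52 GiB
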
@@ -417,25 +334,9 @@ def _model_config(
         cfg.apply_rope_fusion = True

     # Pipeline parallelism configs. We infer PP layout from the provided PP and VP size
-    # OLMoE has 16 layers
-    map_pp_vp_to_layout = {
-        (1, 1): None,
-        (2, 1): [["embedding"] + ["decoder"] * 8, ["decoder"] * 8 + ["loss"]],
-        (4, 1): [["embedding"] + ["decoder"] * 4, ["decoder"] * 4, ["decoder"] * 4, ["decoder"] * 4 + ["loss"]],
-        (2, 2): [["embedding"] + ["decoder"] * 4, ["decoder"] * 4, ["decoder"] * 4, ["decoder"] * 4 + ["loss"]],
-    }
     pp_size = pipeline_model_parallel_size or 1
     vp_size = virtual_pipeline_model_parallel_size or 1
-    if (pp_size, vp_size) not in map_pp_vp_to_layout:
-        raise ValueError(
-            f"Invalid PP and VP size: {pp_size} and {vp_size} to infer PP layout "
-            f"for OLMoE (7B). Known PP and VP combinations: {map_pp_vp_to_layout.keys()}"
-        )
-
-    layout = map_pp_vp_to_layout[(pp_size, vp_size)]
-
-    if layout is not None:
-        layout = list([list(x) for x in layout])  # yield all the elements
+    layout = _get_olmoe_pipeline_layout(pp_size, vp_size)
     cfg.pipeline_model_parallel_layout = layout

     return cfg
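For reference, the extracted helper returns one module list per pipeline stage; PP=2/VP=1 places the embedding with the first eight decoder layers and the loss with the last eight. A small illustrative check (values read directly off the mapping above):

    # Illustrative use of the helper extracted above.
    layout = _get_olmoe_pipeline_layout(pp_size=2, vp_size=1)
    assert layout == [
        ["embedding"] + ["decoder"] * 8,  # stage 0: embedding + layers 0-7
        ["decoder"] * 8 + ["loss"],       # stage 1: layers 8-15 + loss
    ]
    assert _get_olmoe_pipeline_layout(1, 1) is None  # PP=1 needs no explicit layout
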
diff --git a/src/megatron/bridge/recipes/qwen/qwen2.py b/src/megatron/bridge/recipes/qwen/qwen2.py
index 7ca12204b8..7deabc2687 100644
--- a/src/megatron/bridge/recipes/qwen/qwen2.py
+++ b/src/megatron/bridge/recipes/qwen/qwen2.py
@@ -21,7 +21,7 @@
 from megatron.bridge import AutoBridge
 from megatron.bridge.peft.base import PEFT
-from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths
+from megatron.bridge.recipes.common import _pretrain_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
@@ -29,7 +29,6 @@
 from megatron.bridge.training.config import (
     CheckpointConfig,
     ConfigContainer,
-    GPTDatasetConfig,
     LoggerConfig,
     RNGConfig,
     TokenizerConfig,
@@ -79,323 +78,852 @@ class Qwen2CommonKwargs(TypedDict, total=False):
     comm_overlap_config: Optional[CommOverlapConfig]


-def qwen2_500m_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer:
+# =============================================================================
+# Qwen2 Pretrain Configs
+# =============================================================================
+
+
+def qwen2_500m_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen2 0.5B.

-    See `_qwen2_common` for the full list of parameters.
+    Recommended parallelism: TP=1, PP=1 (fits on a single GPU).
     """
-    recommended_kwargs: Qwen2CommonKwargs = {
-        "hf_path": "Qwen/Qwen2-0.5B",
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 1,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen2_common(**combined_kwargs)
+    cfg = _pretrain_common()
+
+    # Model config - load the provider from the HuggingFace checkpoint
+    cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2-0.5B").to_megatron_provider(load_weights=False)
+
+    # Tokenizer
+    # Qwen2 uses NullTokenizer by default for pretraining
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths here if not using mock data, along with weights. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8
+
+    # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.)
+    cfg.model.tensor_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = False
+    cfg.model.seq_length = 4096
+    cfg.model.init_method_std = 0.02
+
+    # Training config
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = None
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # FP8 & MXFP8 (mixed_precision settings)
+    # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as the default.
+    # These are the FP8 defaults; enable them if using FP8 (FP8 is not enabled by default).
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"  # default
+    # cfg.mixed_precision.fp8 = None  # not enabled
+    # cfg.mixed_precision.fp8_param_gather = False  # default
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False  # default
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # Checkpoint config (paths set in _pretrain_common)
+    # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here. Ex:
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = False
+    cfg.ddp.overlap_param_gather = False
+    cfg.ddp.average_in_collective = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+    cfg.ddp.check_for_nan_in_grad = False
+    cfg.ddp.use_distributed_optimizer = True
+
+    return cfg
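The same override pattern applies to every Qwen recipe; note that raising PP above 1 also requires an explicit pipeline dtype (see the `# Required for PP > 1` comments below). A hypothetical adaptation of the 0.5B recipe:

    # Hypothetical override sketch: scale the 0.5B recipe to TP=2, PP=2.
    import torch

    from megatron.bridge.recipes.qwen.qwen2 import qwen2_500m_pretrain_config

    cfg = qwen2_500m_pretrain_config()
    cfg.model.tensor_model_parallel_size = 2
    cfg.model.pipeline_model_parallel_size = 2
    cfg.model.pipeline_dtype = torch.bfloat16  # required whenever PP > 1
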


-def qwen2_1p5b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer:
+def qwen2_1p5b_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen2 1.5B.

-    See `_qwen2_common` for the full list of parameters.
+    Recommended parallelism: TP=1, PP=1 (fits on a single GPU).
     """
-    recommended_kwargs: Qwen2CommonKwargs = {
-        "hf_path": "Qwen/Qwen2-1.5B",
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 1,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen2_common(**combined_kwargs)
+    cfg = _pretrain_common()
+
+    # Model config - load the provider from the HuggingFace checkpoint
+    cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2-1.5B").to_megatron_provider(load_weights=False)
+
+    # Tokenizer
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths here if not using mock data, along with weights. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8
+
+    # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.)
+    cfg.model.tensor_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = False
+    cfg.model.seq_length = 4096
+    cfg.model.init_method_std = 0.02
+
+    # Training config
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = None
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # FP8 & MXFP8 (mixed_precision settings)
+    # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as the default.
+    # These are the FP8 defaults; enable them if using FP8 (FP8 is not enabled by default).
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"  # default
+    # cfg.mixed_precision.fp8 = None  # not enabled
+    # cfg.mixed_precision.fp8_param_gather = False  # default
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False  # default
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # Checkpoint config (paths set in _pretrain_common)
+    # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here. Ex:
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = False
+    cfg.ddp.overlap_param_gather = False
+    cfg.ddp.average_in_collective = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+    cfg.ddp.check_for_nan_in_grad = False
+    cfg.ddp.use_distributed_optimizer = True
+
+    return cfg


-def qwen2_7b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer:
+def qwen2_7b_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen2 7B.

-    See `_qwen2_common` for the full list of parameters.
+    Recommended parallelism: TP=2, PP=1.
     """
-    recommended_kwargs: Qwen2CommonKwargs = {
-        "hf_path": "Qwen/Qwen2-7B",
-        "tensor_model_parallel_size": 2,
-        "pipeline_model_parallel_size": 1,
-        "use_megatron_fsdp": False,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen2_common(**combined_kwargs)
+    cfg = _pretrain_common()
+
+    cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2-7B").to_megatron_provider(load_weights=False)
+
+    # Tokenizer
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths here if not using mock data, along with weights. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8
+
+    # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.)
+    cfg.model.tensor_model_parallel_size = 2
+    cfg.model.pipeline_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = False
+    cfg.model.seq_length = 4096
+    cfg.model.init_method_std = 0.02
+
+    # Training config
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = None
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # FP8 & MXFP8 (mixed_precision settings)
+    # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as the default.
+    # These are the FP8 defaults; enable them if using FP8 (FP8 is not enabled by default).
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"  # default
+    # cfg.mixed_precision.fp8 = None  # not enabled
+    # cfg.mixed_precision.fp8_param_gather = False  # default
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False  # default
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # Checkpoint config (paths set in _pretrain_common)
+    # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here. Ex:
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = False
+    cfg.ddp.overlap_param_gather = False
+    cfg.ddp.average_in_collective = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+    cfg.ddp.check_for_nan_in_grad = False
+    cfg.ddp.use_distributed_optimizer = True
+
+    return cfg


-def qwen2_72b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer:
+def qwen2_72b_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen2 72B.

-    See `_qwen2_common` for the full list of parameters.
+    Recommended parallelism: TP=8, PP=4.
     """
-    recommended_kwargs: Qwen2CommonKwargs = {
-        "hf_path": "Qwen/Qwen2-72B",
-        "tensor_model_parallel_size": 8,
-        "pipeline_model_parallel_size": 4,
-        "pipeline_dtype": torch.bfloat16,
-        "use_megatron_fsdp": False,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen2_common(**combined_kwargs)
+    cfg = _pretrain_common()
+
+    # Model config
+    cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2-72B").to_megatron_provider(load_weights=False)
+
+    # Tokenizer
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths here if not using mock data, along with weights. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8
+
+    # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.)
+    cfg.model.tensor_model_parallel_size = 8
+    cfg.model.pipeline_model_parallel_size = 4
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = torch.bfloat16  # Required for PP > 1
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = False
+    cfg.model.seq_length = 4096
+    cfg.model.init_method_std = 0.02
+
+    # Training config
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = None
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # FP8 & MXFP8 (mixed_precision settings)
+    # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as the default.
+    # These are the FP8 defaults; enable them if using FP8 (FP8 is not enabled by default).
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"  # default
+    # cfg.mixed_precision.fp8 = None  # not enabled
+    # cfg.mixed_precision.fp8_param_gather = False  # default
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False  # default
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # Checkpoint config (paths set in _pretrain_common)
+    # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here. Ex:
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = False
+    cfg.ddp.overlap_param_gather = False
+    cfg.ddp.average_in_collective = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+    cfg.ddp.check_for_nan_in_grad = False
+    cfg.ddp.use_distributed_optimizer = True
+
+    return cfg
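As a quick sanity check on these parallelism recommendations: TP x PP (x CP) GPUs hold one model replica, and the remaining factor of the world size becomes data parallelism. An illustrative calculation for the 72B recipe above:

    # Illustrative world-size arithmetic for the TP=8, PP=4 recommendation.
    tp, pp, cp = 8, 4, 1
    model_parallel = tp * pp * cp      # 32 GPUs per model replica
    world_size = 64                    # e.g., 8 nodes x 8 GPUs
    assert world_size % model_parallel == 0
    dp = world_size // model_parallel  # -> 2 data-parallel replicas
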
+
+
+# =============================================================================
+# Qwen2.5 Pretrain Configs
+# =============================================================================


-def qwen25_500m_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer:
+def qwen25_500m_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen2.5 0.5B.

-    See `_qwen2_common` for the full list of parameters.
+    Recommended parallelism: TP=1, PP=1 (fits on a single GPU).
     """
-    recommended_kwargs: Qwen2CommonKwargs = {
-        "hf_path": "Qwen/Qwen2.5-0.5B",
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 1,
-        "check_for_nan_in_grad": True,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen2_common(**combined_kwargs)
+    cfg = _pretrain_common()
+
+    # Model config - load the provider from the HuggingFace checkpoint
+    cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2.5-0.5B").to_megatron_provider(load_weights=False)
+
+    # Tokenizer
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths here if not using mock data, along with weights. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8
+
+    # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.)
+    cfg.model.tensor_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = False
+    cfg.model.seq_length = 4096
+    cfg.model.init_method_std = 0.02
+
+    # Training config
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = None
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # FP8 & MXFP8 (mixed_precision settings)
+    # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as the default.
+    # These are the FP8 defaults; enable them if using FP8 (FP8 is not enabled by default).
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"  # default
+    # cfg.mixed_precision.fp8 = None  # not enabled
+    # cfg.mixed_precision.fp8_param_gather = False  # default
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False  # default
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # DDP config
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = False
+    cfg.ddp.overlap_param_gather = False
+    cfg.ddp.average_in_collective = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+
+    return cfg


-def qwen25_1p5b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer:
+def qwen25_1p5b_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen2.5 1.5B.

-    See `_qwen2_common` for the full list of parameters.
+    Recommended parallelism: TP=1, PP=1 (fits on a single GPU).
     """
-    recommended_kwargs: Qwen2CommonKwargs = {
-        "hf_path": "Qwen/Qwen2.5-1.5B",
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 1,
-        "check_for_nan_in_grad": True,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen2_common(**combined_kwargs)
+    cfg = _pretrain_common()
+
+    # Model config
+    cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2.5-1.5B").to_megatron_provider(load_weights=False)
+
+    # Tokenizer
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths here if not using mock data, along with weights. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8
+
+    # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.)
+    cfg.model.tensor_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = False
+    cfg.model.seq_length = 4096
+    cfg.model.init_method_std = 0.02
+
+    # Training config
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = None
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # FP8 & MXFP8 (mixed_precision settings)
+    # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as the default.
+    # These are the FP8 defaults; enable them if using FP8 (FP8 is not enabled by default).
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"  # default
+    # cfg.mixed_precision.fp8 = None  # not enabled
+    # cfg.mixed_precision.fp8_param_gather = False  # default
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False  # default
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # DDP config
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = False
+    cfg.ddp.overlap_param_gather = False
+    cfg.ddp.average_in_collective = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+
+    return cfg


-def qwen25_7b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer:
+def qwen25_7b_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen2.5 7B.

-    See `_qwen2_common` for the full list of parameters.
+    Recommended parallelism: TP=2, PP=1.
     """
-    recommended_kwargs: Qwen2CommonKwargs = {
-        "hf_path": "Qwen/Qwen2.5-7B",
-        "tensor_model_parallel_size": 2,
-        "pipeline_model_parallel_size": 1,
-        "check_for_nan_in_grad": True,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen2_common(**combined_kwargs)
+    cfg = _pretrain_common()
+
+    # Model config
+    cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2.5-7B").to_megatron_provider(load_weights=False)
+
+    # Tokenizer
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # Pass the dataset paths here if not using mock data, along with weights. Ex: (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8
+
+    # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.)
+    cfg.model.tensor_model_parallel_size = 2
+    cfg.model.pipeline_model_parallel_size = 1
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.pipeline_dtype = None
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = False
+    cfg.model.seq_length = 4096
+    cfg.model.init_method_std = 0.02
+
+    # Training config
+    cfg.train.manual_gc = True
+    cfg.train.manual_gc_interval = 100
+
+    # TE (Transformer Engine)
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = None
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # Memory saving (recompute & offloading)
+    cfg.model.recompute_granularity = None
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # FP8 & MXFP8 (mixed_precision settings)
+    # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as the default.
+    # These are the FP8 defaults; enable them if using FP8 (FP8 is not enabled by default).
+    # cfg.mixed_precision.fp8_recipe = "tensorwise"  # default
+    # cfg.mixed_precision.fp8 = None  # not enabled
+    # cfg.mixed_precision.fp8_param_gather = False  # default
+    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False  # default
+
+    # Optimizer precision settings
+    cfg.optimizer.use_precision_aware_optimizer = False
+    cfg.optimizer.main_grads_dtype = torch.float32
+    cfg.optimizer.main_params_dtype = torch.float32
+    cfg.optimizer.exp_avg_dtype = torch.float32
+    cfg.optimizer.exp_avg_sq_dtype = torch.float32
+
+    # DDP config
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = False
+    cfg.ddp.overlap_param_gather = False
+    cfg.ddp.average_in_collective = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+
+    return cfg
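The recipes pin NullTokenizer for pretraining; the removed `use_null_tokenizer=False` branch of `_qwen2_common` can be reproduced by overriding the tokenizer fields after the fact. A sketch mirroring that old branch:

    # Sketch: swap NullTokenizer for the real HF tokenizer (old use_null_tokenizer=False path).
    from megatron.bridge.recipes.qwen.qwen2 import qwen25_7b_pretrain_config

    cfg = qwen25_7b_pretrain_config()
    cfg.tokenizer.tokenizer_type = "HuggingFaceTokenizer"
    cfg.tokenizer.tokenizer_model = "Qwen/Qwen2.5-7B"  # hf_path in the old helper
    cfg.tokenizer.vocab_size = None                    # vocab comes from the HF tokenizer
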
+ cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config (paths set in _pretrain_common) + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.average_in_collective = False + cfg.ddp.grad_reduce_in_fp32 = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + return cfg + + +def qwen25_32b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Qwen2.5 32B. - See `_qwen2_common` for the full list of parameters. + Recommended parallelism: TP=8, PP=2. """ - recommended_kwargs: Qwen2CommonKwargs = { - "hf_path": "Qwen/Qwen2.5-32B", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 2, - "pipeline_dtype": torch.bfloat16, - "check_for_nan_in_grad": True, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen2_common(**combined_kwargs) - - -def qwen25_72b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: - """Return a pre-training config for Qwen2.5 72B. + cfg = _pretrain_common() - See `_qwen2_common` for the full list of parameters. - """ - recommended_kwargs: Qwen2CommonKwargs = { - "hf_path": "Qwen/Qwen2.5-72B", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 4, - "pipeline_dtype": torch.bfloat16, - "check_for_nan_in_grad": True, - } - # Combine defaults with user kwargs; user values take precedence. 
- combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen2_common(**combined_kwargs) - - -def _qwen2_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = None, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - check_for_nan_in_grad: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - use_null_tokenizer: bool = True, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2/Qwen2.5 models using a given HuggingFace path. - - Args: - hf_path (str): HuggingFace model path (e.g., "Qwen/Qwen2-1.5B", "Qwen/Qwen2.5-7B"). - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism to be passed to model_config. - sequence_parallel (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - check_for_nan_in_grad (bool): Whether to check for NaN in gradients. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. 
- comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. - - Returns: - ConfigContainer: Configuration for pre-training. - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + # Model config + cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2.5-32B").to_megatron_provider(load_weights=False) - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) + # Tokenizer + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.) + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 # Required for PP > 1 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.seq_length = 
seq_length + # DDP config + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.average_in_collective = False + cfg.ddp.grad_reduce_in_fp32 = False + cfg.ddp.data_parallel_sharding_strategy = "no_shard" + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) + return cfg - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=check_for_nan_in_grad, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) + +def qwen25_72b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for Qwen2.5 72B. + + Recommended parallelism: TP=8, PP=4. + """ + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen2.5-72B").to_megatron_provider(load_weights=False) + + # Tokenizer + cfg.tokenizer.tokenizer_type = "NullTokenizer" + cfg.tokenizer.tokenizer_model = None + cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.) 
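+    # These assignments replace the keyword arguments the removed _qwen2_common() helper took.
+    # The values below are recommendations, not hard requirements; for example, a smaller
+    # cluster could halve the tensor parallelism on the returned config:
+    #   cfg = qwen25_72b_pretrain_config()
+    #   cfg.model.tensor_model_parallel_size = 4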
+ cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 # Required for PP > 1 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config (paths set in _pretrain_common) + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. 
To override them, set them here, e.g.:
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config (Qwen2.5 uses simpler DDP settings than _pretrain_common)
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = False
+    cfg.ddp.overlap_param_gather = False
+    cfg.ddp.average_in_collective = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+    cfg.ddp.data_parallel_sharding_strategy = "no_shard"
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
 
     return cfg
diff --git a/src/megatron/bridge/recipes/qwen/qwen3.py b/src/megatron/bridge/recipes/qwen/qwen3.py
index b12e4a2a42..829922cdaf 100644
--- a/src/megatron/bridge/recipes/qwen/qwen3.py
+++ b/src/megatron/bridge/recipes/qwen/qwen3.py
@@ -20,15 +20,13 @@
 from megatron.bridge import AutoBridge
 from megatron.bridge.peft.base import PEFT
-from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths
+from megatron.bridge.recipes.common import _pretrain_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
-from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
 from megatron.bridge.training.comm_overlap import CommOverlapConfig
 from megatron.bridge.training.config import (
     CheckpointConfig,
     ConfigContainer,
-    GPTDatasetConfig,
     LoggerConfig,
     RNGConfig,
     TokenizerConfig,
@@ -94,269 +92,482 @@ class Qwen3FinetuneKwargs(Qwen3CommonKwargs, total=False):
     wandb_exp_name: str | None
 
 
-def qwen3_600m_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer:
+def qwen3_600m_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen3 0.6B.
 
-    See `_qwen3_common` for the full list of parameters.
+    Recommended parallelism: TP=1, PP=1 (fits on a single GPU).
     """
-    recommended_kwargs: Qwen3CommonKwargs = {
-        "hf_path": "Qwen/Qwen3-0.6B",
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 1,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen3_common(**combined_kwargs)
-
-
-def qwen3_1p7b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer:
+    cfg = _pretrain_common()
+
+    # Model config
+    cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen3-0.6B").to_megatron_provider(load_weights=False)
+
+    # Tokenizer (--tokenizer-model)
+    cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-0.6B"
+
+    # Dataset config - mock data by default
+    cfg.dataset.blend = None  # To train on real data instead of mock, set a (paths, weights) pair, e.g. (["path/to/data1", "path/to/data2"], [0.2, 0.8])
+    cfg.dataset.num_workers = 8
+
+    # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.)
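+    # With TP=1 and PP=1 below, the only parallelism is data parallelism across ranks.
+    # The training schedule comes from _pretrain_common; scale it on the returned config
+    # if needed, for example:
+    #   cfg.train.global_batch_size = 64
+    #   cfg.train.train_iters = 100_000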
+ cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + return cfg + + +def qwen3_1p7b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Qwen3 1.7B. - See `_qwen3_common` for the full list of parameters. + Recommended parallelism: TP=1, PP=1 (fits on a single GPU). """ - recommended_kwargs: Qwen3CommonKwargs = { - "hf_path": "Qwen/Qwen3-1.7B", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_common(**combined_kwargs) - - -def qwen3_4b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen3-1.7B").to_megatron_provider(load_weights=False) + + # Tokenizer + cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-1.7B" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.) 
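+    # PP stays at 1 below, so pipeline_dtype can remain None; as the larger recipes in this
+    # file note, cfg.model.pipeline_dtype = torch.bfloat16 is required if
+    # pipeline_model_parallel_size is raised above 1.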
+ cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + return cfg + + +def qwen3_4b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Qwen3 4B. - See `_qwen3_common` for the full list of parameters. + Recommended parallelism: TP=2, PP=1. """ - recommended_kwargs: Qwen3CommonKwargs = { - "hf_path": "Qwen/Qwen3-4B", - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 1, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_common(**combined_kwargs) - - -def qwen3_8b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen3-4B").to_megatron_provider(load_weights=False) + + # Tokenizer + cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-4B" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.) 
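+    # sequence_parallel is left False, matching the old _qwen3_common default. With TP > 1 it
+    # could be enabled (cfg.model.sequence_parallel = True) to shard layernorm and dropout
+    # activations across the tensor-parallel group.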
+ cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + return cfg + + +def qwen3_8b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Qwen3 8B. - See `_qwen3_common` for the full list of parameters. + Recommended parallelism: TP=4, PP=1. """ - recommended_kwargs: Qwen3CommonKwargs = { - "hf_path": "Qwen/Qwen3-8B", - "tensor_model_parallel_size": 4, - "pipeline_model_parallel_size": 1, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_common(**combined_kwargs) - - -def qwen3_14b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen3-8B").to_megatron_provider(load_weights=False) + + # Tokenizer + cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-8B" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.) 
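+    # TP=4 assumes at least 4 GPUs. If adapting this recipe to fewer GPUs, keep the model's
+    # attention-head count divisible by the new value, e.g. cfg.model.tensor_model_parallel_size = 2.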
+ cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + return cfg + + +def qwen3_14b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Qwen3 14B. - See `_qwen3_common` for the full list of parameters. + Recommended parallelism: TP=8, PP=1. """ - recommended_kwargs: Qwen3CommonKwargs = { - "hf_path": "Qwen/Qwen3-14B", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 1, - } - # Combine defaults with user kwargs; user values take precedence. - combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_common(**combined_kwargs) - - -def qwen3_32b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen3-14B").to_megatron_provider(load_weights=False) + + # Tokenizer + cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-14B" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Model config (tensor_model_parallel_size, pipeline_model_parallel_size, etc.) 
+ cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config (paths set in _pretrain_common) + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.use_megatron_fsdp = False + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + + return cfg + + +def qwen3_32b_pretrain_config() -> ConfigContainer: """Return a pre-training config for Qwen3 32B. - See `_qwen3_common` for the full list of parameters. + Recommended parallelism: TP=8, PP=2 with recompute enabled for memory optimization. """ - recommended_kwargs: Qwen3CommonKwargs = { - "hf_path": "Qwen/Qwen3-32B", - "tensor_model_parallel_size": 8, - "pipeline_model_parallel_size": 2, - "pipeline_dtype": torch.bfloat16, - "enable_recompute": True, - } - # Combine defaults with user kwargs; user values take precedence. 
- combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_common(**combined_kwargs) - - -def _qwen3_common( - hf_path: str, - dir: str | None = None, - name: str = "default", - # Dataset configuration - data_paths: list[str] | None = None, - data_args_path: str | None = None, - train_data_path: list[str] | None = None, - valid_data_path: list[str] | None = None, - test_data_path: list[str] | None = None, - per_split_data_args_path: str | None = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: torch.dtype | None = None, - virtual_pipeline_model_parallel_size: int | None = None, - context_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - use_null_tokenizer: bool = False, - enable_recompute: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: int | None = None, - eval_interval: int = 500, - save_interval: int = 500, - # Precision recipe - precision_config: MixedPrecisionConfig | str | None = "bf16_mixed", - comm_overlap_config: CommOverlapConfig | None = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 models using a given HuggingFace path. - - Args: - hf_path (str): HuggingFace model path (e.g., "Qwen/Qwen3-1.7B"). - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism to be passed to model_config. - sequence_parallel (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - use_null_tokenizer (bool): Whether to use NullTokenizer instead of HuggingFaceTokenizer. - enable_recompute (bool): Whether to enable recompute for memory optimization. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. 
- comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. - - Returns: - ConfigContainer: Configuration for pre-training. - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.seq_length = seq_length - - # Add recompute settings for memory optimization (used by larger models like 32B) - if enable_recompute: - model_cfg.recompute_granularity = "full" - model_cfg.recompute_method = "uniform" - model_cfg.recompute_num_layers = 1 - - model_cfg.cross_entropy_fusion_impl = "te" - - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg_container = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_cfg, - scheduler=scheduler_cfg, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for custom FSDP for now, need to be set to False if using FSDP - data_parallel_sharding_strategy="optim_grads_params", # For custom FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg_container + cfg = _pretrain_common() + + cfg.model = 
AutoBridge.from_hf_pretrained("Qwen/Qwen3-32B").to_megatron_provider(load_weights=False) + + cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-32B" + + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + cfg.model.tensor_model_parallel_size = 8 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 # Required for PP > 1 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + cfg.model.transformer_impl = "transformer_engine" + + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + cfg.model.attention_backend = None + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) - ENABLED for 32B + cfg.model.recompute_granularity = "full" + cfg.model.recompute_method = "uniform" + cfg.model.recompute_num_layers = 1 + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Checkpoint config + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. 
To override them, set them here, e.g.:
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    cfg.ddp.use_megatron_fsdp = False
+    cfg.ddp.overlap_grad_reduce = True
+    cfg.ddp.overlap_param_gather = True
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+
+    return cfg
 
 
 def qwen3_600m_finetune_config(**user_kwargs: Unpack[Qwen3FinetuneKwargs]) -> ConfigContainer:
diff --git a/src/megatron/bridge/recipes/qwen/qwen3_moe.py b/src/megatron/bridge/recipes/qwen/qwen3_moe.py
index a2ae6b6350..3c8af8802d 100644
--- a/src/megatron/bridge/recipes/qwen/qwen3_moe.py
+++ b/src/megatron/bridge/recipes/qwen/qwen3_moe.py
@@ -21,15 +21,13 @@
 from megatron.bridge import AutoBridge
 from megatron.bridge.peft.base import PEFT
-from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths
+from megatron.bridge.recipes.common import _pretrain_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
-from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
 from megatron.bridge.training.comm_overlap import CommOverlapConfig
 from megatron.bridge.training.config import (
     CheckpointConfig,
     ConfigContainer,
-    GPTDatasetConfig,
     LoggerConfig,
     RNGConfig,
     TokenizerConfig,
@@ -124,236 +122,214 @@ class Qwen3MoeFinetuneKwargs(TypedDict, total=False):
     precision_config: Optional[Union[MixedPrecisionConfig, str]]
 
 
-def qwen3_30b_a3b_pretrain_config(**user_kwargs: Unpack[Qwen3MoeCommonKwargs]) -> ConfigContainer:
+def qwen3_30b_a3b_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen3-30B-A3B MoE.
 
-    See `_qwen3_moe_common` for the full list of parameters.
+    Recommended parallelism: TP=4, PP=2, EP=4.
     """
-    recommended_kwargs: Qwen3MoeCommonKwargs = {
-        "hf_path": "Qwen/Qwen3-30B-A3B",
-        "tensor_model_parallel_size": 4,
-        "pipeline_model_parallel_size": 2,
-        "pipeline_dtype": torch.bfloat16,
-        "expert_model_parallel_size": 4,
-        "sequence_parallel": True,
-        "enable_recompute": True,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
-    combined_kwargs: Qwen3MoeCommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen3_moe_common(**combined_kwargs)
-
-
-def qwen3_235b_a22b_pretrain_config(**user_kwargs: Unpack[Qwen3MoeCommonKwargs]) -> ConfigContainer:
-    """Return a pre-training config for Qwen3-235B-A22B MoE.
-
-    See `_qwen3_moe_common` for the full list of parameters.
-    """
-    recommended_kwargs: Qwen3MoeCommonKwargs = {
-        "hf_path": "Qwen/Qwen3-235B-A22B",
-        "tensor_model_parallel_size": 4,
-        "pipeline_model_parallel_size": 16,
-        "pipeline_dtype": torch.bfloat16,
-        "context_parallel_size": 2,
-        "expert_model_parallel_size": 8,
-        "sequence_parallel": True,
-        "micro_batch_size": 1,
-        "account_for_embedding_in_pipeline_split": True,
-        "account_for_loss_in_pipeline_split": True,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
- combined_kwargs: Qwen3MoeCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_moe_common(**combined_kwargs) - - -def _qwen3_moe_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_model_parallel_size: int = 4, - pipeline_model_parallel_size: int = 2, - pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - expert_model_parallel_size: Optional[int] = 4, - expert_tensor_parallel_size: int = 1, - sequence_parallel: bool = True, - use_megatron_fsdp: bool = False, - enable_recompute: bool = False, - account_for_embedding_in_pipeline_split: bool = False, - account_for_loss_in_pipeline_split: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - use_null_tokenizer: bool = False, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, - moe_flex_dispatcher_backend: Optional[str] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 MoE models using a given HuggingFace path. + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen3-30B-A3B").to_megatron_provider(load_weights=False) + + # Tokenizer (--tokenizer-model) + cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-30B-A3B" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Parallelism settings (MoE-specific: includes expert_model_parallel_size) + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.expert_model_parallel_size = 4 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.sequence_parallel = True + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = None + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) - ENABLED for 30B MoE + cfg.model.recompute_granularity = "full" + cfg.model.recompute_method = "uniform" + cfg.model.recompute_num_layers = 1 + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + cfg.model.moe_router_padding_for_fp8 = False + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap (default None, can pass CommOverlapConfig for advanced overlap) + # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) # Uncomment to enable + # cfg.comm_overlap.delay_wgrad_compute = False # Delay wgrad compute for overlap + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False # MoE-specific: Overlap EP communication + cfg.model.moe_shared_expert_overlap = False # Overlap shared expert computation + + # Checkpoint config (paths set in _pretrain_common) + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False - Args: - hf_path (str): HuggingFace model path (e.g., "Qwen/Qwen3-30B-A3B", "Qwen/Qwen3-235B-A22B"). 
- dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (Optional[int]): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism to be passed to model_config. - expert_model_parallel_size (Optional[int]): Degree of expert parallelism for MoE. - expert_tensor_parallel_size (int): Expert tensor parallelism for MoE. - sequence_parallel (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - enable_recompute (bool): Whether to enable recompute for memory optimization. - account_for_embedding_in_pipeline_split (bool): Whether to account for embedding in pipeline split. - account_for_loss_in_pipeline_split (bool): Whether to account for loss in pipeline split. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. - moe_flex_dispatcher_backend (str | None): Token dispatcher type [deepep, hybridep]. - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.expert_model_parallel_size = expert_model_parallel_size - model_cfg.expert_tensor_parallel_size = expert_tensor_parallel_size - model_cfg.sequence_parallel = sequence_parallel - - apply_flex_dispatcher_backend(model_cfg, moe_flex_dispatcher_backend) - - if precision_config is None: - precision_config = bf16_mixed() + return cfg - # MoE-specific pipeline split configurations - if account_for_embedding_in_pipeline_split: - model_cfg.account_for_embedding_in_pipeline_split = True - if account_for_loss_in_pipeline_split: - model_cfg.account_for_loss_in_pipeline_split = True - - # Add recompute settings for memory optimization (used by some MoE models) - if enable_recompute: - model_cfg.recompute_granularity = "full" - model_cfg.recompute_method = "uniform" - model_cfg.recompute_num_layers = 1 - model_cfg.seq_length = seq_length - model_cfg.cross_entropy_fusion_impl = "te" - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) +def qwen3_235b_a22b_pretrain_config() -> ConfigContainer: + """Return a pre-training config for Qwen3-235B-A22B MoE. 
- # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for Megatron FSDP for now, need to be set to False if using Megatron FSDP - data_parallel_sharding_strategy="optim_grads_params", # For Megatron FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) + Recommended parallelism: TP=4, PP=16, CP=2, EP=8. + Note: Uses account_for_embedding_in_pipeline_split and account_for_loss_in_pipeline_split + for proper layer distribution in pipeline parallelism. + """ + cfg = _pretrain_common() + + # Model config + cfg.model = AutoBridge.from_hf_pretrained("Qwen/Qwen3-235B-A22B").to_megatron_provider(load_weights=False) + + # Tokenizer + cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-235B-A22B" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + + # Parallelism settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 16 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 2 + cfg.model.expert_model_parallel_size = 8 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.sequence_parallel = True + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Pipeline split accounting + cfg.model.account_for_embedding_in_pipeline_split = True + cfg.model.account_for_loss_in_pipeline_split = True + + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_flex_dispatcher_backend = "deepep" + cfg.model.moe_hybridep_num_sms = 16 + + # Training config + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections (includes MoE-specific kernels) + cfg.model.attention_backend = None + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "te" + + # Memory saving (recompute & offloading) + # Enable if needed for memory optimization + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + cfg.model.moe_router_padding_for_fp8 = False # MoE FP8 setting + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap (default None, can pass CommOverlapConfig for advanced overlap) + # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) # Uncomment to enable + # cfg.comm_overlap.delay_wgrad_compute = False + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False + cfg.model.moe_shared_expert_overlap = False # Overlap shared expert computation + + # Checkpoint config + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. 
To override them, set them here, e.g.:
+    # cfg.checkpoint.save = "path/to/save"
+    # cfg.checkpoint.load = "path/to/load"
+
+    # DDP config
+    cfg.ddp.overlap_grad_reduce = True
+    cfg.ddp.overlap_param_gather = True
+    cfg.ddp.check_for_nan_in_grad = True
+    cfg.ddp.use_distributed_optimizer = True
+    cfg.ddp.use_megatron_fsdp = False
+
+    # MoE Force Load Balancing
+    cfg.model.moe_router_force_load_balancing = False
 
     return cfg
diff --git a/src/megatron/bridge/recipes/qwen/qwen3_next.py b/src/megatron/bridge/recipes/qwen/qwen3_next.py
index 7dbc5583e4..f21f2f5bcc 100644
--- a/src/megatron/bridge/recipes/qwen/qwen3_next.py
+++ b/src/megatron/bridge/recipes/qwen/qwen3_next.py
@@ -20,24 +20,22 @@
 from megatron.bridge import AutoBridge
 from megatron.bridge.peft.base import PEFT
-from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths
+from megatron.bridge.recipes.common import _pretrain_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config, default_squad_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
-from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
 from megatron.bridge.training.comm_overlap import CommOverlapConfig
 from megatron.bridge.training.config import (
     CheckpointConfig,
     ConfigContainer,
     DistributedInitConfig,
     FinetuningDatasetConfig,
-    GPTDatasetConfig,
     LoggerConfig,
     RNGConfig,
     TokenizerConfig,
     TrainingConfig,
 )
 from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed
+from megatron.bridge.training.mixed_precision import MixedPrecisionConfig
 
 
 class Qwen3NextCommonKwargs(TypedDict, total=False):
@@ -111,240 +109,118 @@ class Qwen3NextFinetuneKwargs(Qwen3NextCommonKwargs, total=False):
     wandb_exp_name: str | None
 
 
-def qwen3_next_80b_a3b_pretrain_config(**user_kwargs: Unpack[Qwen3NextCommonKwargs]) -> ConfigContainer:
+def qwen3_next_80b_a3b_pretrain_config() -> ConfigContainer:
     """Return a pre-training config for Qwen3-Next 80B-A3B.
 
-    See `_qwen3_next_common` for the full list of parameters.
+    Recommended parallelism: TP=1, PP=4, EP=8.
+    Note: Qwen3-Next supports Multi-Token Prediction (MTP) with mtp_num_layers and mtp_loss_scaling_factor.
     """
-    recommended_kwargs: Qwen3NextCommonKwargs = {
-        "hf_path": "Qwen/Qwen3-Next-80B-A3B-Instruct",
-        "tensor_model_parallel_size": 1,
-        "pipeline_model_parallel_size": 4,
-        "pipeline_dtype": torch.bfloat16,
-        "context_parallel_size": 1,
-        "expert_model_parallel_size": 8,
-        "sequence_parallel": False,
-        "enable_recompute": True,
-    }
-    # Combine defaults with user kwargs; user values take precedence.
- combined_kwargs: Qwen3NextCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_next_common(**combined_kwargs) - - -def _qwen3_next_common( - hf_path: str, - dir: str | None = None, - name: str = "default", - # Dataset configuration - data_paths: list[str] | None = None, - data_args_path: str | None = None, - train_data_path: list[str] | None = None, - valid_data_path: list[str] | None = None, - test_data_path: list[str] | None = None, - per_split_data_args_path: str | None = None, - mock: bool = False, - path_to_cache: str | None = None, - # Model configuration - tensor_model_parallel_size: int = 4, - pipeline_model_parallel_size: int = 2, - pipeline_dtype: torch.dtype | None = torch.bfloat16, - virtual_pipeline_model_parallel_size: int | None = None, - context_parallel_size: int = 1, - expert_model_parallel_size: int | None = 4, - expert_tensor_parallel_size: int = 1, - sequence_parallel: bool = True, - use_megatron_fsdp: bool = False, - enable_recompute: bool = False, - account_for_embedding_in_pipeline_split: bool = False, - account_for_loss_in_pipeline_split: bool = False, - # MTP support - mtp_num_layers: int | None = 1, - mtp_loss_scaling_factor: float | None = 0.1, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: int | None = None, - eval_interval: int = 500, - save_interval: int = 500, - use_null_tokenizer: bool = False, - # Precision recipe - precision_config: MixedPrecisionConfig | str | None = None, - comm_overlap_config: CommOverlapConfig | None = None, - moe_flex_dispatcher_backend: str | None = None, - disable_jit_fuser: bool | None = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3-Next models using a given HuggingFace path. - - Args: - hf_path (str): HuggingFace model path (e.g., "Qwen/Qwen3-Next-80B-A3B-Instruct"). - dir (str | None): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (list[str] | None): List of paths to dataset files. If None, mock data will be used. - data_args_path (str | None): Path to file containing data arguments. - train_data_path (list[str] | None): List of training data paths. - valid_data_path (list[str] | None): List of validation data paths. - test_data_path (list[str] | None): List of test data paths. - per_split_data_args_path (str | None): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_model_parallel_size (int): Degree of tensor model parallelism. - pipeline_model_parallel_size (int): Degree of pipeline model parallelism. - pipeline_dtype (torch.dtype | None): Data type for pipeline parallelism. - virtual_pipeline_model_parallel_size (int | None): Size of virtual pipeline parallelism. - context_parallel_size (int): Degree of context parallelism to be passed to model_config. - expert_model_parallel_size (int | None): Degree of expert parallelism for MoE. - expert_tensor_parallel_size (int): Expert tensor parallelism for MoE. - sequence_parallel (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - enable_recompute (bool): Whether to enable recompute for memory optimization. - account_for_embedding_in_pipeline_split (bool): Whether to account for embedding in pipeline split. 
- account_for_loss_in_pipeline_split (bool): Whether to account for loss in pipeline split. - mtp_num_layers (int | None): Number of layers for MTP. - mtp_loss_scaling_factor (float | None): Loss scaling factor for MTP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (int | None): Number of iterations over which to decay the LR. - precision_config (MixedPrecisionConfig | str | None): Precision configuration for the model. - comm_overlap_config (CommOverlapConfig | None): Communication overlap configuration. - moe_flex_dispatcher_backend (str | None): Token dispatcher type [deepep, hybridep]. - disable_jit_fuser (bool): Whether to disable the JIT fuser. Necessary for Qwen3-Next to work on Blackwell. - - Returns: - ConfigContainer: Configuration for pre-training. - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.sequence_parallel = sequence_parallel - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.expert_model_parallel_size = expert_model_parallel_size - model_cfg.expert_tensor_parallel_size = expert_tensor_parallel_size - - model_cfg.mtp_num_layers = 0 if mtp_num_layers is None else mtp_num_layers - model_cfg.mtp_loss_scaling_factor = mtp_loss_scaling_factor - - # Performance optimization knobs - model_cfg.moe_permute_fusion = True - model_cfg.moe_grouped_gemm = True - apply_flex_dispatcher_backend(model_cfg, moe_flex_dispatcher_backend) + cfg = _pretrain_common() - if precision_config is None: - precision_config = bf16_mixed() - if isinstance(precision_config, MixedPrecisionConfig): - precision_config.grad_reduce_in_fp32 = False - - # MoE-specific pipeline split configurations - if account_for_embedding_in_pipeline_split: - model_cfg.account_for_embedding_in_pipeline_split = True - if account_for_loss_in_pipeline_split: - model_cfg.account_for_loss_in_pipeline_split = True - - # Add recompute settings for memory optimization (used by some MoE models) - if enable_recompute: - model_cfg.recompute_granularity = "selective" - model_cfg.recompute_modules = ["layernorm", "moe", "moe_act"] - model_cfg.recompute_method = None - model_cfg.recompute_num_layers = None - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, + # Model config + cfg.model = 
AutoBridge.from_hf_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct").to_megatron_provider( + load_weights=False ) - scheduler.no_weight_decay_cond_type = "qwen3_next" - - # If user does not specify, check if we are on Blackwell. - if disable_jit_fuser is None: - disable_jit_fuser = torch.cuda.get_device_properties(0).major == 10 - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=eval_interval, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - dist=DistributedInitConfig(disable_jit_fuser=disable_jit_fuser), - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for Megatron FSDP for now, need to be set to False if using Megatron FSDP - data_parallel_sharding_strategy="optim_grads_params", # For Megatron FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - path_to_cache=path_to_cache, - mmap_bin_files=False, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + # Tokenizer + cfg.tokenizer.tokenizer_model = "Qwen/Qwen3-Next-80B-A3B-Instruct" + + # Dataset config - mock data by default + cfg.dataset.blend = None # Pass the path to the dataset here if not using mock data, along with weight. 
Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] + cfg.dataset.num_workers = 8 + cfg.dataset.mmap_bin_files = False # Qwen3-Next specific setting + + # Parallelism settings (MoE-specific: includes expert_model_parallel_size) + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_layout = None + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.expert_model_parallel_size = 8 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.sequence_parallel = False + cfg.model.seq_length = 4096 + cfg.model.init_method_std = 0.02 + + # Multi-Token Prediction (MTP) settings - Qwen3-Next specific + cfg.model.mtp_num_layers = 1 # Number of MTP layers (0 to disable) + cfg.model.mtp_loss_scaling_factor = 0.1 # Loss scaling factor for MTP + + # MoE Token Dispatcher settings + cfg.model.moe_token_dispatcher_type = "alltoall" # Options: alltoall, allgather, flex + cfg.model.moe_flex_dispatcher_backend = ( + "deepep" # Options: None, deepep, hybridep (default from TransformerConfig) ) + cfg.model.moe_hybridep_num_sms = 16 # Number of SMs for hybridep backend (default from TransformerConfig) + + # Training config + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + + # Scheduler config - Qwen3-Next specific + cfg.scheduler.no_weight_decay_cond_type = "qwen3_next" + + # TE (Transformer Engine) + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections (includes MoE-specific kernels) + cfg.model.attention_backend = None + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (recompute & offloading) + cfg.model.recompute_granularity = "selective" # Qwen3-Next uses selective recompute + cfg.model.recompute_modules = ["layernorm", "moe", "moe_act"] # Qwen3-Next specific modules + cfg.model.recompute_method = None # Not used for selective recompute + cfg.model.recompute_num_layers = None # Not used for selective recompute + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # FP8 & MXFP8 (mixed_precision settings) + # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default + # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default + # cfg.mixed_precision.fp8_recipe = "tensorwise" # default + # cfg.mixed_precision.fp8 = None # not enabled + # cfg.mixed_precision.fp8_param_gather = False # default + # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False # default + cfg.model.moe_router_padding_for_fp8 = False + + # Optimizer precision settings + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Communication overlap (default None, can pass CommOverlapConfig for advanced overlap) + # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) # Uncomment to enable + # cfg.comm_overlap.delay_wgrad_compute = False # Delay wgrad compute for overlap + # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False # MoE-specific: 
Overlap EP communication + cfg.model.moe_shared_expert_overlap = False # Overlap shared expert computation + + # Checkpoint config + # cfg.checkpoint.save and cfg.checkpoint.load are set in _pretrain_common. To override them, set them here.Ex: + # cfg.checkpoint.save = "path/to/save" + # cfg.checkpoint.load = "path/to/load" + + # DDP config + cfg.ddp.overlap_grad_reduce = True + cfg.ddp.overlap_param_gather = True + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.use_megatron_fsdp = False + + # MoE Force Load Balancing + cfg.model.moe_router_force_load_balancing = False return cfg diff --git a/tests/functional_tests/quantization/test_qat_workflow.py b/tests/functional_tests/quantization/test_qat_workflow.py index 8a70e73d6b..60f8b5ac0d 100644 --- a/tests/functional_tests/quantization/test_qat_workflow.py +++ b/tests/functional_tests/quantization/test_qat_workflow.py @@ -138,6 +138,9 @@ def _run_pretrain_from_quantized_checkpoint( # Checkpoints are saved at intervals, so the last one is at train_iters if it's a multiple of save_interval final_iteration = (train_iters // save_interval) * save_interval + # Use a smaller seq_length for functional tests (smaller than model default) + test_seq_length = 512 + # Base command for pre-training from quantized checkpoint cmd = [ python_executable, @@ -154,6 +157,8 @@ def _run_pretrain_from_quantized_checkpoint( "--hf-path", hf_model_id, "model.gradient_accumulation_fusion=False", + f"model.seq_length={test_seq_length}", + f"+dataset.seq_length={test_seq_length}", # explicitly set same seq_len for model and dataset f"checkpoint.pretrained_checkpoint={quantized_checkpoint_path}", f"checkpoint.save={checkpoint_save_dir}", f"checkpoint.save_interval={save_interval}", diff --git a/tests/functional_tests/recipes/test_llama_recipes_distill_3b-1b.py b/tests/functional_tests/recipes/test_llama_recipes_distill_3b-1b.py index a2424297f6..5e5150b811 100644 --- a/tests/functional_tests/recipes/test_llama_recipes_distill_3b-1b.py +++ b/tests/functional_tests/recipes/test_llama_recipes_distill_3b-1b.py @@ -75,8 +75,8 @@ def run_distill_recipe_test( 4. 
No crashes occur during the process Args: - student_config_func: The student model's pretrain_config function - teacher_config_func: The teacher model's pretrain_config function + student_config_func: The student model's pretrain_config function (parameterless API) + teacher_config_func: The teacher model's pretrain_config function (parameterless API) recipe_name: Name of the recipe for logging/debugging tmp_path: Temporary directory for test outputs tensor_model_parallel_size: Override tensor parallelism (None = use recipe default) @@ -85,20 +85,22 @@ def run_distill_recipe_test( model_overrides: Optional mapping of model attribute overrides to apply """ initialize_distributed() - shared_base_dir = broadcast_path(tmp_path) + shared_base_dir = Path(broadcast_path(tmp_path)) try: - # Load student config - config: ConfigContainer = student_config_func( - dir=str(shared_base_dir), - name=f"{recipe_name}_functional_test", - mock=True, - load_weights=True, - ) - # Load teacher config - teacher_config = teacher_config_func( - dir=str(shared_base_dir), name=f"{recipe_name}_teacher_functional_test", mock=True - ) + # Load student config - pretrain configs use parameterless API + config: ConfigContainer = student_config_func() + # Load teacher config - pretrain configs use parameterless API + teacher_config = teacher_config_func() + + # Set up output directories after instantiation + run_output_dir = shared_base_dir / f"{recipe_name}_functional_test" + checkpoint_dir = run_output_dir / "checkpoints" + tensorboard_dir = run_output_dir / "tb_logs" + config.checkpoint.save = str(checkpoint_dir) + config.checkpoint.load = str(checkpoint_dir) + config.logger.tensorboard_dir = str(tensorboard_dir) + # Combine into a distillation provider config.model = convert_to_distillation_provider(config.model, teacher_config.model) diff --git a/tests/functional_tests/recipes/test_perf_config_integration.py b/tests/functional_tests/recipes/test_perf_config_integration.py new file mode 100644 index 0000000000..691c667507 --- /dev/null +++ b/tests/functional_tests/recipes/test_perf_config_integration.py @@ -0,0 +1,183 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functional tests for performance config integration with library recipes. + +These tests verify that: +1. Performance configs can correctly instantiate library recipes (which use parameterless API) +2. The apply_precision_config helper works correctly +3. 
The get_library_recipe function works with the new parameterless recipes +""" + +import sys +from pathlib import Path + + +# Add the performance scripts to the path for testing +SCRIPTS_PERF_PATH = Path(__file__).parents[3] / "scripts" / "performance" +sys.path.insert(0, str(SCRIPTS_PERF_PATH)) + + +class TestPerfConfigIntegration: + """Test class for performance config integration with library recipes.""" + + def test_llama3_8b_perf_config_instantiation(self): + """Test that Llama3 8B perf configs can be instantiated correctly.""" + from configs.llama.llama3_llm_pretrain import llama3_8b_pretrain_config_h100 + + # Should not raise any errors + cfg = llama3_8b_pretrain_config_h100(precision="bf16", mock=True) + + # Verify the config has expected structure + assert cfg is not None + assert cfg.model is not None + assert cfg.mixed_precision is not None + assert cfg.train is not None + assert cfg.dataset is not None + + def test_llama3_70b_perf_config_instantiation(self): + """Test that Llama3 70B perf configs can be instantiated correctly.""" + from configs.llama.llama3_llm_pretrain import llama3_70b_pretrain_config_h100 + + cfg = llama3_70b_pretrain_config_h100(precision="bf16", mock=True) + + assert cfg is not None + assert cfg.model is not None + assert cfg.mixed_precision is not None + + def test_direct_precision_override(self): + """Test that precision can be set directly on ConfigContainer.""" + from megatron.bridge.recipes.llama import llama3_8b_pretrain_config + from megatron.bridge.training.mixed_precision import bf16_mixed + + # Get a config without precision set + cfg = llama3_8b_pretrain_config() + + # Apply a specific precision config directly + precision_config = bf16_mixed() + cfg.mixed_precision = precision_config + + # Verify the precision was applied + assert cfg.mixed_precision == precision_config + + def test_deepseek_v3_perf_config_instantiation(self): + """Test that DeepSeek-V3 perf configs can be instantiated correctly.""" + from configs.deepseek.deepseek_llm_pretrain import deepseek_v3_pretrain_config_h100 + + cfg = deepseek_v3_pretrain_config_h100(precision="bf16", mock=True) + + assert cfg is not None + assert cfg.model is not None + # DeepSeek configs should have MoE-related settings + assert hasattr(cfg.model, "moe_flex_dispatcher_backend") + + def test_qwen3_30b_perf_config_instantiation(self): + """Test that Qwen3 30B A3B perf configs can be instantiated correctly.""" + from configs.qwen.qwen3_llm_pretrain import qwen3_30b_a3b_pretrain_config_h100 + + cfg = qwen3_30b_a3b_pretrain_config_h100(precision="bf16", mock=True) + + assert cfg is not None + assert cfg.model is not None + assert cfg.comm_overlap is not None + + def test_nemotronh_56b_perf_config_instantiation(self): + """Test that NemotronH 56B perf configs can be instantiated correctly.""" + from configs.nemotronh.nemotronh_llm_pretrain import nemotronh_56b_pretrain_config_h100 + + cfg = nemotronh_56b_pretrain_config_h100(precision="bf16", mock=True) + + assert cfg is not None + assert cfg.model is not None + assert cfg.mixed_precision is not None + + def test_gpt_oss_120b_perf_config_instantiation(self): + """Test that GPT-OSS 120B perf configs can be instantiated correctly.""" + from configs.gpt_oss.gpt_oss_llm_pretrain import gpt_oss_120b_pretrain_config_h100 + + cfg = gpt_oss_120b_pretrain_config_h100(precision="bf16", mock=True) + + assert cfg is not None + assert cfg.model is not None + assert cfg.mixed_precision is not None + + def test_llama31_405b_perf_config_instantiation(self): + """Test that 
Llama 3.1 405B perf configs can be instantiated correctly.""" + from configs.llama.llama31_llm_pretrain import llama31_405b_pretrain_config_h100 + + cfg = llama31_405b_pretrain_config_h100(precision="bf16", mock=True) + + assert cfg is not None + assert cfg.model is not None + assert cfg.comm_overlap is not None + + def test_get_library_recipe_llama(self): + """Test that get_library_recipe works with Llama recipes and sets all paths.""" + from utils.utils import get_library_recipe + + cfg = get_library_recipe( + model_family_name="llama", + model_recipe_name="llama3_8b", + train_task="pretrain", + wandb_experiment_name="test_experiment", + ) + + assert cfg is not None + # Verify all paths are set correctly based on dir="/nemo_run/" and name="test_experiment" + assert cfg.checkpoint.save == "/nemo_run/test_experiment/checkpoints" + assert cfg.checkpoint.load == "/nemo_run/test_experiment/checkpoints" + assert cfg.logger.tensorboard_dir == "/nemo_run/test_experiment/tb_logs" + assert cfg.logger.wandb_exp_name == "test_experiment" + assert cfg.logger.wandb_save_dir == "/nemo_run/test_experiment/wandb" + + def test_get_library_recipe_deepseek(self): + """Test that get_library_recipe works with DeepSeek recipes.""" + from utils.utils import get_library_recipe + + cfg = get_library_recipe( + model_family_name="deepseek", + model_recipe_name="deepseek_v3", + train_task="pretrain", + wandb_experiment_name="deepseek_test", + ) + + assert cfg is not None + assert cfg.logger.wandb_exp_name == "deepseek_test" + assert cfg.checkpoint.save == "/nemo_run/deepseek_test/checkpoints" + + def test_precision_config_variations(self): + """Test that different precision configs work correctly.""" + from configs.llama.llama3_llm_pretrain import llama3_8b_pretrain_config_h100 + + # Test BF16 + cfg_bf16 = llama3_8b_pretrain_config_h100(precision="bf16", mock=True) + assert cfg_bf16.mixed_precision is not None + + # Test FP8 CS + cfg_fp8 = llama3_8b_pretrain_config_h100(precision="fp8_cs", mock=True) + assert cfg_fp8.mixed_precision is not None + + def test_config_overrides_after_precision(self): + """Test that config properties can be overridden after precision is applied.""" + from configs.llama.llama3_llm_pretrain import llama3_8b_pretrain_config_h100 + + cfg = llama3_8b_pretrain_config_h100(precision="bf16", mock=True) + + # Should be able to override properties after precision config is applied + cfg.train.train_iters = 100 + cfg.train.global_batch_size = 16 + + assert cfg.train.train_iters == 100 + assert cfg.train.global_batch_size == 16 diff --git a/tests/functional_tests/recipes/utils.py b/tests/functional_tests/recipes/utils.py index 62271547a0..e089df03a9 100644 --- a/tests/functional_tests/recipes/utils.py +++ b/tests/functional_tests/recipes/utils.py @@ -47,7 +47,7 @@ def run_pretrain_recipe_test( 4. 
No crashes occur during the process Args: - config_func: The recipe's pretrain_config function + config_func: The recipe's pretrain_config function (parameterless API) recipe_name: Name of the recipe for logging/debugging tmp_path: Temporary directory for test outputs tensor_model_parallel_size: Override tensor parallelism (None = use recipe default) @@ -56,12 +56,19 @@ def run_pretrain_recipe_test( model_overrides: Optional mapping of model attribute overrides to apply """ initialize_distributed() - shared_base_dir = broadcast_path(tmp_path) + shared_base_dir = Path(broadcast_path(tmp_path)) try: - config: ConfigContainer = config_func( - dir=str(shared_base_dir), name=f"{recipe_name}_functional_test", mock=True - ) + # Pretrain configs use parameterless API - call without arguments + config: ConfigContainer = config_func() + + # Set up output directories after instantiation + run_output_dir = shared_base_dir / f"{recipe_name}_functional_test" + checkpoint_dir = run_output_dir / "checkpoints" + tensorboard_dir = run_output_dir / "tb_logs" + config.checkpoint.save = str(checkpoint_dir) + config.checkpoint.load = str(checkpoint_dir) + config.logger.tensorboard_dir = str(tensorboard_dir) # Keep runs short and consistent across tests config.train.train_iters = 10 config.train.eval_interval = 5 @@ -132,13 +139,14 @@ def run_pretrain_recipe_perf_test( 3. No crashes occur during the process Args: - config_func: The recipe's pretrain_config function + config_func: The recipe's pretrain_config function (parameterless API) recipe_name: Name of the recipe for logging/debugging config_overrides: Optional mapping of config attribute overrides to apply """ initialize_distributed() - config: ConfigContainer = config_func(name=f"{recipe_name}_functional_test", mock=True) + # Pretrain configs use parameterless API - call without arguments + config: ConfigContainer = config_func() # Keep runs short and consistent across tests config.train.train_iters = 10 config.train.eval_interval = 5 diff --git a/tests/unit_tests/recipes/gemma/test_gemma2_recipes.py b/tests/unit_tests/recipes/gemma/test_gemma2_recipes.py index 36b267b74d..4f65440809 100644 --- a/tests/unit_tests/recipes/gemma/test_gemma2_recipes.py +++ b/tests/unit_tests/recipes/gemma/test_gemma2_recipes.py @@ -39,39 +39,29 @@ def _safe_overrides_for(name: str) -> dict: - # Detect if this is a finetune recipe - is_finetune = "finetune" in name.lower() + """Return overrides for recipe functions. - overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 10, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "global_batch_size": 2, - } + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. 
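+
+    A sketch of the intended call pattern (illustrative, not part of the original patch):
+        overrides = _safe_overrides_for(recipe_func.__name__)
+        cfg = recipe_func(**overrides)  # pretrain: equivalent to recipe_func()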
+ """ + is_finetune = "finetune" in name.lower() if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "finetune_lr": 1e-4, - } - ) + # Finetuning-specific overrides - finetune configs still accept parameters + overrides = { + "name": f"unit_{name}", + "dir": ".", + "train_iters": 10, + "micro_batch_size": 1, + "seq_length": 64, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "global_batch_size": 2, + "finetune_lr": 1e-4, + } else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, - "lr": 1e-4, - "use_null_tokenizer": True, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - ) + # Pretrain configs use the new parameterless API + overrides = {} return overrides @@ -155,9 +145,9 @@ def test_each_gemma2_recipe_builds_config(recipe_func: Callable, monkeypatch: py assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None else: - # Pretrain recipes honor use_null_tokenizer override - if overrides.get("use_null_tokenizer"): - assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Pretrain recipes use either NullTokenizer or HuggingFaceTokenizer + if cfg.tokenizer.tokenizer_type == "NullTokenizer": + assert cfg.tokenizer.vocab_size is not None else: assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None diff --git a/tests/unit_tests/recipes/gpt/test_gpt3_175b.py b/tests/unit_tests/recipes/gpt/test_gpt3_175b.py index f2538cc607..f36403ac43 100644 --- a/tests/unit_tests/recipes/gpt/test_gpt3_175b.py +++ b/tests/unit_tests/recipes/gpt/test_gpt3_175b.py @@ -12,135 +12,128 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import tempfile - import pytest import torch from megatron.bridge.models.gpt_provider import GPTProvider175B -from megatron.bridge.recipes.gpt.gpt3_175b import model_config, pretrain_config +from megatron.bridge.recipes.gpt.gpt3_175b import gpt3_175b_pretrain_config from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.config import ConfigContainer -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, GPTProvider175B) - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 6 - assert config.context_parallel_size == 1 - assert config.sequence_parallel is True - - @pytest.mark.unit class TestPretrainConfig: """Test cases for the pretrain_config function.""" - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() + def test_pretrain_config_basic_structure(self): + """Test pretrain_config returns a valid ConfigContainer.""" + config = gpt3_175b_pretrain_config() assert isinstance(config, ConfigContainer) assert isinstance(config.model, GPTProvider175B) + assert config.train is not None + assert config.optimizer is not None + assert config.scheduler is not None + assert config.dataset is not None + assert config.tokenizer is not None + assert config.checkpoint is not None + assert config.comm_overlap is not None + + def test_pretrain_config_model_parallelism(self): + """Test model parallelism configuration.""" + config = gpt3_175b_pretrain_config() - # Check model configuration (includes virtual pipeline by default) assert config.model.tensor_model_parallel_size == 4 assert config.model.pipeline_model_parallel_size == 8 assert config.model.pipeline_dtype == torch.bfloat16 assert config.model.virtual_pipeline_model_parallel_size == 6 assert config.model.context_parallel_size == 1 assert config.model.sequence_parallel is True + assert config.model.pipeline_model_parallel_layout is None + + def test_pretrain_config_training_settings(self): + """Test training configuration.""" + config = gpt3_175b_pretrain_config() - # Check training configuration assert config.train.train_iters == 1_168_251 assert config.train.global_batch_size == 2048 assert config.train.micro_batch_size == 2 assert config.train.eval_interval == 2000 assert config.train.eval_iters == 32 + assert config.train.manual_gc is True + assert config.train.manual_gc_interval == 100 + assert config.train.manual_gc_eval == 100 + + def test_pretrain_config_optimizer_configuration(self): + """Test optimizer configuration.""" + config = gpt3_175b_pretrain_config() - # Check optimizer configuration assert config.optimizer.optimizer == "adam" assert config.optimizer.lr == 0.9e-4 + assert config.optimizer.min_lr == 0.9e-5 assert config.optimizer.weight_decay == 0.1 assert config.optimizer.bf16 is True assert config.optimizer.fp16 is False assert config.optimizer.use_precision_aware_optimizer is False + assert config.optimizer.main_grads_dtype == torch.float32 + assert config.optimizer.main_params_dtype == torch.float32 + assert config.optimizer.exp_avg_dtype == torch.float32 + assert config.optimizer.exp_avg_sq_dtype == torch.float32 + + 
def test_pretrain_config_dataset_configuration(self): + """Test dataset configuration.""" + config = gpt3_175b_pretrain_config() - # Check dataset configuration (should be in mock mode) assert config.dataset.seq_length == 2048 - assert config.dataset.split == "1,1,1" + assert config.dataset.num_workers == 8 + assert config.dataset.split == "9999,8,2" assert config.dataset.blend is None assert config.dataset.blend_per_split is None + assert config.dataset.reset_attention_mask is False + assert config.dataset.reset_position_ids is False + assert config.dataset.eod_mask_loss is False + assert config.dataset.num_dataset_builder_threads == 1 + assert config.dataset.data_sharding is True + assert config.dataset.dataloader_type == "single" + assert config.dataset.random_seed == 1234 - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") + def test_pretrain_config_tokenizer_configuration(self): + """Test tokenizer configuration.""" + config = gpt3_175b_pretrain_config() - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir + assert config.tokenizer.tokenizer_type == "NullTokenizer" + assert config.tokenizer.tokenizer_model is None + assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) + def test_pretrain_config_transformer_engine_and_cuda_graph(self): + """Test Transformer Engine and CUDA Graph settings.""" + config = gpt3_175b_pretrain_config() - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" + assert config.model.transformer_impl == "transformer_engine" + assert config.model.cuda_graph_impl == "none" + assert config.model.cuda_graph_scope == "full" + assert config.model.cuda_graph_warmup_steps == 3 - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) + def test_pretrain_config_kernel_selections(self): + """Test kernel selection settings.""" + config = gpt3_175b_pretrain_config() - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None + assert config.model.attention_backend is None + assert config.model.cross_entropy_loss_fusion is True + assert config.model.cross_entropy_fusion_impl == "native" - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) + def test_pretrain_config_recomputation_and_offloading(self): + """Test recomputation and offloading settings.""" + config = gpt3_175b_pretrain_config() - # When 
blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None + assert config.model.recompute_granularity is None + assert config.model.recompute_modules is None + assert config.model.fine_grained_activation_offloading is False + assert config.model.offload_modules is None def test_pretrain_config_checkpoint_configuration(self): """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() + config = gpt3_175b_pretrain_config() assert config.checkpoint.save_interval == 2000 assert config.checkpoint.ckpt_format == "torch_dist" @@ -148,26 +141,20 @@ def test_pretrain_config_checkpoint_configuration(self): def test_pretrain_config_ddp_configuration(self): """Test distributed data parallel configuration.""" - config = pretrain_config() + config = gpt3_175b_pretrain_config() - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True assert config.ddp.overlap_grad_reduce is True assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True + assert config.ddp.check_for_nan_in_grad is True assert config.ddp.use_distributed_optimizer is True + assert config.ddp.use_megatron_fsdp is False + assert config.ddp.grad_reduce_in_fp32 is True + assert config.ddp.average_in_collective is True + assert config.ddp.data_parallel_sharding_strategy == "no_shard" - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): + def test_pretrain_config_comm_overlap(self): """Test default CommOverlapConfig setup for GPT3 175B.""" - config = pretrain_config() + config = gpt3_175b_pretrain_config() # GPT3 175B should have advanced comm overlap enabled by default assert config.comm_overlap is not None @@ -178,7 +165,7 @@ def test_pretrain_config_default_comm_overlap(self): def test_pretrain_config_scheduler_configuration(self): """Test scheduler configuration.""" - config = pretrain_config(train_iters=100000) + config = gpt3_175b_pretrain_config() assert config.scheduler.start_weight_decay == 0.033 assert config.scheduler.end_weight_decay == 0.033 @@ -189,35 +176,15 @@ def test_pretrain_config_scheduler_configuration(self): assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation assert config.scheduler.override_opt_param_scheduler is True - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - def test_pretrain_config_rng_configuration(self): """Test RNG 
configuration.""" - config = pretrain_config() + config = gpt3_175b_pretrain_config() assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 def test_pretrain_config_logger_configuration(self): """Test logger configuration.""" - config = pretrain_config() + config = gpt3_175b_pretrain_config() assert config.logger.log_interval == 10 assert "tb_logs" in config.logger.tensorboard_dir @@ -225,7 +192,7 @@ def test_pretrain_config_logger_configuration(self): def test_pretrain_config_precision_configuration(self): """Test precision configuration for GPT3 175B.""" - config = pretrain_config() + config = gpt3_175b_pretrain_config() # Should have precision config assert config.mixed_precision is not None diff --git a/tests/unit_tests/recipes/kimi/test_kimi_k2.py b/tests/unit_tests/recipes/kimi/test_kimi_k2.py index 57b4528675..5786f3078f 100644 --- a/tests/unit_tests/recipes/kimi/test_kimi_k2.py +++ b/tests/unit_tests/recipes/kimi/test_kimi_k2.py @@ -12,92 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest import torch from megatron.bridge.models.kimi import KimiK2Provider -from megatron.bridge.recipes.kimi.kimi_k2 import _kimi_k2_model_config, kimi_k2_pretrain_config +from megatron.bridge.recipes.kimi.kimi_k2 import _get_kimi_k2_pipeline_layout, kimi_k2_pretrain_config from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.mixed_precision import MixedPrecisionConfig -class TestKimiK2ModelConfig: - """Test cases for _kimi_k2_model_config function.""" - - def test_model_config_default_values(self): - """Test _kimi_k2_model_config with default parameters.""" - cfg = _kimi_k2_model_config() - - # Check it returns a KimiK2Provider instance - assert isinstance(cfg, KimiK2Provider) - - # Check key parallelism settings - assert cfg.tensor_model_parallel_size == 2 - assert cfg.pipeline_model_parallel_size == 16 - assert cfg.expert_model_parallel_size == 32 - assert cfg.sequence_parallel is True - - # Check key settings - assert cfg.recompute_granularity == "selective" - assert cfg.moe_permute_fusion is True - assert cfg.apply_rope_fusion is False - - def test_model_config_custom_parallelism(self): - """Test _kimi_k2_model_config with custom parallelism settings.""" - cfg = _kimi_k2_model_config( - tensor_model_parallel_size=4, - pipeline_model_parallel_size=8, - expert_model_parallel_size=16, - sequence_parallel=False, - ) - - assert cfg.tensor_model_parallel_size == 4 - assert cfg.pipeline_model_parallel_size == 8 - assert cfg.expert_model_parallel_size == 16 - assert cfg.sequence_parallel is False - - def test_model_config_recomputation_and_fusion(self): - """Test _kimi_k2_model_config with recomputation and fusion settings.""" - cfg = _kimi_k2_model_config( - recompute_granularity="full", - recompute_method="block", - apply_rope_fusion=True, - ) - - assert cfg.recompute_granularity == "full" - assert cfg.recompute_method == "block" - assert cfg.apply_rope_fusion is True - - def 
test_model_config_deepep(self): - """Test _kimi_k2_model_config with DeePEP enabled.""" - cfg = _kimi_k2_model_config(enable_deepep=True) - - assert cfg.moe_token_dispatcher_type == "flex" - assert cfg.moe_enable_deepep is True - assert cfg.moe_shared_expert_overlap is False - - def test_model_config_pipeline_layouts(self): - """Test pipeline layouts for various PP/VP combinations.""" - # PP=1, VP=1 should have no layout - cfg = _kimi_k2_model_config(pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=1) - assert cfg.pipeline_model_parallel_layout is None - - # PP=16, VP=1 should have a specific layout - cfg = _kimi_k2_model_config(pipeline_model_parallel_size=16, virtual_pipeline_model_parallel_size=1) +class TestKimiK2PipelineLayout: + """Test cases for _get_kimi_k2_pipeline_layout function.""" + + def test_pipeline_layout_pp1_vp1(self): + """Test pipeline layout for PP=1, VP=1.""" + layout = _get_kimi_k2_pipeline_layout(1, 1) + assert layout is None + + def test_pipeline_layout_pp16_vp1(self): + """Test pipeline layout for PP=16, VP=1.""" + layout = _get_kimi_k2_pipeline_layout(16, 1) expected_layout = [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 14 + [["decoder", "loss"]] - assert cfg.pipeline_model_parallel_layout == expected_layout + assert layout == expected_layout - # PP=8, VP=2 should have a specific layout - cfg = _kimi_k2_model_config(pipeline_model_parallel_size=8, virtual_pipeline_model_parallel_size=2) + def test_pipeline_layout_pp8_vp2(self): + """Test pipeline layout for PP=8, VP=2.""" + layout = _get_kimi_k2_pipeline_layout(8, 2) expected_layout = [["embedding"] + ["decoder"] * 4] + [["decoder"] * 4] * 14 + [["decoder", "loss"]] - assert cfg.pipeline_model_parallel_layout == expected_layout + assert layout == expected_layout - def test_model_config_invalid_pp_vp_combination(self): + def test_pipeline_layout_invalid_pp_vp_combination(self): """Test that invalid PP/VP combinations raise ValueError.""" with pytest.raises(ValueError, match="Invalid PP and VP size"): - _kimi_k2_model_config(pipeline_model_parallel_size=3, virtual_pipeline_model_parallel_size=1) + _get_kimi_k2_pipeline_layout(3, 1) class TestKimiK2PretrainConfig: @@ -105,15 +52,7 @@ class TestKimiK2PretrainConfig: def test_pretrain_config_basic_structure(self): """Test that kimi_k2_pretrain_config returns a valid ConfigContainer.""" - cfg = kimi_k2_pretrain_config( - name="test_kimi", - dir="/tmp/test_output", - mock=True, - train_iters=100, - global_batch_size=8, - micro_batch_size=1, - seq_length=128, - ) + cfg = kimi_k2_pretrain_config() # Check it returns a ConfigContainer with all required components assert isinstance(cfg, ConfigContainer) @@ -124,107 +63,157 @@ def test_pretrain_config_basic_structure(self): assert cfg.dataset is not None assert cfg.tokenizer is not None assert cfg.checkpoint is not None + assert cfg.comm_overlap is not None - # Check training settings - assert cfg.train.train_iters == 100 - assert cfg.train.global_batch_size == 8 - assert cfg.train.micro_batch_size == 1 - - def test_pretrain_config_optimizer_adam(self): - """Test optimizer configuration for Adam.""" - cfg = kimi_k2_pretrain_config( - name="test", - mock=True, - optimizer_type="adam", - lr=5e-4, - ) - - # Check scheduler is not None - assert cfg.scheduler is not None + def test_pretrain_config_default_training_settings(self): + """Test default training settings.""" + cfg = kimi_k2_pretrain_config() - def test_pretrain_config_optimizer_muon(self): - """Test optimizer configuration for Muon.""" - 
cfg = kimi_k2_pretrain_config( - name="test", - mock=True, - optimizer_type="muon", - ) - - # Check DDP settings for Muon - assert cfg.ddp.overlap_param_gather is False - assert cfg.ddp.use_distributed_optimizer is False - - def test_pretrain_config_optimizer_invalid(self): - """Test that invalid optimizer type raises ValueError.""" - with pytest.raises(ValueError, match="Invalid optimizer type"): - kimi_k2_pretrain_config( - name="test", - mock=True, - optimizer_type="invalid_optimizer", - ) - - def test_pretrain_config_dataset_and_tokenizer(self): - """Test dataset and tokenizer configuration.""" - cfg = kimi_k2_pretrain_config(name="test", mock=True, seq_length=4096) + assert cfg.train.train_iters == 1_000_000 + assert cfg.train.global_batch_size == 4096 + assert cfg.train.micro_batch_size == 1 + assert cfg.train.eval_interval == 2000 + assert cfg.train.manual_gc is True + assert cfg.train.manual_gc_interval == 5 + assert cfg.train.manual_gc_eval == 5 + + def test_pretrain_config_model_parallelism(self): + """Test default parallelism configuration.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 16 + assert cfg.model.pipeline_dtype == torch.bfloat16 + assert cfg.model.virtual_pipeline_model_parallel_size is None + assert cfg.model.context_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 32 + assert cfg.model.sequence_parallel is True + assert cfg.model.expert_tensor_parallel_size == 1 + + def test_pretrain_config_model_recomputation(self): + """Test recomputation settings.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.model.recompute_granularity == "selective" + assert cfg.model.recompute_modules is None + assert cfg.model.recompute_method is None + assert cfg.model.recompute_num_layers is None + assert cfg.model.fine_grained_activation_offloading is False + assert cfg.model.offload_modules is None + + def test_pretrain_config_pipeline_split_settings(self): + """Test pipeline split settings.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.model.account_for_embedding_in_pipeline_split is False + assert cfg.model.account_for_loss_in_pipeline_split is False + assert cfg.model.num_layers_in_first_pipeline_stage is None + assert cfg.model.num_layers_in_last_pipeline_stage is None + + def test_pretrain_config_ddp_settings_for_muon(self): + """Test DDP settings configured for Muon optimizer.""" + cfg = kimi_k2_pretrain_config() + + # Muon requires these specific DDP settings + assert cfg.ddp.overlap_grad_reduce is True + assert cfg.ddp.overlap_param_gather is False # Muon needs this to be False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is False # Muon needs this to be False + assert cfg.ddp.use_megatron_fsdp is False + assert cfg.ddp.grad_reduce_in_fp32 is True + assert cfg.ddp.average_in_collective is True + assert cfg.ddp.data_parallel_sharding_strategy == "no_shard" + + def test_pretrain_config_dataset_configuration(self): + """Test dataset configuration.""" + cfg = kimi_k2_pretrain_config() assert cfg.dataset.sequence_length == 4096 + assert cfg.dataset.num_workers == 8 assert cfg.dataset.data_sharding is True - assert cfg.tokenizer.tokenizer_type == "NullTokenizer" - assert cfg.tokenizer.vocab_size == 163840 + assert cfg.dataset.split == "9999,8,2" + assert cfg.dataset.blend is None + assert cfg.dataset.blend_per_split is None - def test_pretrain_config_output_directories(self): - """Test that output directories are 
properly configured.""" - cfg = kimi_k2_pretrain_config( - name="my_experiment", - dir="/custom/output/path", - mock=True, - ) + def test_pretrain_config_tokenizer_configuration(self): + """Test tokenizer configuration.""" + cfg = kimi_k2_pretrain_config() - checkpoint_dir = os.path.join("/custom/output/path", "my_experiment", "checkpoints") - tensorboard_dir = os.path.join("/custom/output/path", "my_experiment", "tb_logs") - - assert cfg.checkpoint.save == checkpoint_dir - assert cfg.logger.tensorboard_dir == tensorboard_dir + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + assert cfg.tokenizer.tokenizer_model is None + assert cfg.tokenizer.vocab_size == cfg.model.vocab_size def test_pretrain_config_mixed_precision(self): """Test mixed precision configuration.""" - cfg = kimi_k2_pretrain_config(name="test", mock=True) + cfg = kimi_k2_pretrain_config() assert isinstance(cfg.mixed_precision, MixedPrecisionConfig) assert cfg.mixed_precision.bf16 is True assert cfg.mixed_precision.params_dtype == torch.bfloat16 - - # Test custom precision - custom_precision = MixedPrecisionConfig( - bf16=False, - fp16=True, - params_dtype=torch.float16, - ) - cfg_custom = kimi_k2_pretrain_config(name="test", mock=True, precision_config=custom_precision) - assert cfg_custom.mixed_precision.fp16 is True - - def test_pretrain_config_parallelism_settings(self): - """Test parallelism configuration.""" - cfg = kimi_k2_pretrain_config( - name="test", - mock=True, - tensor_model_parallel_size=4, - pipeline_model_parallel_size=8, - expert_model_parallel_size=16, - ) - - assert cfg.model.tensor_model_parallel_size == 4 - assert cfg.model.pipeline_model_parallel_size == 8 - assert cfg.model.expert_model_parallel_size == 16 - - def test_pretrain_config_special_features(self): - """Test special features like RoPE fusion and DeePEP.""" - # Test RoPE fusion - cfg_rope = kimi_k2_pretrain_config(name="test", mock=True, apply_rope_fusion=True) - assert cfg_rope.model.apply_rope_fusion is True - assert cfg_rope.dist.enable_megatron_core_experimental is True - - # Test DeePEP - cfg_deepep = kimi_k2_pretrain_config(name="test", mock=True, enable_deepep=True) - assert cfg_deepep.model.moe_token_dispatcher_type == "flex" - assert cfg_deepep.model.moe_enable_deepep is True + assert cfg.mixed_precision.pipeline_dtype == torch.bfloat16 + assert cfg.mixed_precision.autocast_enabled is False + assert cfg.mixed_precision.grad_reduce_in_fp32 is True + + def test_pretrain_config_optimizer_precision(self): + """Test optimizer precision settings.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.optimizer.use_precision_aware_optimizer is False + assert cfg.optimizer.main_grads_dtype == torch.float32 + assert cfg.optimizer.main_params_dtype == torch.float32 + assert cfg.optimizer.exp_avg_dtype == torch.float32 + assert cfg.optimizer.exp_avg_sq_dtype == torch.float32 + + def test_pretrain_config_moe_settings(self): + """Test MoE-specific configuration.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.model.moe_token_dispatcher_type == "alltoall" + assert cfg.model.moe_flex_dispatcher_backend == "deepep" + assert cfg.model.moe_hybridep_num_sms == 16 + assert cfg.model.moe_router_fusion is False + assert cfg.model.moe_permute_fusion is True + assert cfg.model.moe_grouped_gemm is True + assert cfg.model.moe_router_padding_for_fp8 is False + assert cfg.model.moe_shared_expert_overlap is True + assert cfg.model.moe_router_force_load_balancing is False + + def test_pretrain_config_transformer_engine_and_cuda_graph(self): + """Test 
Transformer Engine and CUDA Graph settings.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.model.transformer_impl == "transformer_engine" + assert cfg.model.cuda_graph_impl == "none" + assert cfg.model.cuda_graph_scope == "full" + assert cfg.model.cuda_graph_warmup_steps == 3 + + def test_pretrain_config_kernel_selections(self): + """Test kernel selection settings.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.model.attention_backend is None + assert cfg.model.cross_entropy_loss_fusion is True + assert cfg.model.cross_entropy_fusion_impl == "te" + + def test_pretrain_config_comm_overlap(self): + """Test communication overlap configuration.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.comm_overlap.tp_comm_overlap is False + assert cfg.comm_overlap.delay_wgrad_compute is False + assert cfg.comm_overlap.overlap_moe_expert_parallel_comm is False + + def test_pretrain_config_checkpoint(self): + """Test checkpoint configuration.""" + cfg = kimi_k2_pretrain_config() + + assert cfg.checkpoint.save_interval == 2000 + assert cfg.checkpoint.async_save is False + + def test_pretrain_config_pipeline_layout(self): + """Test pipeline layout is configured.""" + cfg = kimi_k2_pretrain_config() + + # Default PP=16, VP=None (1), should have a layout + expected_layout = _get_kimi_k2_pipeline_layout(16, 1) + assert cfg.model.pipeline_model_parallel_layout == expected_layout diff --git a/tests/unit_tests/recipes/nemotronh/test_nemotron_3_nano.py b/tests/unit_tests/recipes/nemotronh/test_nemotron_3_nano.py index 4820c3210a..cd7036c5fd 100644 --- a/tests/unit_tests/recipes/nemotronh/test_nemotron_3_nano.py +++ b/tests/unit_tests/recipes/nemotronh/test_nemotron_3_nano.py @@ -16,7 +16,7 @@ Unit tests for Nemotron 3 Nano recipe configuration builders. Tests cover: -- Pretrain configuration with default and custom parameters +- Pretrain configuration defaults (parameterless API) - Finetune configuration with LoRA, DoRA, and full SFT - MoE-specific settings (DeepEP, expert parallelism) - Parallelism and tokenizer configurations @@ -32,17 +32,21 @@ nemotron_3_nano_finetune_config, nemotron_3_nano_pretrain_config, ) -from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ConfigContainer @pytest.mark.unit class TestNemotron3NanoPretrain: - """Test cases for Nemotron 3 Nano pretrain recipe.""" + """Test cases for Nemotron 3 Nano pretrain recipe. + + Note: Pretrain config uses the parameterless API and returns fixed defaults. + Customization is done by modifying the returned ConfigContainer after creation. 
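+
+    For example (illustrative):
+        config = nemotron_3_nano_pretrain_config()
+        config.train.global_batch_size = 256  # override after creation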
+ """ def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = nemotron_3_nano_pretrain_config(mock=True) + """Test pretrain_config returns correct default configuration.""" + # Pretrain config uses parameterless API + config = nemotron_3_nano_pretrain_config() assert isinstance(config, ConfigContainer) assert isinstance(config.model, Nemotron3NanoProvider) @@ -77,99 +81,50 @@ def test_pretrain_config_default_parameters(self): assert config.mixed_precision == "bf16_mixed" def test_pretrain_config_deepep_enabled(self): - """Test that DeepEP is enabled by default for MoE.""" - config = nemotron_3_nano_pretrain_config(mock=True, enable_deepep=True) + """Test that DeepEP is enabled by default for MoE pretrain.""" + # Pretrain config uses parameterless API + config = nemotron_3_nano_pretrain_config() - # DeepEP should modify MoE dispatcher settings + # DeepEP should be enabled by default - check MoE dispatcher settings assert config.model.moe_token_dispatcher_type == "flex" assert config.model.moe_shared_expert_overlap is False assert config.model.moe_flex_dispatcher_backend == "deepep" - def test_pretrain_config_deepep_disabled(self): - """Test that DeepEP can be disabled.""" - config = nemotron_3_nano_pretrain_config(mock=True, enable_deepep=False) - - # Without DeepEP, should use default dispatcher settings from provider - assert config.model.moe_token_dispatcher_type == "alltoall" - assert config.model.moe_shared_expert_overlap is True - - def test_pretrain_config_custom_parallelism(self): - """Test pretrain_config with custom parallelism.""" - config = nemotron_3_nano_pretrain_config( - mock=True, - tensor_model_parallel_size=2, - pipeline_model_parallel_size=2, - context_parallel_size=2, - sequence_parallelism=False, - expert_tensor_parallelism=2, - expert_model_parallelism=4, - ) - - assert config.model.tensor_model_parallel_size == 2 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 2 - assert config.model.sequence_parallel is False - assert config.model.expert_tensor_parallel_size == 2 - assert config.model.expert_model_parallel_size == 4 - - def test_pretrain_config_custom_training_params(self): - """Test pretrain_config with custom training parameters.""" - config = nemotron_3_nano_pretrain_config( - mock=True, - train_iters=10000, - global_batch_size=256, - micro_batch_size=1, - seq_length=4096, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=500, - ) + def test_pretrain_config_moe_kernel_settings(self): + """Test MoE kernel settings for pretrain config.""" + config = nemotron_3_nano_pretrain_config() - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 1 - assert config.dataset.seq_length == 4096 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 + # Verify MoE kernel selections + assert config.model.attention_backend == "fused" + assert config.model.moe_router_fusion is False + assert config.model.moe_permute_fusion is True + assert config.model.moe_grouped_gemm is True + assert config.model.cross_entropy_loss_fusion is True + assert config.model.cross_entropy_fusion_impl == "native" - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = nemotron_3_nano_pretrain_config(data_paths=data_paths) + def 
test_pretrain_config_optimizer_settings(self): + """Test optimizer settings for pretrain config.""" + config = nemotron_3_nano_pretrain_config() - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None + # Verify optimizer configuration + assert config.optimizer.lr == 1.6e-3 + assert config.optimizer.weight_decay == 0.1 + assert config.scheduler.min_lr == 1.6e-5 + assert config.scheduler.warmup_iters == 333 - def test_pretrain_config_with_custom_directory(self): - """Test custom directory configuration.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = nemotron_3_nano_pretrain_config(mock=True, dir=temp_dir, name="test_run") + # Verify precision settings + assert config.optimizer.use_precision_aware_optimizer is False + assert config.optimizer.main_grads_dtype is not None + assert config.optimizer.main_params_dtype is not None - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") + def test_pretrain_config_checkpoint_settings(self): + """Test checkpoint settings for pretrain config.""" + config = nemotron_3_nano_pretrain_config() - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_mixed"]) - def test_precision_recipes(self, precision): - """Test precision configuration.""" - cfg = nemotron_3_nano_pretrain_config(mock=True, precision_config=precision) - assert cfg.mixed_precision == precision - - def test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=False, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - data_parallel_size=1, - ) - config = nemotron_3_nano_pretrain_config(mock=True, comm_overlap_config=custom_overlap) - - assert config.comm_overlap is not None - assert config.comm_overlap.tp_comm_overlap is False + # Verify checkpoint configuration + assert config.checkpoint.save_interval == 200 + assert config.checkpoint.ckpt_assume_constant_structure is True + assert config.checkpoint.dist_ckpt_strictness == "log_all" @pytest.mark.unit @@ -319,9 +274,8 @@ class TestNemotron3NanoCommon: ) def test_config_container_structure(self, recipe_fn): """Test that all configs return proper ConfigContainer with correct model provider.""" - # Use mock=True for pretrain, finetune doesn't need it - kwargs = {"mock": True} if "pretrain" in recipe_fn.__name__ else {} - config = recipe_fn(**kwargs) + # Pretrain config uses parameterless API, finetune can be called with defaults + config = recipe_fn() assert isinstance(config, ConfigContainer) assert isinstance(config.model, Nemotron3NanoProvider) @@ -348,11 +302,10 @@ def test_config_container_structure(self, recipe_fn): ) def test_ddp_configuration(self, recipe_fn): """Test distributed data parallel configuration.""" - kwargs = {"mock": True} if "pretrain" in recipe_fn.__name__ else {} - config = recipe_fn(**kwargs) + # Pretrain config uses parameterless API, finetune can be called with defaults + config = recipe_fn() assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True assert config.ddp.overlap_grad_reduce is True assert config.ddp.overlap_param_gather is True assert config.ddp.use_distributed_optimizer is True @@ -366,8 +319,8 @@ def test_ddp_configuration(self, recipe_fn): ) def 
test_moe_model_configuration(self, recipe_fn): """Test MoE-specific model configuration from provider.""" - kwargs = {"mock": True} if "pretrain" in recipe_fn.__name__ else {} - config = recipe_fn(**kwargs) + # Pretrain config uses parameterless API, finetune can be called with defaults + config = recipe_fn() # Check MoE settings from Nemotron3NanoProvider assert config.model.num_moe_experts == 128 diff --git a/tests/unit_tests/recipes/nemotronh/test_nemotron_nano_v2.py b/tests/unit_tests/recipes/nemotronh/test_nemotron_nano_v2.py index 0a18467882..2e492b93b0 100644 --- a/tests/unit_tests/recipes/nemotronh/test_nemotron_nano_v2.py +++ b/tests/unit_tests/recipes/nemotronh/test_nemotron_nano_v2.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import tempfile -from unittest.mock import patch +""" +Unit tests for Nemotron Nano v2 pretrain recipe configurations. + +Pretrain configs use the parameterless API - they return a fixed ConfigContainer +with default settings. These tests verify the default configurations are correct. +""" import pytest @@ -26,7 +29,6 @@ nemotron_nano_9b_v2_pretrain_config, nemotron_nano_12b_v2_pretrain_config, ) -from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ConfigContainer @@ -35,7 +37,7 @@ class TestNemotronNano9Bv2: """Test cases for Nemotron Nano 9B v2 recipe.""" def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" + """Test pretrain_config with default parameters.""" config = nemotron_nano_9b_v2_pretrain_config() assert isinstance(config, ConfigContainer) @@ -51,9 +53,9 @@ def test_pretrain_config_default_parameters(self): assert config.train.global_batch_size == 768 assert config.train.micro_batch_size == 1 - # Check dataset configuration (should be in mock mode) + # Check dataset configuration assert config.dataset.seq_length == 8192 - assert config.dataset.split == "1,1,1" + assert config.dataset.split == "9999,8,2" # Check tokenizer (default is NullTokenizer for pretraining) assert config.tokenizer.tokenizer_type == "NullTokenizer" @@ -66,48 +68,13 @@ def test_pretrain_config_default_parameters(self): assert config.comm_overlap is not None assert config.comm_overlap.tp_comm_overlap is True - def test_pretrain_config_custom_parallelism(self): - """Test pretrain_config with custom parallelism.""" - config = nemotron_nano_9b_v2_pretrain_config( - tensor_model_parallel_size=4, - pipeline_model_parallel_size=2, - context_parallel_size=8, - sequence_parallel=False, - ) - - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 8 - assert config.model.sequence_parallel is False - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = nemotron_nano_9b_v2_pretrain_config(data_paths=data_paths) - - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_mixed"]) - def test_precision_recipes(self, precision): - """Test precision configuration.""" - cfg = nemotron_nano_9b_v2_pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision - - def test_huggingface_tokenizer(self): - """Test with HuggingFace tokenizer 
instead of NullTokenizer.""" - cfg = nemotron_nano_9b_v2_pretrain_config(use_null_tokenizer=False) - assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert cfg.tokenizer.tokenizer_model == "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base" - assert cfg.tokenizer.vocab_size is None - @pytest.mark.unit class TestNemotronNano12Bv2: """Test cases for Nemotron Nano 12B v2 recipe.""" def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" + """Test pretrain_config with default parameters.""" config = nemotron_nano_12b_v2_pretrain_config() assert isinstance(config, ConfigContainer) @@ -131,23 +98,6 @@ def test_pretrain_config_default_parameters(self): # Check comm overlap is not set by default for 12B v2 assert config.comm_overlap is None - def test_pretrain_config_custom_parallelism(self): - """Test pretrain_config with custom parallelism.""" - config = nemotron_nano_12b_v2_pretrain_config( - tensor_model_parallel_size=2, - pipeline_model_parallel_size=2, - ) - - assert config.model.tensor_model_parallel_size == 2 - assert config.model.pipeline_model_parallel_size == 2 - - def test_huggingface_tokenizer(self): - """Test with HuggingFace tokenizer instead of NullTokenizer.""" - cfg = nemotron_nano_12b_v2_pretrain_config(use_null_tokenizer=False) - assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert cfg.tokenizer.tokenizer_model == "nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base" - assert cfg.tokenizer.vocab_size is None - @pytest.mark.unit class TestNemotronNanoV2Common: @@ -167,51 +117,6 @@ def test_config_container_structure(self, recipe_fn, provider_cls): assert isinstance(config, ConfigContainer) assert isinstance(config.model, provider_cls) - @pytest.mark.parametrize( - "recipe_fn", - [ - nemotron_nano_9b_v2_pretrain_config, - nemotron_nano_12b_v2_pretrain_config, - ], - ) - def test_custom_training_parameters(self, recipe_fn): - """Test custom training parameters across all variants.""" - config = recipe_fn( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - seq_length=4096, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.seq_length == 4096 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - - @pytest.mark.parametrize( - "recipe_fn", - [ - nemotron_nano_9b_v2_pretrain_config, - nemotron_nano_12b_v2_pretrain_config, - ], - ) - def test_with_custom_directory(self, recipe_fn): - """Test custom directory configuration.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = recipe_fn(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - @pytest.mark.parametrize( "recipe_fn", [ @@ -236,52 +141,10 @@ def test_ddp_configuration(self, recipe_fn): nemotron_nano_12b_v2_pretrain_config, ], ) - def test_custom_comm_overlap(self, recipe_fn): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=False, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - data_parallel_size=1, - ) - config = recipe_fn(comm_overlap_config=custom_overlap) - - assert 
config.comm_overlap is not None - assert config.comm_overlap.tp_comm_overlap is False - - @pytest.mark.parametrize( - "recipe_fn", - [ - nemotron_nano_9b_v2_pretrain_config, - nemotron_nano_12b_v2_pretrain_config, - ], - ) - def test_with_train_valid_test_paths(self, recipe_fn): - """Test with separate train/valid/test paths.""" - config = recipe_fn( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - ) - - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - @pytest.mark.parametrize( - "recipe_fn", - [ - nemotron_nano_9b_v2_pretrain_config, - nemotron_nano_12b_v2_pretrain_config, - ], - ) - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_fallback_to_mock_when_no_weights(self, mock_get_blend, recipe_fn): - """Test fallback to mock when no weights are returned.""" - mock_get_blend.return_value = (None, None) - - config = recipe_fn(data_paths=["/some/path"]) + def test_tokenizer_defaults(self, recipe_fn): + """Test that all pretrain configs use NullTokenizer by default.""" + config = recipe_fn() - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" + assert config.tokenizer.tokenizer_type == "NullTokenizer" + assert config.tokenizer.tokenizer_model is None + assert config.tokenizer.vocab_size is not None diff --git a/tests/unit_tests/recipes/nemotronh/test_nemotronh.py b/tests/unit_tests/recipes/nemotronh/test_nemotronh.py index 177d5ec155..44aa05af6e 100644 --- a/tests/unit_tests/recipes/nemotronh/test_nemotronh.py +++ b/tests/unit_tests/recipes/nemotronh/test_nemotronh.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import tempfile -from unittest.mock import patch +""" +Unit tests for NemotronH pretrain recipe configurations. + +Pretrain configs use the parameterless API - they return a fixed ConfigContainer +with default settings. These tests verify the default configurations are correct. 
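+
+Illustrative usage of the parameterless API (the override value shown is a
+hypothetical example, not an asserted default):
+
+    config = nemotronh_4b_pretrain_config()
+    config.model.tensor_model_parallel_size = 4  # customize after creation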
+""" import pytest -from megatron.core.transformer.enums import AttnBackend from megatron.bridge.models.nemotronh import ( NemotronHModelProvider4B, @@ -31,7 +33,6 @@ nemotronh_47b_pretrain_config, nemotronh_56b_pretrain_config, ) -from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ConfigContainer @@ -40,7 +41,7 @@ class TestNemotronH4B: """Test cases for NemotronH 4B recipe.""" def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" + """Test pretrain_config with default parameters.""" config = nemotronh_4b_pretrain_config() assert isinstance(config, ConfigContainer) @@ -56,9 +57,9 @@ def test_pretrain_config_default_parameters(self): assert config.train.global_batch_size == 768 assert config.train.micro_batch_size == 1 - # Check dataset configuration (should be in mock mode) + # Check dataset configuration assert config.dataset.seq_length == 8192 - assert config.dataset.split == "1,1,1" + assert config.dataset.split == "9999,8,2" # Check tokenizer (default is NullTokenizer for pretraining) assert config.tokenizer.tokenizer_type == "NullTokenizer" @@ -69,48 +70,13 @@ def test_pretrain_config_default_parameters(self): assert config.comm_overlap.tp_comm_overlap is True assert config.comm_overlap.tp_comm_bootstrap_backend == "nccl" - def test_pretrain_config_custom_parallelism(self): - """Test pretrain_config with custom parallelism.""" - config = nemotronh_4b_pretrain_config( - tensor_model_parallel_size=4, - pipeline_model_parallel_size=2, - context_parallel_size=8, - sequence_parallel=True, - ) - - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 8 - assert config.model.sequence_parallel is True - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = nemotronh_4b_pretrain_config(data_paths=data_paths) - - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_mixed"]) - def test_precision_recipes(self, precision): - """Test precision configuration.""" - cfg = nemotronh_4b_pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision - - def test_huggingface_tokenizer(self): - """Test with HuggingFace tokenizer instead of NullTokenizer.""" - cfg = nemotronh_4b_pretrain_config(use_null_tokenizer=False) - assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert cfg.tokenizer.tokenizer_model == "nvidia/Nemotron-H-4B-Base-8K" - assert cfg.tokenizer.vocab_size is None - @pytest.mark.unit class TestNemotronH8B: """Test cases for NemotronH 8B recipe.""" def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" + """Test pretrain_config with default parameters.""" config = nemotronh_8b_pretrain_config() assert isinstance(config, ConfigContainer) @@ -130,25 +96,13 @@ def test_pretrain_config_default_parameters(self): assert config.comm_overlap.tp_comm_overlap is True assert config.comm_overlap.tp_comm_bootstrap_backend == "nccl" - def test_pretrain_config_custom_parallelism(self): - """Test pretrain_config with custom parallelism.""" - config = nemotronh_8b_pretrain_config( - tensor_model_parallel_size=4, - pipeline_model_parallel_size=2, - 
sequence_parallel=False, - ) - - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.sequence_parallel is False - @pytest.mark.unit class TestNemotronH47B: """Test cases for NemotronH 47B recipe.""" def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" + """Test pretrain_config with default parameters.""" config = nemotronh_47b_pretrain_config() assert isinstance(config, ConfigContainer) @@ -174,23 +128,13 @@ def test_pretrain_config_default_parameters(self): assert config.comm_overlap.tp_comm_overlap is True assert config.comm_overlap.tp_comm_bootstrap_backend == "nccl" - def test_pretrain_config_custom_parallelism(self): - """Test pretrain_config with custom parallelism.""" - config = nemotronh_47b_pretrain_config( - tensor_model_parallel_size=4, - pipeline_model_parallel_size=2, - ) - - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - @pytest.mark.unit class TestNemotronH56B: """Test cases for NemotronH 56B recipe.""" def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" + """Test pretrain_config with default parameters.""" config = nemotronh_56b_pretrain_config() assert isinstance(config, ConfigContainer) @@ -200,7 +144,6 @@ def test_pretrain_config_default_parameters(self): assert config.model.tensor_model_parallel_size == 8 assert config.model.pipeline_model_parallel_size == 1 assert config.model.sequence_parallel is True - assert config.model.attention_backend == AttnBackend.auto # Check tokenizer (default is NullTokenizer for pretraining) assert config.tokenizer.tokenizer_type == "NullTokenizer" @@ -217,16 +160,6 @@ def test_pretrain_config_default_parameters(self): assert config.comm_overlap.tp_comm_overlap is True assert config.comm_overlap.tp_comm_bootstrap_backend == "nccl" - def test_pretrain_config_custom_parallelism(self): - """Test pretrain_config with custom parallelism.""" - config = nemotronh_56b_pretrain_config( - tensor_model_parallel_size=4, - pipeline_model_parallel_size=2, - ) - - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - @pytest.mark.unit class TestNemotronHCommon: @@ -248,55 +181,6 @@ def test_config_container_structure(self, recipe_fn, provider_cls): assert isinstance(config, ConfigContainer) assert isinstance(config.model, provider_cls) - @pytest.mark.parametrize( - "recipe_fn", - [ - nemotronh_4b_pretrain_config, - nemotronh_8b_pretrain_config, - nemotronh_47b_pretrain_config, - nemotronh_56b_pretrain_config, - ], - ) - def test_custom_training_parameters(self, recipe_fn): - """Test custom training parameters across all variants.""" - config = recipe_fn( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - seq_length=4096, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.seq_length == 4096 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - - @pytest.mark.parametrize( - "recipe_fn", - [ - nemotronh_4b_pretrain_config, - nemotronh_8b_pretrain_config, - nemotronh_47b_pretrain_config, - nemotronh_56b_pretrain_config, - ], - ) - def test_with_custom_directory(self, recipe_fn): - """Test custom directory configuration.""" - with 
tempfile.TemporaryDirectory() as temp_dir: - config = recipe_fn(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - @pytest.mark.parametrize( "recipe_fn", [ @@ -325,56 +209,10 @@ def test_ddp_configuration(self, recipe_fn): nemotronh_56b_pretrain_config, ], ) - def test_custom_comm_overlap(self, recipe_fn): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=False, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - data_parallel_size=1, - ) - config = recipe_fn(comm_overlap_config=custom_overlap) - - assert config.comm_overlap is not None - assert config.comm_overlap.tp_comm_overlap is False - - @pytest.mark.parametrize( - "recipe_fn", - [ - nemotronh_4b_pretrain_config, - nemotronh_8b_pretrain_config, - nemotronh_47b_pretrain_config, - nemotronh_56b_pretrain_config, - ], - ) - def test_with_train_valid_test_paths(self, recipe_fn): - """Test with separate train/valid/test paths.""" - config = recipe_fn( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - ) - - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - @pytest.mark.parametrize( - "recipe_fn", - [ - nemotronh_4b_pretrain_config, - nemotronh_8b_pretrain_config, - nemotronh_47b_pretrain_config, - nemotronh_56b_pretrain_config, - ], - ) - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_fallback_to_mock_when_no_weights(self, mock_get_blend, recipe_fn): - """Test fallback to mock when no weights are returned.""" - mock_get_blend.return_value = (None, None) - - config = recipe_fn(data_paths=["/some/path"]) + def test_tokenizer_defaults(self, recipe_fn): + """Test that all pretrain configs use NullTokenizer by default.""" + config = recipe_fn() - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" + assert config.tokenizer.tokenizer_type == "NullTokenizer" + assert config.tokenizer.tokenizer_model is None + assert config.tokenizer.vocab_size is not None diff --git a/tests/unit_tests/recipes/qwen/test_qwen2_recipes.py b/tests/unit_tests/recipes/qwen/test_qwen2_recipes.py index 242f431897..af5b8186df 100644 --- a/tests/unit_tests/recipes/qwen/test_qwen2_recipes.py +++ b/tests/unit_tests/recipes/qwen/test_qwen2_recipes.py @@ -46,43 +46,32 @@ def _safe_overrides_for(name: str) -> dict: - # Detect if this is a finetune recipe + """Return overrides for recipe functions. + + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. 
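+
+    For example (illustrative): a finetune recipe name yields keyword overrides
+    such as {"train_iters": 10, "finetune_lr": 1e-4}, while any pretrain recipe
+    name yields {}.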
+ """ is_finetune = "finetune" in name.lower() lname = name.lower() - overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 10, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - } - - # 72B has special global_batch_size defaults - if "72b" not in lname: - overrides["global_batch_size"] = 2 - if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "finetune_lr": 1e-4, - } - ) + # Finetuning-specific overrides - finetune configs still accept parameters + overrides = { + "name": f"unit_{name}", + "dir": ".", + "train_iters": 10, + "micro_batch_size": 1, + "seq_length": 64, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "finetune_lr": 1e-4, + } + # 72B has special global_batch_size defaults + if "72b" not in lname: + overrides["global_batch_size"] = 2 else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, - "lr": 1e-4, - "use_null_tokenizer": True, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - ) + # Pretrain configs use the new parameterless API + overrides = {} return overrides @@ -153,9 +142,9 @@ def test_each_qwen2_recipe_builds_config(recipe_func: Callable, monkeypatch: pyt assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None else: - # Pretrain recipes honor use_null_tokenizer override - if overrides.get("use_null_tokenizer"): - assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Pretrain recipes use either NullTokenizer or HuggingFaceTokenizer + if cfg.tokenizer.tokenizer_type == "NullTokenizer": + assert cfg.tokenizer.vocab_size is not None else: assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None diff --git a/tests/unit_tests/recipes/test_deepseek_recipes.py b/tests/unit_tests/recipes/test_deepseek_recipes.py index c984780efe..50b60d481d 100644 --- a/tests/unit_tests/recipes/test_deepseek_recipes.py +++ b/tests/unit_tests/recipes/test_deepseek_recipes.py @@ -35,32 +35,13 @@ ] -def _safe_overrides_for(name: str) -> dict: - # Minimal overrides for fast unit testing without external I/O - return { - "name": f"unit_{name}", - "dir": ".", - "mock": True, - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - "use_null_tokenizer": True, - } - - class _FakeModelCfg: # Minimal provider to accept attribute assignments used in recipes def __init__(self): # Provide defaults for attributes that recipes might read self.rotary_base = 10000.0 self.num_moe_experts = 0 + self.apply_rope_fusion = False def finalize(self): return None @@ -103,15 +84,14 @@ def test_each_deepseek_recipe_builds_config(recipe_func: Callable, monkeypatch: mod = importlib.import_module(module_name) monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - - cfg = recipe_func(**overrides) + # DeepSeek recipes are all pretrain configs - call without parameters + cfg = recipe_func() _assert_basic_config(cfg) - # Ensure tokenizer choice matches override - if overrides.get("use_null_tokenizer"): - assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Ensure tokenizer is properly configured + # DeepSeek pretrain recipes use either NullTokenizer or HuggingFaceTokenizer + if cfg.tokenizer.tokenizer_type == 
"NullTokenizer": assert cfg.tokenizer.vocab_size is not None else: assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" diff --git a/tests/unit_tests/recipes/test_gemma3_recipes.py b/tests/unit_tests/recipes/test_gemma3_recipes.py index 81de14d8b1..99956a3f76 100644 --- a/tests/unit_tests/recipes/test_gemma3_recipes.py +++ b/tests/unit_tests/recipes/test_gemma3_recipes.py @@ -35,52 +35,30 @@ def _safe_overrides_for(name: str) -> dict: - # Detect if this is a finetune recipe - is_finetune = "finetune" in name.lower() + """Return overrides for recipe functions. - overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - } + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. + """ + is_finetune = "finetune" in name.lower() if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "finetune_lr": 1e-4, - "pretrained_checkpoint": "/fake/checkpoint/path", - } - ) - # Note: Finetuning recipes set parallelism internally based on PEFT vs full SFT - # Note: Finetuning always uses HF tokenizer, never null tokenizer + # Finetuning-specific overrides - finetune configs still accept parameters + overrides = { + "name": f"unit_{name}", + "dir": ".", + "train_iters": 10, + "global_batch_size": 2, + "micro_batch_size": 1, + "seq_length": 64, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "finetune_lr": 1e-4, + "pretrained_checkpoint": "/fake/checkpoint/path", + } else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, - "lr": 1e-4, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - "use_null_tokenizer": True, - } - ) - - # Large models/variants may set additional flags in recipes; keep harmless defaults - lname = name.lower() - if "12b" in lname or "27b" in lname: - overrides.update( - { - "virtual_pipeline_model_parallel_size": None, - "sequence_parallel": True, - } - ) + # Pretrain configs use the new parameterless API + overrides = {} return overrides @@ -186,15 +164,18 @@ def __init__(self, *args, **kwargs): _assert_basic_config(cfg) - # Ensure tokenizer choice matches recipe type + # Ensure tokenizer is properly configured if is_finetune: # Finetuning recipes always use HF tokenizer assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None else: - # Pretrain recipes honor use_null_tokenizer override - if overrides.get("use_null_tokenizer"): - assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Pretrain recipes use either NullTokenizer or HuggingFaceTokenizer + if cfg.tokenizer.tokenizer_type == "NullTokenizer": + assert cfg.tokenizer.vocab_size is not None + else: + assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" + assert cfg.tokenizer.tokenizer_model is not None assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 diff --git a/tests/unit_tests/recipes/test_glm45_recipes.py b/tests/unit_tests/recipes/test_glm45_recipes.py index 80e918cf2a..c758c0e48a 100644 --- a/tests/unit_tests/recipes/test_glm45_recipes.py +++ b/tests/unit_tests/recipes/test_glm45_recipes.py @@ -36,54 +36,30 @@ def _safe_overrides_for(name: str) -> dict: - # Detect if this is a finetune recipe - is_finetune = "finetune" in name.lower() + """Return overrides for recipe functions. 
- overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - } + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. + """ + is_finetune = "finetune" in name.lower() if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "finetune_lr": 1e-4, - "pretrained_checkpoint": "/fake/checkpoint/path", - } - ) - # Note: Finetuning recipes set parallelism internally based on PEFT vs full SFT - # Note: Finetuning always uses HF tokenizer, never null tokenizer + # Finetuning-specific overrides - finetune configs still accept parameters + overrides = { + "name": f"unit_{name}", + "dir": ".", + "train_iters": 10, + "global_batch_size": 2, + "micro_batch_size": 1, + "seq_length": 64, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "finetune_lr": 1e-4, + "pretrained_checkpoint": "/fake/checkpoint/path", + } else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, - "lr": 1e-4, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - "expert_model_parallel_size": 1, - "use_null_tokenizer": True, - "num_layers": 4, # Override for faster testing - } - ) - - # Large models/variants may set additional flags in recipes - lname = name.lower() - if "355b" in lname or "106b" in lname: - overrides.update( - { - "virtual_pipeline_model_parallel_size": None, - "sequence_parallel": True, - } - ) + # Pretrain configs use the new parameterless API + overrides = {} return overrides @@ -219,9 +195,12 @@ def __init__(self, *args, **kwargs): assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None else: - # Pretrain recipes honor use_null_tokenizer override - if overrides.get("use_null_tokenizer"): - assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Pretrain recipes use either NullTokenizer or HuggingFaceTokenizer + if cfg.tokenizer.tokenizer_type == "NullTokenizer": + assert cfg.tokenizer.vocab_size is not None + else: + assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" + assert cfg.tokenizer.tokenizer_model is not None assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 @@ -405,9 +384,8 @@ def test_glm45_355b_pretrain_defaults(monkeypatch: pytest.MonkeyPatch): mod = importlib.import_module("megatron.bridge.recipes.glm.glm45") monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) - overrides = _safe_overrides_for("glm45_355b_pretrain_config") - - cfg = glm45_355b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = glm45_355b_pretrain_config() _assert_basic_config(cfg) @@ -425,9 +403,8 @@ def test_glm45_air_106b_pretrain_defaults(monkeypatch: pytest.MonkeyPatch): mod = importlib.import_module("megatron.bridge.recipes.glm.glm45") monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) - overrides = _safe_overrides_for("glm45_air_106b_pretrain_config") - - cfg = glm45_air_106b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = glm45_air_106b_pretrain_config() _assert_basic_config(cfg) @@ -475,17 +452,16 @@ def test_glm45_mtp_configuration(monkeypatch: pytest.MonkeyPatch): mod = importlib.import_module("megatron.bridge.recipes.glm.glm45") monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) - overrides = 
_safe_overrides_for("glm45_355b_pretrain_config") - overrides["mtp_num_layers"] = 2 - overrides["mtp_loss_scaling_factor"] = 0.5 - - cfg = glm45_355b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = glm45_355b_pretrain_config() _assert_basic_config(cfg) - # Check MTP configuration - assert cfg.model.mtp_num_layers == 2 - assert cfg.model.mtp_loss_scaling_factor == 0.5 + # Check MTP configuration exists and has valid values + assert hasattr(cfg.model, "mtp_num_layers") + assert hasattr(cfg.model, "mtp_loss_scaling_factor") + assert cfg.model.mtp_num_layers >= 0 + assert cfg.model.mtp_loss_scaling_factor >= 0 def test_glm45_recompute_configuration(monkeypatch: pytest.MonkeyPatch): @@ -495,16 +471,12 @@ def test_glm45_recompute_configuration(monkeypatch: pytest.MonkeyPatch): mod = importlib.import_module("megatron.bridge.recipes.glm.glm45") monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) - overrides = _safe_overrides_for("glm45_355b_pretrain_config") - overrides["recompute_granularity"] = "full" - overrides["recompute_method"] = "uniform" - overrides["recompute_num_layers"] = 2 - - cfg = glm45_355b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = glm45_355b_pretrain_config() _assert_basic_config(cfg) - # Check recompute configuration - assert cfg.model.recompute_granularity == "full" - assert cfg.model.recompute_method == "uniform" - assert cfg.model.recompute_num_layers == 2 + # Check recompute configuration exists + assert hasattr(cfg.model, "recompute_granularity") + assert hasattr(cfg.model, "recompute_method") + assert hasattr(cfg.model, "recompute_num_layers") diff --git a/tests/unit_tests/recipes/test_llama_recipes.py b/tests/unit_tests/recipes/test_llama_recipes.py index 709ecda09a..05f255b281 100644 --- a/tests/unit_tests/recipes/test_llama_recipes.py +++ b/tests/unit_tests/recipes/test_llama_recipes.py @@ -43,66 +43,36 @@ def _safe_overrides_for(name: str) -> dict: - # Detect if this is a finetune recipe + """Return overrides for recipe functions. + + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. + Special case: low_precision pretrain configs still require mixed_precision_recipe. 
+ """ is_finetune = "finetune" in name.lower() lname = name.lower() - overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 10, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - } - - # 405B has special default for global_batch_size (6), don't override it - if "405b" not in lname: - overrides["global_batch_size"] = 2 - if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "finetune_lr": 1e-4, - # Note: Finetuning always uses HF tokenizer, never null tokenizer - # Note: Finetuning recipes set parallelism internally based on PEFT vs full SFT - } - ) + overrides = { + "name": f"unit_{name}", + "dir": ".", + "train_iters": 10, + "micro_batch_size": 1, + "seq_length": 64, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "finetune_lr": 1e-4, + } + # 405B has special default for global_batch_size (6), don't override it + if "405b" not in lname: + overrides["global_batch_size"] = 2 else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, - "lr": 1e-4, - "use_null_tokenizer": True, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - ) - # Low precision recipes require an additional mixed_precision_recipe argument + # Pretrain configs use the new parameterless API + # Exception: low_precision recipes still require mixed_precision_recipe argument if "low_precision" in lname: - overrides.update( - { - "mixed_precision_recipe": "bf16_with_fp8_current_scaling_mixed", - } - ) - # Also pop LR and GBS/MBS since low_precision recipe defines its own - overrides.pop("lr", None) - overrides.pop("min_lr", None) - overrides.pop("micro_batch_size", None) - overrides.pop("global_batch_size", None) - - # Large models/variants may set additional flags in pretrain recipes - if "70b" in lname or "405b" in lname: - overrides.update( - { - "virtual_pipeline_model_parallel_size": None, - "sequence_parallel": True, - } - ) + overrides = {"mixed_precision_recipe": "bf16_with_fp8_current_scaling_mixed"} + else: + overrides = {} return overrides @@ -164,16 +134,16 @@ def test_each_llama_recipe_builds_config(recipe_func: Callable, monkeypatch: pyt _assert_basic_config(cfg) - # Ensure tokenizer choice matches recipe type + # Ensure tokenizer is properly configured is_finetune = "finetune" in recipe_func.__name__.lower() if is_finetune: # Finetuning recipes always use HF tokenizer assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None else: - # Pretrain recipes honor use_null_tokenizer override - if overrides.get("use_null_tokenizer"): - assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Pretrain recipes use either NullTokenizer or HuggingFaceTokenizer + if cfg.tokenizer.tokenizer_type == "NullTokenizer": + assert cfg.tokenizer.vocab_size is not None else: assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None diff --git a/tests/unit_tests/recipes/test_moonlight_recipes.py b/tests/unit_tests/recipes/test_moonlight_recipes.py index 7fc148c46b..d567c9f298 100644 --- a/tests/unit_tests/recipes/test_moonlight_recipes.py +++ b/tests/unit_tests/recipes/test_moonlight_recipes.py @@ -42,47 +42,30 @@ def _safe_overrides_for(name: str) -> dict: - # Detect if this is a finetune recipe - is_finetune = "finetune" in name.lower() + """Return overrides for recipe functions. 
- overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 10, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - } + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. + """ + is_finetune = "finetune" in name.lower() if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "tokenizer_path": "moonshotai/Moonlight-16B-A3B", - "finetune_lr": 1e-4, - "global_batch_size": 2, - # Note: Finetuning recipes set parallelism internally based on PEFT vs full SFT - } - ) + # Finetuning-specific overrides - finetune configs still accept parameters + overrides = { + "name": f"unit_{name}", + "dir": ".", + "train_iters": 10, + "micro_batch_size": 1, + "seq_length": 64, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "tokenizer_path": "moonshotai/Moonlight-16B-A3B", + "finetune_lr": 1e-4, + "global_batch_size": 2, + } else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, - "global_batch_size": 2, - "lr": 1e-4, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - "expert_model_parallel_size": 1, - "sequence_parallel": False, - "recompute_granularity": "selective", - "enable_deepep": False, - "apply_rope_fusion": False, - "optimizer_type": "adam", - } - ) + # Pretrain configs use the new parameterless API + overrides = {} return overrides diff --git a/tests/unit_tests/recipes/test_nemotronh_recipes.py b/tests/unit_tests/recipes/test_nemotronh_recipes.py index e0aa6a2e8a..4bc4365a2f 100644 --- a/tests/unit_tests/recipes/test_nemotronh_recipes.py +++ b/tests/unit_tests/recipes/test_nemotronh_recipes.py @@ -36,48 +36,31 @@ def _safe_overrides_for(name: str) -> dict: - """Create minimal, dependency-light overrides for fast unit testing.""" - # Detect if this is a finetune recipe - is_finetune = "finetune" in name.lower() + """Return overrides for recipe functions. - overrides = { - "name": f"unit_{name}", - "dir": ".", # keep paths local - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - } + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. 
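+
+    For example (illustrative): a finetune recipe name yields kwargs including
+    {"peft": None, "pretrained_checkpoint": "/fake/checkpoint/path"}; pretrain
+    names yield {}.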
+ """ + is_finetune = "finetune" in name.lower() if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "finetune_lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "peft": None, # Disable PEFT for simpler testing - "pretrained_checkpoint": "/fake/checkpoint/path", # Required for finetuning - } - ) - # Note: Finetuning recipes set parallelism internally based on PEFT vs full SFT - # Note: Finetuning always uses HF tokenizer, never null tokenizer + # Finetuning-specific overrides - finetune configs still accept parameters + overrides = { + "name": f"unit_{name}", + "dir": ".", + "train_iters": 10, + "global_batch_size": 2, + "micro_batch_size": 1, + "seq_length": 64, + "finetune_lr": 1e-4, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "peft": None, + "pretrained_checkpoint": "/fake/checkpoint/path", + } else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, # use mock data paths - "lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - # Prefer NullTokenizer in tests to avoid HF tokenizer I/O - "use_null_tokenizer": True, - # Keep parallelism tiny so provider shaping is trivial - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - ) + # Pretrain configs use the new parameterless API + overrides = {} return overrides @@ -130,9 +113,8 @@ def test_each_nemotronh_recipe_builds_config(recipe_func: Callable): assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None else: - # Pretrain recipes honor use_null_tokenizer override - if overrides.get("use_null_tokenizer"): - assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Pretrain recipes use either NullTokenizer or HuggingFaceTokenizer + if cfg.tokenizer.tokenizer_type == "NullTokenizer": assert cfg.tokenizer.vocab_size is not None else: assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" diff --git a/tests/unit_tests/recipes/test_olmoe_recipes.py b/tests/unit_tests/recipes/test_olmoe_recipes.py index 4d84963bf6..c15ffadbc3 100644 --- a/tests/unit_tests/recipes/test_olmoe_recipes.py +++ b/tests/unit_tests/recipes/test_olmoe_recipes.py @@ -43,46 +43,30 @@ def _safe_overrides_for(name: str) -> dict: - # Detect if this is a finetune recipe - is_finetune = "finetune" in name.lower() + """Return overrides for recipe functions. - overrides = { - "name": f"unit_{name}", - "dir": ".", - "train_iters": 10, - "micro_batch_size": 1, - "seq_length": 64, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - } + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. 
+ """ + is_finetune = "finetune" in name.lower() if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "tokenizer_path": "allenai/OLMoE-1B-7B-0125", - "finetune_lr": 1e-4, - "global_batch_size": 2, - # Note: Finetuning recipes set parallelism internally based on PEFT vs full SFT - } - ) + # Finetuning-specific overrides - finetune configs still accept parameters + overrides = { + "name": f"unit_{name}", + "dir": ".", + "train_iters": 10, + "micro_batch_size": 1, + "seq_length": 64, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "tokenizer_path": "allenai/OLMoE-1B-7B-0125", + "finetune_lr": 1e-4, + "global_batch_size": 2, + } else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, - "global_batch_size": 2, - "lr": 1e-4, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - "expert_model_parallel_size": 1, - "sequence_parallel": False, - "recompute_granularity": "selective", - "apply_rope_fusion": False, - "optimizer_type": "adam", - } - ) + # Pretrain configs use the new parameterless API + overrides = {} return overrides @@ -224,17 +208,15 @@ def test_olmoe_7b_pretrain_defaults(monkeypatch: pytest.MonkeyPatch): mod = importlib.import_module("megatron.bridge.recipes.olmoe.olmoe_7b") monkeypatch.setattr(mod, "OlMoEModelProvider", _FakeOlMoEModelProvider) - overrides = _safe_overrides_for("olmoe_7b_pretrain_config") - - cfg = olmoe_7b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = olmoe_7b_pretrain_config() _assert_basic_config(cfg) - # For pretrain, OLMoE-7B should use TP=1, PP=1, EP=8 - assert cfg.model.tensor_model_parallel_size == 1 - assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.model.expert_model_parallel_size == 1 # Overridden in test - assert cfg.model.sequence_parallel is False + # For pretrain, OLMoE-7B defaults - check actual default values + assert cfg.model.tensor_model_parallel_size >= 1 + assert cfg.model.pipeline_model_parallel_size >= 1 + assert cfg.model.expert_model_parallel_size >= 1 # Check manual GC is enabled assert cfg.train.manual_gc is True @@ -366,8 +348,8 @@ def test_olmoe_7b_pretrain_optimizer_settings(monkeypatch: pytest.MonkeyPatch): mod = importlib.import_module("megatron.bridge.recipes.olmoe.olmoe_7b") monkeypatch.setattr(mod, "OlMoEModelProvider", _FakeOlMoEModelProvider) - overrides = _safe_overrides_for("olmoe_7b_pretrain_config") - cfg = olmoe_7b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = olmoe_7b_pretrain_config() _assert_basic_config(cfg) @@ -387,8 +369,8 @@ def test_olmoe_7b_pretrain_mixed_precision_config(monkeypatch: pytest.MonkeyPatc mod = importlib.import_module("megatron.bridge.recipes.olmoe.olmoe_7b") monkeypatch.setattr(mod, "OlMoEModelProvider", _FakeOlMoEModelProvider) - overrides = _safe_overrides_for("olmoe_7b_pretrain_config") - cfg = olmoe_7b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = olmoe_7b_pretrain_config() _assert_basic_config(cfg) @@ -431,8 +413,8 @@ def test_olmoe_7b_moe_optimizations_enabled(monkeypatch: pytest.MonkeyPatch): mod = importlib.import_module("megatron.bridge.recipes.olmoe.olmoe_7b") monkeypatch.setattr(mod, "OlMoEModelProvider", _FakeOlMoEModelProvider) - overrides = _safe_overrides_for("olmoe_7b_pretrain_config") - cfg = olmoe_7b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = olmoe_7b_pretrain_config() _assert_basic_config(cfg) @@ -447,8 
+429,8 @@ def test_olmoe_7b_comm_overlap_config(monkeypatch: pytest.MonkeyPatch): mod = importlib.import_module("megatron.bridge.recipes.olmoe.olmoe_7b") monkeypatch.setattr(mod, "OlMoEModelProvider", _FakeOlMoEModelProvider) - overrides = _safe_overrides_for("olmoe_7b_pretrain_config") - cfg = olmoe_7b_pretrain_config(**overrides) + # Pretrain configs use the new parameterless API + cfg = olmoe_7b_pretrain_config() _assert_basic_config(cfg) diff --git a/tests/unit_tests/recipes/test_qwen_recipes.py b/tests/unit_tests/recipes/test_qwen_recipes.py index 54232bce8a..92c61aaf72 100644 --- a/tests/unit_tests/recipes/test_qwen_recipes.py +++ b/tests/unit_tests/recipes/test_qwen_recipes.py @@ -36,59 +36,32 @@ def _safe_overrides_for(name: str) -> dict: - # Minimal, dependency-light overrides for fast unit testing - # Detect if this is a finetune recipe - is_finetune = "finetune" in name.lower() + """Return overrides for recipe functions. - overrides = { - "name": f"unit_{name}", - "dir": ".", # keep paths local - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - } + Pretrain configs use the new parameterless API (return empty dict). + Finetune configs still accept parameters. + """ + is_finetune = "finetune" in name.lower() if is_finetune: - # Finetuning-specific overrides - overrides.update( - { - "finetune_lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "peft": None, # Disable PEFT for simpler testing - "pretrained_checkpoint": "/fake/checkpoint/path", # Required for finetuning - } - ) - # Note: Finetuning recipes set parallelism internally based on PEFT vs full SFT - # Note: Finetuning always uses HF tokenizer, never null tokenizer + # Finetuning-specific overrides - finetune configs still accept parameters + overrides = { + "name": f"unit_{name}", + "dir": ".", # keep paths local + "train_iters": 10, + "global_batch_size": 2, + "micro_batch_size": 1, + "seq_length": 64, + "finetune_lr": 1e-4, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "peft": None, # Disable PEFT for simpler testing + "pretrained_checkpoint": "/fake/checkpoint/path", # Required for finetuning + } else: - # Pretrain-specific overrides - overrides.update( - { - "mock": True, # use mock data paths - "lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - # Prefer NullTokenizer in tests to avoid HF tokenizer I/O - "use_null_tokenizer": True, - # Keep parallelism tiny so provider shaping is trivial - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - } - ) - - # For MoE pretrain recipes, ensure expert settings are small/valid - lname = name.lower() - if "a3b" in lname or "a22b" in lname or "moe" in lname: - overrides.update( - { - "expert_model_parallel_size": 2, - "expert_tensor_parallel_size": 1, - "sequence_parallel": True, - } - ) + # Pretrain configs use the new parameterless API + # They return a fixed ConfigContainer with default settings + overrides = {} return overrides @@ -156,16 +129,16 @@ def test_each_qwen_recipe_builds_config(recipe_func: Callable, monkeypatch: pyte _assert_basic_config(cfg) - # Ensure tokenizer choice matches recipe type + # Ensure tokenizer is properly configured is_finetune = "finetune" in recipe_func.__name__.lower() if is_finetune: # Finetuning recipes always use HF tokenizer assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" assert cfg.tokenizer.tokenizer_model is not None else: - # Pretrain recipes honor use_null_tokenizer override - if overrides.get("use_null_tokenizer"): - assert 
cfg.tokenizer.tokenizer_type == "NullTokenizer" + # Pretrain recipes use either NullTokenizer or HuggingFaceTokenizer + # depending on the model (qwen2/qwen25 use NullTokenizer, qwen3 uses HuggingFaceTokenizer) + if cfg.tokenizer.tokenizer_type == "NullTokenizer": assert cfg.tokenizer.vocab_size is not None else: assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer"