13 changes: 8 additions & 5 deletions examples/quantization/pretrain_quantized_llama3_8b.py
@@ -152,14 +152,17 @@ def main() -> None:
logger.info("------------------------------------------------------------------")

# Load base configuration from the recipe as a Python dataclass
# If --hf-path is provided, pass it to the recipe function
recipe_kwargs = {}
# Pretrain configs use parameterless API
cfg: ConfigContainer = pretrain_config()
logger.info("Loaded base configuration")

# If --hf-path is provided, override the model's HuggingFace path
if args.hf_path:
logger.info(f"Using custom HuggingFace path: {args.hf_path}")
recipe_kwargs["hf_path"] = args.hf_path
# Import AutoBridge to create a new model provider with the custom HF path
from megatron.bridge.models import AutoBridge

cfg: ConfigContainer = pretrain_config(**recipe_kwargs)
logger.info("Loaded base configuration")
cfg.model = AutoBridge.from_hf_pretrained(args.hf_path).to_megatron_provider(load_weights=False)

# Print configuration on rank 0
if get_rank_safe() == 0:
102 changes: 61 additions & 41 deletions scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
@@ -18,7 +18,12 @@
from utils.precision import get_precision_config
from utils.utils import get_workload_base_config

from megatron.bridge.recipes.deepseek.deepseek_v3 import deepseek_v3_pretrain_config as pretrain_config
from megatron.bridge.recipes.deepseek.deepseek_v3 import (
deepseek_v3_pretrain_config as pretrain_config,
)
from megatron.bridge.recipes.deepseek.deepseek_v3 import (
set_deepseek_v3_pipeline_model_parallel_layout,
)
from megatron.bridge.training.config import ConfigContainer


@@ -54,14 +59,19 @@ def deepseek_v3_pretrain_config_gb300(
)
precision_config = get_precision_config(precision)

cfg = pretrain_config(
mock=mock,
precision_config=precision_config,
pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
layout=base_cfg.pp_layout,
)
cfg = pretrain_config()
cfg.mixed_precision = precision_config

# Apply model-specific settings that were previously passed as constructor args
cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
if base_cfg.pp_layout:
cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
else:
# Recompute layout based on updated PP/VP sizes
set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)

set_deepseek_v3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -89,14 +99,19 @@ def deepseek_v3_pretrain_config_gb200(
)
precision_config = get_precision_config(precision)

cfg = pretrain_config(
mock=mock,
precision_config=precision_config,
pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
layout=base_cfg.pp_layout,
)
cfg = pretrain_config()
cfg.mixed_precision = precision_config

# Apply model-specific settings that were previously passed as constructor args
cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
if base_cfg.pp_layout:
cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
else:
# Recompute layout based on updated PP/VP sizes
set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)

set_deepseek_v3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -124,14 +139,16 @@ def deepseek_v3_pretrain_config_b300(
)
precision_config = get_precision_config(precision)

cfg = pretrain_config(
mock=mock,
precision_config=precision_config,
pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
layout=None,
)
cfg = pretrain_config()
cfg.mixed_precision = precision_config

# Apply model-specific settings that were previously passed as constructor args
cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
# Recompute layout based on updated PP/VP sizes
set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)

Comment on lines +142 to +151
⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

# First, run the provided script to check pp_layout usage
rg -n "pp_layout" scripts/performance/ --glob "*.yaml" --glob "*.json" -A2 -B2 | head -100

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 53


🏁 Script executed:

# Also check Python files for pp_layout references
rg -n "pp_layout" scripts/performance/ --glob "*.py" -A2 -B2

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 2391


🏁 Script executed:

# Find gb300 and gb200 config files to compare with b300
fd -e py "gb300|gb200|b300" scripts/performance/configs/deepseek/

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 53


🏁 Script executed:

# Look at the actual deepseek_llm_pretrain.py file around lines 142-151
cat -n scripts/performance/configs/deepseek/deepseek_llm_pretrain.py | sed -n '130,160p'

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1373


🏁 Script executed:

# Look at the broader context to identify which functions have the conditional logic
cat -n scripts/performance/configs/deepseek/deepseek_llm_pretrain.py | sed -n '50,120p'

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 3317


🏁 Script executed:

# Also check the b300 base config to verify pp_layout is set there
cat -n scripts/performance/configs/deepseek/deepseek_workload_base_configs.py | sed -n '40,55p'

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 698


B300 config should respect base_cfg.pp_layout like GB300/GB200 variants.

The b300 function unconditionally calls set_deepseek_v3_pipeline_model_parallel_layout() without checking base_cfg.pp_layout first, unlike the gb300 and gb200 variants. Since the B300 base config explicitly sets pp_layout="Et*4|(t*4|)*14tmL", it is being silently ignored. Update b300 to use the same conditional pattern as gb300/gb200: check if base_cfg.pp_layout exists, use it if present, otherwise compute the layout.
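
For concreteness, a minimal sketch of that conditional fix (an illustration only, not part of this diff), reusing the field names and the set_deepseek_v3_pipeline_model_parallel_layout helper that the gb300/gb200 hunks above already import:

from megatron.bridge.recipes.deepseek.deepseek_v3 import (
    set_deepseek_v3_pipeline_model_parallel_layout,
)

# ...inside deepseek_v3_pretrain_config_b300, after the PP/VP sizes are applied...
if base_cfg.pp_layout:
    # Respect the layout pinned in the B300 workload base config.
    cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
else:
    # Otherwise recompute the layout from the updated PP/VP sizes.
    set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)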

🤖 Prompt for AI Agents
In `@scripts/performance/configs/deepseek/deepseek_llm_pretrain.py` around lines
142 - 151, The b300 pretrain config currently ignores base_cfg.pp_layout by
always calling set_deepseek_v3_pipeline_model_parallel_layout(cfg.model); change
it to follow the gb300/gb200 pattern: if base_cfg.pp_layout is set, assign
cfg.model.pp_layout = base_cfg.pp_layout (or equivalent field) and do not
recompute, otherwise call
set_deepseek_v3_pipeline_model_parallel_layout(cfg.model) to compute the layout;
update the block that sets cfg.model.pipeline_model_parallel_size /
virtual_pipeline_model_parallel_size / moe_flex_dispatcher_backend to
conditionally respect base_cfg.pp_layout before recomputing the layout.

set_deepseek_v3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -154,14 +171,16 @@ def deepseek_v3_pretrain_config_b200(
)
precision_config = get_precision_config(precision)

cfg = pretrain_config(
mock=mock,
precision_config=precision_config,
pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
layout=None,
)
cfg = pretrain_config()
cfg.mixed_precision = precision_config

# Apply model-specific settings that were previously passed as constructor args
cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
# Recompute layout based on updated PP/VP sizes
set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)

set_deepseek_v3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -184,14 +203,15 @@ def deepseek_v3_pretrain_config_h100(
)
precision_config = get_precision_config(precision)

cfg = pretrain_config(
mock=mock,
precision_config=precision_config,
pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
layout="Et|(tt|)*30mL",
)
cfg = pretrain_config()
cfg.mixed_precision = precision_config

# Apply model-specific settings that were previously passed as constructor args
cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.pipeline_model_parallel_layout = "Et|(tt|)*30mL"

set_deepseek_v3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

30 changes: 10 additions & 20 deletions scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py
@@ -49,10 +49,8 @@ def gpt_oss_120b_pretrain_config_gb300(
)
precision_config = get_precision_config(precision)

cfg = gpt_oss_120b_pretrain_config(
mock=mock,
precision_config=precision_config,
)
cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -73,10 +71,8 @@ def gpt_oss_120b_pretrain_config_gb200(
)
precision_config = get_precision_config(precision)

cfg = gpt_oss_120b_pretrain_config(
mock=mock,
precision_config=precision_config,
)
cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -97,10 +93,8 @@ def gpt_oss_120b_pretrain_config_b300(
)
precision_config = get_precision_config(precision)

cfg = gpt_oss_120b_pretrain_config(
mock=mock,
precision_config=precision_config,
)
cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -121,10 +115,8 @@ def gpt_oss_120b_pretrain_config_b200(
)
precision_config = get_precision_config(precision)

cfg = gpt_oss_120b_pretrain_config(
mock=mock,
precision_config=precision_config,
)
cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -145,10 +137,8 @@ def gpt_oss_120b_pretrain_config_h100(
)
precision_config = get_precision_config(precision)

cfg = gpt_oss_120b_pretrain_config(
mock=mock,
precision_config=precision_config,
)
cfg = gpt_oss_120b_pretrain_config()
cfg.mixed_precision = precision_config
set_gpt_oss_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

15 changes: 10 additions & 5 deletions scripts/performance/configs/llama/llama31_llm_pretrain.py
@@ -62,7 +62,8 @@ def llama31_405b_pretrain_config_gb300(
else:
comm_overlap_cfg = userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192

cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama31_405b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama31_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -95,7 +96,8 @@ def llama31_405b_pretrain_config_gb200(
else:
comm_overlap_cfg = userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192

cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama31_405b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama31_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -129,7 +131,8 @@ def llama31_405b_pretrain_config_b300(
else:
comm_overlap_cfg = userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192

cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama31_405b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama31_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -158,7 +161,8 @@ def llama31_405b_pretrain_config_b200(
else:
comm_overlap_cfg = userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192

cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama31_405b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama31_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -187,7 +191,8 @@ def llama31_405b_pretrain_config_h100(
else:
comm_overlap_cfg = userbuffers_fp8_h100_h16384_tp8_cp2_mbs1_seqlen8192

cfg = llama31_405b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama31_405b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama31_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

30 changes: 20 additions & 10 deletions scripts/performance/configs/llama/llama3_llm_pretrain.py
@@ -66,7 +66,8 @@ def llama3_70b_pretrain_config_gb300(
else:
comm_overlap_cfg = userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192

cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_70b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -100,7 +101,8 @@ def llama3_70b_pretrain_config_gb200(
else:
comm_overlap_cfg = userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192

cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_70b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -134,7 +136,8 @@ def llama3_70b_pretrain_config_b300(
else:
comm_overlap_cfg = userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192

cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_70b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -168,7 +171,8 @@ def llama3_70b_pretrain_config_b200(
else:
comm_overlap_cfg = userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192

cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_70b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -202,7 +206,8 @@ def llama3_70b_pretrain_config_h100(
else:
comm_overlap_cfg = userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192

cfg = llama3_70b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_70b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -228,7 +233,8 @@ def llama3_8b_pretrain_config_gb300(
)
precision_config = get_precision_config(precision)

cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_8b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -252,7 +258,8 @@ def llama3_8b_pretrain_config_gb200(
)
precision_config = get_precision_config(precision)

cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_8b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -276,7 +283,8 @@ def llama3_8b_pretrain_config_b300(
)
precision_config = get_precision_config(precision)

cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_8b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -300,7 +308,8 @@ def llama3_8b_pretrain_config_b200(
)
precision_config = get_precision_config(precision)

cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_8b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)

@@ -324,7 +333,8 @@ def llama3_8b_pretrain_config_h100(
)
precision_config = get_precision_config(precision)

cfg = llama3_8b_pretrain_config(mock=mock, precision_config=precision_config)
cfg = llama3_8b_pretrain_config()
cfg.mixed_precision = precision_config
set_llama3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
