Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4cbe4eb
[model, refactor] refactor: Centralize provider_bridge config mapping…
yaoyu-33 Feb 4, 2026
7cf6851
[recipe, training] fix: Correct adam_eps default and add non-default …
yaoyu-33 Feb 4, 2026
70436be
Fix performance config scripts for parameterless recipe API (#2201)
yaoyu-33 Feb 5, 2026
d4fd66a
[model, refactor] refactor: Centralize provider_bridge config mapping…
yaoyu-33 Feb 10, 2026
199723c
fix: call apply_flex_dispatcher_backend with correct backend in MoE p…
yaoyu-33 Feb 11, 2026
6f6a4d8
kimi k2 recipe intro (#2097)
malay-nagda Feb 11, 2026
5ec81cb
nemotron3 nano recipes (#2301)
malay-nagda Feb 19, 2026
f29a269
Update Qwen3 30B H100 Base Configs with HybridEP (#2477)
rhmukundan Feb 23, 2026
f10feb7
Fix DeepSeek-V3 H100 large scale config (#2401)
scsudhakaran Feb 23, 2026
cb434ff
fix: all2all for qwen3-next H100 (#2479)
ko3n1g Feb 23, 2026
08de997
[model,recipe] fix: Correct DeepSeek num_query_groups mapping and mig…
yaoyu-33 Feb 20, 2026
249869f
Malay/cp sft perf 2602 patch (#2527)
malay-nagda Feb 25, 2026
b1b0d2d
[model, refactor] refactor: Centralize provider_bridge config mapping…
yaoyu-33 Feb 25, 2026
e7ee97c
Fix LLAMA3 LoRa TFLOPs Formula (#2416)
rhmukundan Feb 25, 2026
0e46b18
Revert "Fix LLAMA3 LoRa TFLOPs Formula (#2416)"
ko3n1g Feb 25, 2026
6cf5879
Update Nemotron 3 Nano perf configs (#2510) (#2560)
malay-nagda Feb 26, 2026
5db8d13
scaling up gbs as number of gpus scales up (#2553)
rsalagame-nvidia Feb 26, 2026
57cbd51
chore(fix): Deployment parallelism (#2189)
ko3n1g Feb 17, 2026
605384b
chore: cherry-pick SFT/PEFT recipe support with cfg.validation -> cfg…
yaoyu-33 Feb 26, 2026
f56e2b0
[OMNIML2914] Support Nemotron-3-Nano PTQ, TE spec migration, and VLM …
yueshen2016 Feb 26, 2026
a479b03
cp: `Bump NVRX`
ko3n1g Feb 26, 2026
388d017
Malay/cp pr 2499 (#2585)
malay-nagda Feb 27, 2026
7cacc75
Onboarding LLAMA3 70B LoRa to B300 and B200 chips (#2397) (#2588)
malay-nagda Feb 27, 2026
fd58a55
Fix lint and import error in perf script llama3_llm_finetune.py (#259…
malay-nagda Mar 2, 2026
a4c7556
nemotron3_nano_h100_fix_260201 (#2617)
malay-nagda Mar 3, 2026
2ff4462
Tune kimi-k2 GB300 MXFP8 recipe (#2590)
dingqingy-nv Mar 3, 2026
a727503
Merge remote-tracking branch 'origin/r0.3.0' into ko3n1g/cp/260201
ko3n1g Mar 3, 2026
e39b6da
Update the LoRa TFLOPs Formula Fix without any hardcoding of values …
rhmukundan Mar 3, 2026
8fe60aa
fix test
ko3n1g Mar 3, 2026
44468af
bump modelopt
ko3n1g Mar 3, 2026
7a8ff8d
disable
ko3n1g Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -393,10 +393,11 @@ jobs:
- script: L2_Launch_models_nemotron_vl
- script: L2_Launch_models_olmoe
- script: L2_Launch_models_qwen
# - script: L2_Launch_models_qwen_quantization
- script: L2_Launch_models_qwen_quantization
- script: L2_Launch_models_qwen_vl
- script: L2_Launch_recipes_gemma_vl
- script: L2_Launch_recipes_gpt_oss
- script: L2_Launch_models_qwen_vl_quantization
- script: L2_Launch_recipes_llama_1b
- script: L2_Launch_recipes_llama_3b
- script: L2_Launch_recipes_llama_distill
Expand All @@ -405,7 +406,7 @@ jobs:
- script: L2_Launch_data
- script: L2_Launch_post_training_quantization
- script: L2_Launch_quantization_aware_training
- script: L2_Launch_quantization_export
# - script: L2_Launch_quantization_export
- script: L2_Launch_recipes_llama_cuda_graphs
needs: [pre-flight, cicd-unit-tests]
runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
Expand Down
5 changes: 1 addition & 4 deletions examples/evaluation/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,4 @@ python \
--host 0.0.0.0 \
--port 8000 \
--num_gpus "$NUM_GPUS" \
--num_replicas "$NUM_REPLICAS" \
--tensor_model_parallel_size 1 \
--pipeline_model_parallel_size 1 \
--context_parallel_size 1
--num_replicas "$NUM_REPLICAS"
11 changes: 10 additions & 1 deletion examples/evaluation/launch_evaluation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,16 @@ def main(args):
executor=executor,
)
job.start(
command=f"bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh {args.megatron_checkpoint} {args.num_replicas} {args.num_gpus} | tee -a deploy.log & sleep 120; bash /opt/Megatron-Bridge/examples/evaluation/eval.sh {args.output_dir} {args.parallelism} | tee -a eval.log",
command=f"""
bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh \
{args.megatron_checkpoint} \
{args.num_replicas} \
{args.num_gpus}| tee -a deploy.log & \
sleep 120; \
bash /opt/Megatron-Bridge/examples/evaluation/eval.sh \
{args.output_dir} \
{args.parallelism} | tee -a eval.log
""",
workdir=None,
)

Expand Down
17 changes: 16 additions & 1 deletion examples/quantization/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@

from megatron.bridge import AutoBridge
from megatron.bridge.models.decorators import torchrun_main
from megatron.bridge.models.hf_pretrained.utils import is_safe_repo


warnings.filterwarnings("ignore")
Expand All @@ -61,6 +62,7 @@ def main(
export_dir: str = "./hf_export",
export_extra_modules: bool = False,
dtype: str = "bfloat16",
trust_remote_code: bool | None = None,
) -> None:
"""Export a quantized Megatron-LM checkpoint to HuggingFace format on multiple GPUs."""
if os.environ.get("WORLD_SIZE") is None:
Expand All @@ -78,7 +80,13 @@ def main(
sys.exit(1)

# Initialize bridge from HF model to get tokenizer and model structure
bridge = AutoBridge.from_hf_pretrained(hf_model_id)
bridge = AutoBridge.from_hf_pretrained(
hf_model_id,
trust_remote_code=is_safe_repo(
trust_remote_code=trust_remote_code,
hf_path=hf_model_id,
),
)

# Get model provider and configure for multi-GPU execution
model_provider = bridge.to_megatron_provider(load_weights=False)
Expand Down Expand Up @@ -152,6 +160,7 @@ def main(
export_extra_modules=export_extra_modules_flag,
dtype=torch_dtype,
export_dir=export_dir,
trust_remote_code=is_safe_repo(trust_remote_code=trust_remote_code, hf_path=hf_model_id),
)

if is_rank_0:
Expand Down Expand Up @@ -195,6 +204,11 @@ def main(
choices=["bfloat16", "float16", "float32"],
help="Data type for export",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="if trust_remote_code",
)

args = parser.parse_args()
main(
Expand All @@ -207,6 +221,7 @@ def main(
args.export_dir,
args.export_extra_modules,
args.dtype,
args.trust_remote_code,
)

if torch.distributed.is_initialized():
Expand Down
13 changes: 8 additions & 5 deletions examples/quantization/pretrain_quantized_llama3_8b.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,17 @@ def main() -> None:
logger.info("------------------------------------------------------------------")

# Load base configuration from the recipe as a Python dataclass
# If --hf-path is provided, pass it to the recipe function
recipe_kwargs = {}
# Pretrain configs use parameterless API
cfg: ConfigContainer = pretrain_config()
logger.info("Loaded base configuration")

# If --hf-path is provided, override the model's HuggingFace path
if args.hf_path:
logger.info(f"Using custom HuggingFace path: {args.hf_path}")
recipe_kwargs["hf_path"] = args.hf_path
# Import AutoBridge to create a new model provider with the custom HF path
from megatron.bridge.models import AutoBridge

cfg: ConfigContainer = pretrain_config(**recipe_kwargs)
logger.info("Loaded base configuration")
cfg.model = AutoBridge.from_hf_pretrained(args.hf_path).to_megatron_provider(load_weights=False)
Comment on lines 160 to +165
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for AutoBridge class and its methods
find . -type f -name "*.py" -path "*/megatron/bridge/models/*" | head -20

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1120


🏁 Script executed:

#!/bin/bash
# Look for AutoBridge definition and from_hf_config method
rg "class AutoBridge|def from_hf_config|def from_hf_pretrained" --type=py -n

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1977


🏁 Script executed:

#!/bin/bash
# Check the specific file for import placement
cat -n examples/quantization/pretrain_quantized_llama3_8b.py | sed -n '150,170p'

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1095


🏁 Script executed:

#!/bin/bash
# Examine the from_hf_pretrained and from_hf_config implementations
sed -n '148,230p' src/megatron/bridge/models/conversion/auto_bridge.py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 3593


🏁 Script executed:

#!/bin/bash
# Also check the to_megatron_provider implementation
rg "def to_megatron_provider" -A 20 --type=py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 32005


🏁 Script executed:

#!/bin/bash
# Check what PreTrainedCausalLM.from_pretrained does and if it loads weights
rg "class AutoBridge" -A 50 src/megatron/bridge/models/conversion/auto_bridge.py | head -80

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 2443


🏁 Script executed:

#!/bin/bash
# Get the full implementation of from_hf_pretrained
sed -n '194,250p' src/megatron/bridge/models/conversion/auto_bridge.py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 2566


🏁 Script executed:

#!/bin/bash
# Check if there's any rank guard usage in the AutoBridge or related code
rg "get_rank|rank_zero|dist.get_rank" src/megatron/bridge/models/conversion/ -A 2 -B 2

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1787


🏁 Script executed:

#!/bin/bash
# Get the full to_megatron_provider implementation
sed -n '238,320p' src/megatron/bridge/models/conversion/auto_bridge.py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 3635


🏁 Script executed:

#!/bin/bash
# Check if there are any attribute accesses on hf_pretrained that would fail if it's just a config
rg "self.hf_pretrained\." src/megatron/bridge/models/conversion/auto_bridge.py | head -20

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 528


🏁 Script executed:

#!/bin/bash
# Search for to_megatron_provider in AutoBridge specifically
rg "def to_megatron_provider" src/megatron/bridge/models/conversion/auto_bridge.py -A 80

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 4006


🏁 Script executed:

#!/bin/bash
# Check the config property and how hf_pretrained.config is accessed
rg "\.config" src/megatron/bridge/models/conversion/auto_bridge.py -B 2 -A 2

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 506


🏁 Script executed:

#!/bin/bash
# Check the beginning of the file to see how imports are organized
head -30 examples/quantization/pretrain_quantized_llama3_8b.py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1327


🏁 Script executed:

#!/bin/bash
# Verify if AutoBridge is already imported at the top
rg "^import|^from" examples/quantization/pretrain_quantized_llama3_8b.py | head -20

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 684


All distributed ranks load the full HF model unnecessarily — OOM risk.

AutoBridge.from_hf_pretrained(args.hf_path) loads the entire model with all weights via PreTrainedCausalLM.from_pretrained(), and because there is no rank guard, every torchrun worker independently loads the full model in parallel. For LLaMA 3-8B that is ~16 GB per rank; with 8 ranks this consumes ~128 GB of memory in aggregate and will cause OOM before pretraining starts.

Use AutoBridge.from_hf_config() instead to load only the configuration, then call to_megatron_provider(load_weights=False) to defer weight loading until after distributed initialization:

🛡️ Proposed fix
-        # Import AutoBridge to create a new model provider with the custom HF path
-        from megatron.bridge.models import AutoBridge
-
-        cfg.model = AutoBridge.from_hf_pretrained(args.hf_path).to_megatron_provider(load_weights=False)
+        # Import AutoBridge to create a new model provider with the custom HF path
+        from megatron.bridge.models import AutoBridge
+        from transformers import AutoConfig
+
+        hf_cfg = AutoConfig.from_pretrained(args.hf_path)
+        cfg.model = AutoBridge.from_hf_config(hf_cfg).to_megatron_provider(load_weights=False)

Also move the AutoBridge import to the top of the file with other megatron.bridge.* imports to follow the import organization guideline.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `examples/quantization/pretrain_quantized_llama3_8b.py` around lines 160-165,
replace the blocking call to AutoBridge.from_hf_pretrained(args.hf_path)
(which loads full weights on every rank) with a config-only path: load the
model configuration via AutoConfig.from_pretrained(args.hf_path) and pass the
resulting config object to AutoBridge.from_hf_config(...), then call
.to_megatron_provider(load_weights=False) to defer weight loading until after
distributed initialization. Also move the `from megatron.bridge.models import
AutoBridge` import to the top of the file with the other megatron.bridge
imports to follow the import organization guideline.


# Print configuration on rank 0
if get_rank_safe() == 0:
Expand Down
41 changes: 30 additions & 11 deletions examples/quantization/ptq_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ def _validate_quantized_model(model: torch.nn.Module, is_rank_0: bool) -> None:
If someone accidentally breaks the quantization loading logic (e.g., in
has_modelopt_state or build_and_load_model), this check will catch it.

We check for QuantRowParallelLinear and QuantColumnParallelLinear as these
are present in all quantized model architectures (GPT, Llama, Qwen, Nemotron-H, etc).
We check for quantized layer types that indicate successful quantization:
- Local spec: QuantRowParallelLinear, QuantColumnParallelLinear
- TE spec: QuantTERowParallelLinear, QuantTELayerNormColumnParallelLinear

Args:
model: The unwrapped model to validate
Expand All @@ -68,25 +69,36 @@ def _validate_quantized_model(model: torch.nn.Module, is_rank_0: bool) -> None:
Raises:
RuntimeError: If the model doesn't contain expected quantized layers
"""
# Check for quantized layer types that are universal across all architectures
model_str = str(model)

required_quant_layers = [
# Local spec quantized layers
local_spec_layers = [
"QuantRowParallelLinear",
"QuantColumnParallelLinear",
]

missing_layers = [layer for layer in required_quant_layers if layer not in model_str]
# TE spec quantized layers
te_spec_layers = [
"QuantTERowParallelLinear",
"QuantTELayerNormColumnParallelLinear",
]

# Check if model has local spec quantized layers
has_local_spec = all(layer in model_str for layer in local_spec_layers)

# Check if model has TE spec quantized layers
has_te_spec = all(layer in model_str for layer in te_spec_layers)

if missing_layers:
if not has_local_spec and not has_te_spec:
error_msg = (
f"\n{'=' * 80}\n"
f"QUANTIZATION VALIDATION FAILED!\n"
f"{'=' * 80}\n"
f"Expected quantized layers not found in the loaded model.\n"
f"This indicates the quantized checkpoint was not loaded correctly.\n\n"
f"Missing: {missing_layers}\n"
f"Expected: {required_quant_layers}\n\n"
f"Expected one of:\n"
f" - Local spec: {local_spec_layers}\n"
f" - TE spec: {te_spec_layers}\n\n"
f"This is likely due to a bug in the checkpoint loading logic.\n"
f"{'=' * 80}\n"
)
Expand All @@ -95,9 +107,16 @@ def _validate_quantized_model(model: torch.nn.Module, is_rank_0: bool) -> None:
raise RuntimeError(error_msg)

if is_rank_0:
console.print(
"[green]✓ Quantization validation passed: Found QuantRowParallelLinear and QuantColumnParallelLinear[/green]"
)
if has_te_spec:
console.print(
"[green]✓ Quantization validation passed: Found TE spec quantized layers "
"(QuantTERowParallelLinear, QuantTELayerNormColumnParallelLinear)[/green]"
)
else:
console.print(
"[green]✓ Quantization validation passed: Found local spec quantized layers "
"(QuantRowParallelLinear, QuantColumnParallelLinear)[/green]"
)


@torchrun_main
Expand Down
Loading