From 0ab5f13d1043bfa45aa4828e4b7ab5a00906d9c1 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 23 Feb 2026 10:53:58 -0800 Subject: [PATCH 01/11] add ckpt tests Signed-off-by: dimapihtar --- ...2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh | 24 ++++++++++ .../ckpts/llama3_1b/test_llama3_1b_mbridge.py | 45 +++++++++++++++++++ .../ckpts/llama3_1b/test_llama3_1b_mcore.sh | 45 +++++++++++++++++++ 3 files changed, 114 insertions(+) create mode 100755 tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh create mode 100644 tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py create mode 100755 tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh new file mode 100755 index 0000000000..c988c7a766 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py +coverage combine -q diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py new file mode 100644 index 0000000000..466fd13d34 --- /dev/null +++ b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py @@ -0,0 +1,45 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional smoke tests for LLaMA recipe configurations.""" + +import pytest + +from megatron.bridge.recipes.llama import llama32_1b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +class TestLlama3MBridgeCkpt: + """Test class for LLaMA recipe functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_llama_pretrain_recipes(self): + """Functional test for LLaMA recipes with appropriate parallelism configurations.""" + + config = llama32_1b_pretrain_config() + + config.model.seq_length = 8192 + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 8 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + pretrain(config=config, forward_step_func=forward_step) \ No newline at end of file diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh new file mode 100755 index 0000000000..89ed67ba03 --- /dev/null +++ b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh @@ -0,0 +1,45 @@ +CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/megatron-lm/pretrain_gpt.py \ + --init-method-std 0.014 \ + --disable-bias-linear \ + --use-rope-scaling \ + --swiglu \ + --use-rotary-position-embeddings \ + --num-layers 16 \ + --hidden-size 2048 \ + --num-attention-heads 32 \ + --ffn-hidden-size 8192 \ + --kv-channels 64 \ + --group-query-attention \ + --position-embedding-type rope \ + --attention-backend fused \ + --num-query-groups 8 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --micro-batch-size 1 \ + --global-batch-size 8 \ + --train-iters 5 \ + --log-interval 1 \ + 
--tokenizer-type SentencePieceTokenizer \ + --tokenizer-model /opt/data/tokenizers/sentencepiece/tokenizer.model \ + --vocab-size 131072 \ + --save-interval 5 \ + --eval-interval 5 \ + --eval-iters 4 \ + --load /path/to/mbridge/ckpt \ + --save /path/to/save/ckpt \ + --ckpt-format torch_dist \ + --log-progress \ + --bf16 \ + --lr 4.5e-4 \ + --min-lr 4.5e-5 \ + --num-workers 2 \ + --tensorboard-dir /workspace/tb \ + --log-straggler \ + --log-interval 1 \ + --log-throughput \ + --no-load-optim From c352dbb86a263240d57ef944c4c6469fd4048464 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 26 Feb 2026 11:14:47 -0800 Subject: [PATCH 02/11] fix llama32_1b test Signed-off-by: dimapihtar --- ...2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh | 2 ++ .../ckpts/llama3_1b/test_llama3_1b_mbridge.py | 19 ++++++++++++++++-- .../ckpts/llama3_1b/test_llama3_1b_mcore.sh | 20 +++++++++++-------- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh index c988c7a766..7a178110b5 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh @@ -22,3 +22,5 @@ export CUDA_VISIBLE_DEVICES="0,1" # they can run basic training without crashes uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py coverage combine -q + +bash tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py index 466fd13d34..3d774b6473 100644 --- 
a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py +++ b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py @@ -15,6 +15,7 @@ """Functional smoke tests for LLaMA recipe configurations.""" import pytest +import subprocess from megatron.bridge.recipes.llama import llama32_1b_pretrain_config from megatron.bridge.training.gpt_step import forward_step @@ -25,7 +26,7 @@ class TestLlama3MBridgeCkpt: """Test class for LLaMA recipe functional tests.""" @pytest.mark.run_only_on("GPU") - def test_llama_pretrain_recipes(self): + def test_llama32_1B_ckpt_mbridge(self): """Functional test for LLaMA recipes with appropriate parallelism configurations.""" config = llama32_1b_pretrain_config() @@ -42,4 +43,18 @@ def test_llama_pretrain_recipes(self): config.logger.log_interval = 1 - pretrain(config=config, forward_step_func=forward_step) \ No newline at end of file + config.checkpoint.save = "/workspace/test_ckpts/llama32_1b_mbridge" + + pretrain(config=config, forward_step_func=forward_step) + + @pytest.mark.run_only_on("GPU") + def test_llama32_1B_ckpt_mcore(self): + """Functional test for LLaMA recipes with appropriate parallelism configurations.""" + + script_path = "test_llama3_1b_mcore.sh" + process = subprocess.Popen( + ["bash", script_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh index 89ed67ba03..16d686fd71 100755 --- a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh +++ b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh @@ -1,4 +1,7 @@ -CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/megatron-lm/pretrain_gpt.py \ +LOAD_DIR=/workspace/test_ckpts/llama32_1b_mbridge +SAVE_DIR=/workspace/test_ckpts/llama32_1b_mcore + +CUDA_VISIBLE_DEVICES=0,1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 
/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py \ --init-method-std 0.014 \ --disable-bias-linear \ --use-rope-scaling \ @@ -22,16 +25,15 @@ CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/megatron-lm/pretr --max-position-embeddings 8192 \ --micro-batch-size 1 \ --global-batch-size 8 \ - --train-iters 5 \ - --log-interval 1 \ - --tokenizer-type SentencePieceTokenizer \ - --tokenizer-model /opt/data/tokenizers/sentencepiece/tokenizer.model \ + --train-iters 10 \ + --mock-data \ + --tokenizer-type NullTokenizer \ --vocab-size 131072 \ --save-interval 5 \ --eval-interval 5 \ --eval-iters 4 \ - --load /path/to/mbridge/ckpt \ - --save /path/to/save/ckpt \ + --load ${LOAD_DIR} \ + --save ${SAVE_DIR} \ --ckpt-format torch_dist \ --log-progress \ --bf16 \ @@ -39,7 +41,9 @@ CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/megatron-lm/pretr --min-lr 4.5e-5 \ --num-workers 2 \ --tensorboard-dir /workspace/tb \ - --log-straggler \ --log-interval 1 \ --log-throughput \ --no-load-optim + +echo rm -rf ${LOAD_DIR} +echo rm -rf ${SAVE_DIR} From 0c25e0bcdf14c169f0cb5a5c989a0c8d94d2b14e Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 27 Feb 2026 14:42:03 -0800 Subject: [PATCH 03/11] add qwen3_4b & nemotronh_4b ckpt tests Signed-off-by: dimapihtar --- ..._Launch_ckpts_mbridge_to_mlm_llama32_1b.sh | 26 +++++++++ ...aunch_ckpts_mbridge_to_mlm_nemotronh_4b.sh | 26 +++++++++ ...2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh} | 6 +- .../test_llama32_1b_mbridge.py} | 13 +---- .../test_llama32_1b_mcore.sh} | 0 .../nemotronh_4b/test_nemotronh_4b_mbridge.py | 50 +++++++++++++++++ .../nemotronh_4b/test_nemotronh_4b_mcore.sh | 56 +++++++++++++++++++ .../ckpts/qwen3_4b/test_qwen3_4b_mbridge.py | 47 ++++++++++++++++ .../ckpts/qwen3_4b/test_qwen3_4b_mcore.sh | 53 ++++++++++++++++++ 9 files changed, 262 insertions(+), 15 deletions(-) create mode 100755 tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh create mode 100755 
tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh rename tests/functional_tests/{L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh => L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh} (84%) rename tests/functional_tests/ckpts/{llama3_1b/test_llama3_1b_mbridge.py => llama32_1b/test_llama32_1b_mbridge.py} (79%) rename tests/functional_tests/ckpts/{llama3_1b/test_llama3_1b_mcore.sh => llama32_1b/test_llama32_1b_mcore.sh} (100%) create mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py create mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh create mode 100644 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py create mode 100755 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh new file mode 100755 index 0000000000..d28728b2f0 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py +coverage combine -q + +bash tests/functional_tests/ckpts/llama3_1b/test_llama32_1b_mcore.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh new file mode 100755 index 0000000000..f0f5c3bbaa --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py +coverage combine -q + +bash tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh similarity index 84% rename from tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh rename to tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh index 7a178110b5..21c5083c2c 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ export CUDA_VISIBLE_DEVICES="0,1" # Run recipe functional tests on 2 GPUs # This script tests recipe configurations with their default settings to ensure # they can run basic training without crashes -uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py coverage combine -q -bash tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh +bash tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py similarity index 79% rename from tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py rename to tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py index 3d774b6473..199f083140 100644 --- a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py @@ -22,7 +22,7 @@ from megatron.bridge.training.pretrain import pretrain -class TestLlama3MBridgeCkpt: +class TestLlama32MBridgeCkpt: """Test class for LLaMA recipe functional tests.""" @pytest.mark.run_only_on("GPU") @@ -47,14 +47,3 @@ def test_llama32_1B_ckpt_mbridge(self): pretrain(config=config, forward_step_func=forward_step) - @pytest.mark.run_only_on("GPU") - def test_llama32_1B_ckpt_mcore(self): - """Functional test for LLaMA recipes with 
appropriate parallelism configurations.""" - - script_path = "test_llama3_1b_mcore.sh" - process = subprocess.Popen( - ["bash", script_path], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh similarity index 100% rename from tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh rename to tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py new file mode 100644 index 0000000000..bfab5ab120 --- /dev/null +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py @@ -0,0 +1,50 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional smoke tests for LLaMA recipe configurations.""" + +import pytest +import subprocess + +from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +class TestNemotronhMBridgeCkpt: + """Test class for LLaMA recipe functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_nemotronh_4b_ckpt_mbridge(self): + """Functional test for LLaMA recipes with appropriate parallelism configurations.""" + + config = nemotronh_4b_pretrain_config() + + config.model.num_layers = 26 + config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*" + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 4 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = "/workspace/test_ckpts/nemotronh_4b_mbridge" + + pretrain(config=config, forward_step_func=forward_step) + diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh new file mode 100644 index 0000000000..cf3d80c4e5 --- /dev/null +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh @@ -0,0 +1,56 @@ +LOAD_DIR=/workspace/test_ckpts/nemotronh_4b_mbridge +SAVE_DIR=/workspace/test_ckpts/nemotronh_4b_mcore + +CUDA_VISIBLE_DEVICES=0,1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py \ + --init-method-std 0.014 \ + --disable-bias-linear \ + --use-rope-scaling \ + --squared-relu \ + --qk-layernorm \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --use-rotary-position-embeddings \ + --hybrid-override-pattern "M-M-M-M*-M-M-M-M*-M-M-M-M*" \ + --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ + --num-layers 26 \ + 
--hidden-size 3072 \ + --num-attention-heads 32 \ + --mamba-num-heads 112 \ + --ffn-hidden-size 12288 \ + --kv-channels 128 \ + --group-query-attention \ + --position-embedding-type none \ + --attention-backend fused \ + --num-query-groups 8 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size 2 \ + --pipeline-model-parallel-size 1 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --micro-batch-size 1 \ + --global-batch-size 4 \ + --train-iters 10 \ + --mock-data \ + --tokenizer-type NullTokenizer \ + --vocab-size 151936 \ + --save-interval 5 \ + --eval-interval 5 \ + --eval-iters 4 \ + --load ${LOAD_DIR} \ + --save ${SAVE_DIR} \ + --ckpt-format torch_dist \ + --log-progress \ + --bf16 \ + --lr 4.5e-4 \ + --min-lr 4.5e-5 \ + --num-workers 2 \ + --tensorboard-dir /workspace/tb \ + --log-interval 1 \ + --log-throughput \ + --no-load-optim \ + --no-load-rng + +echo rm -rf ${LOAD_DIR} +echo rm -rf ${SAVE_DIR} diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py new file mode 100644 index 0000000000..ac362ea2ed --- /dev/null +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional smoke tests for LLaMA recipe configurations.""" + +import pytest +import subprocess + +from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +class TestQwen3MBridgeCkpt: + """Test class for LLaMA recipe functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_qwen3_4b_ckpt_mbridge(self): + """Functional test for LLaMA recipes with appropriate parallelism configurations.""" + + config = qwen3_4b_pretrain_config() + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 4 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = "/workspace/test_ckpts/qwen3_4b_mbridge" + + pretrain(config=config, forward_step_func=forward_step) + diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh new file mode 100755 index 0000000000..9aa93b06da --- /dev/null +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh @@ -0,0 +1,53 @@ +LOAD_DIR=/workspace/test_ckpts/qwen3_4b_mbridge +SAVE_DIR=/workspace/test_ckpts/qwen3_4b_mcore + +CUDA_VISIBLE_DEVICES=0,1,2,3 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py \ + --init-method-std 0.014 \ + --disable-bias-linear \ + --use-rope-scaling \ + --swiglu \ + --qk-layernorm \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --use-rotary-position-embeddings \ + --num-layers 36 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --ffn-hidden-size 9728 \ + --kv-channels 128 \ + --group-query-attention \ + --position-embedding-type rope \ + --attention-backend fused \ + --num-query-groups 8 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 
\ + --tensor-model-parallel-size 2 \ + --pipeline-model-parallel-size 1 \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --micro-batch-size 1 \ + --global-batch-size 4 \ + --train-iters 10 \ + --mock-data \ + --tokenizer-type NullTokenizer \ + --vocab-size 151936 \ + --save-interval 5 \ + --eval-interval 5 \ + --eval-iters 4 \ + --load ${LOAD_DIR} \ + --save ${SAVE_DIR} \ + --ckpt-format torch_dist \ + --log-progress \ + --bf16 \ + --lr 4.5e-4 \ + --min-lr 4.5e-5 \ + --num-workers 2 \ + --tensorboard-dir /workspace/tb \ + --log-interval 1 \ + --log-throughput \ + --no-load-optim \ + --no-load-rng + +echo rm -rf ${LOAD_DIR} +echo rm -rf ${SAVE_DIR} From 5bbc5424a9a556f949ca02e3c9a587806d75e548 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 2 Mar 2026 09:44:15 -0800 Subject: [PATCH 04/11] refactor checkpointing tests Signed-off-by: dimapihtar --- ${SAVE_DIR}/progress.txt | 1 + .github/workflows/cicd-main.yml | 3 + ..._Launch_ckpts_mbridge_to_mlm_llama32_1b.sh | 6 +- ...aunch_ckpts_mbridge_to_mlm_nemotronh_4b.sh | 6 +- ...L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh | 6 +- .../ckpts/llama32_1b/test_llama32_1b_ckpt.py | 131 +++++++++++++++++ .../llama32_1b/test_llama32_1b_mbridge.py | 49 ------- .../ckpts/llama32_1b/test_llama32_1b_mcore.sh | 49 ------- .../nemotronh_4b/test_nemotronh_4b_ckpt.py | 136 ++++++++++++++++++ .../nemotronh_4b/test_nemotronh_4b_mbridge.py | 50 ------- .../nemotronh_4b/test_nemotronh_4b_mcore.sh | 56 -------- .../ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 132 +++++++++++++++++ .../ckpts/qwen3_4b/test_qwen3_4b_mbridge.py | 47 ------ .../ckpts/qwen3_4b/test_qwen3_4b_mcore.sh | 53 ------- ...t.tfevents.1772467438.017ddd03c11b.10202.0 | Bin 0 -> 88 bytes ...t.tfevents.1772467438.017ddd03c11b.10204.0 | Bin 0 -> 88 bytes ...t.tfevents.1772467705.017ddd03c11b.10985.0 | Bin 0 -> 88 bytes ...t.tfevents.1772467705.017ddd03c11b.10987.0 | Bin 0 -> 88 bytes ...t.tfevents.1772468322.017ddd03c11b.11735.0 | Bin 0 -> 88 bytes 
...t.tfevents.1772468322.017ddd03c11b.11737.0 | Bin 0 -> 88 bytes 19 files changed, 414 insertions(+), 310 deletions(-) create mode 100644 tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py delete mode 100644 tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py delete mode 100755 tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh create mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py delete mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py delete mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh create mode 100644 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py delete mode 100644 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py delete mode 100755 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a53e0e92e1..191d0a57a9 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -376,6 +376,9 @@ jobs: - script: L2_Launch_quantization_export - script: L2_Launch_recipes_llama_cuda_graphs - script: L2_Launch_utils + - script:
L2_Launch_ckpts_mbridge_to_mlm_llama32_1b + - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b + - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b needs: [pre-flight, cicd-unit-tests] runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2 if: | diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh index d28728b2f0..aeadc54ddd 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh @@ -20,7 +20,9 @@ export CUDA_VISIBLE_DEVICES="0,1" # Run recipe functional tests on 2 GPUs # This script tests recipe configurations with their default settings to ensure # they can run basic training without crashes -uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_mbridge coverage combine -q -bash tests/functional_tests/ckpts/llama3_1b/test_llama32_1b_mcore.sh +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_core + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA
tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh index f0f5c3bbaa..c39b782f98 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh @@ -20,7 +20,9 @@ export CUDA_VISIBLE_DEVICES="0,1" # Run recipe functional tests on 2 GPUs # This script tests recipe configurations with their default settings to ensure # they can run basic training without crashes -uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mbridge coverage combine -q -bash tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mcore + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh 
b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh index 21c5083c2c..e392f76aad 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh @@ -20,7 +20,9 @@ export CUDA_VISIBLE_DEVICES="0,1" # Run recipe functional tests on 2 GPUs # This script tests recipe configurations with their default settings to ensure # they can run basic training without crashes -uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mbridge coverage combine -q -bash tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mcore + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py new file mode 100644 index 0000000000..ae5e11b83f --- /dev/null +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -0,0 +1,131 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for LLaMA checkpointing.""" + +import os +import pytest +import shutil +import sys + +from torch.distributed.run import main as torchrun_main + +from megatron.bridge.recipes.llama import llama32_1b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +BASE_DIR = "/workspace/test_ckpts/llama32_1b" +MBRIDGE_CKPT = f"{BASE_DIR}/mbridge" +MCORE_CKPT = f"{BASE_DIR}/mcore" +TB_DIR = f"{BASE_DIR}/tb" + + +class TestLlama32Ckpt: + """Test class for LLama checkpoint functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_llama32_1B_ckpt_mbridge(self): + """Functional test for LLama MBridge checkpoint.""" + + config = llama32_1b_pretrain_config() + + config.model.seq_length = 8192 + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 8 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = MBRIDGE_CKPT + + pretrain(config=config, forward_step_func=forward_step) + + @pytest.mark.run_only_on("GPU") + def test_llama32_1B_ckpt_core(self, monkeypatch): + """Functional test for LLama MCore checkpoint.""" + + # Set environment variables + monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") + monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") + + # Set MLM 
script + monkeypatch.setattr( + sys, + "argv", + [ + "torchrun", + "--nproc-per-node=2", + "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py", + "--load", "/workspace/test_ckpts/llama32_1b_mbridge", + "--save", "/workspace/test_ckpts/llama32_1b_mcore", + "--init-method-std", "0.014", + "--disable-bias-linear", + "--use-rope-scaling", + "--swiglu", + "--use-rotary-position-embeddings", + "--num-layers", "16", + "--hidden-size", "2048", + "--num-attention-heads", "32", + "--ffn-hidden-size", "8192", + "--kv-channels", "64", + "--group-query-attention", + "--position-embedding-type", "rope", + "--attention-backend", "fused", + "--num-query-groups", "8", + "--normalization", "RMSNorm", + "--attention-dropout", "0.0", + "--hidden-dropout", "0.0", + "--tensor-model-parallel-size", "1", + "--pipeline-model-parallel-size", "1", + "--seq-length", "8192", + "--max-position-embeddings", "8192", + "--micro-batch-size", "1", + "--global-batch-size", "8", + "--train-iters", "10", + "--mock-data", + "--tokenizer-type", "NullTokenizer", + "--vocab-size", "131072", + "--save-interval", "5", + "--eval-interval", "5", + "--eval-iters", "5", + "--load", MBRIDGE_CKPT, + "--save", MCORE_CKPT, + "--ckpt-format", "torch_dist", + "--log-progress", + "--bf16", + "--lr", "4.5e-4", + "--min-lr", "4.5e-5", + "--num-workers", "2", + "--tensorboard-dir", TB_DIR, + "--log-interval", "1", + "--log-throughput", + "--no-load-optim", + ], + ) + + # Run MLM script + torchrun_main() + + def test_remove_artifacts(self): + """Removes model artifacts""" + shutil.rmtree(BASE_DIR) + + assert not os.path.exists(BASE_DIR) + diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py deleted file mode 100644 index 199f083140..0000000000 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Functional smoke tests for LLaMA recipe configurations.""" - -import pytest -import subprocess - -from megatron.bridge.recipes.llama import llama32_1b_pretrain_config -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain - - -class TestLlama32MBridgeCkpt: - """Test class for LLaMA recipe functional tests.""" - - @pytest.mark.run_only_on("GPU") - def test_llama32_1B_ckpt_mbridge(self): - """Functional test for LLaMA recipes with appropriate parallelism configurations.""" - - config = llama32_1b_pretrain_config() - - config.model.seq_length = 8192 - - config.train.train_iters = 5 - config.train.eval_iters = 5 - config.train.save_interval = 5 - config.train.global_batch_size = 8 - config.train.micro_batch_size = 1 - - config.scheduler.lr_warmup_iters = 2 - - config.logger.log_interval = 1 - - config.checkpoint.save = "/workspace/test_ckpts/llama32_1b_mbridge" - - pretrain(config=config, forward_step_func=forward_step) - diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh deleted file mode 100755 index 16d686fd71..0000000000 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh +++ /dev/null @@ -1,49 +0,0 @@ -LOAD_DIR=/workspace/test_ckpts/llama32_1b_mbridge -SAVE_DIR=/workspace/test_ckpts/llama32_1b_mcore - -CUDA_VISIBLE_DEVICES=0,1 
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py \ - --init-method-std 0.014 \ - --disable-bias-linear \ - --use-rope-scaling \ - --swiglu \ - --use-rotary-position-embeddings \ - --num-layers 16 \ - --hidden-size 2048 \ - --num-attention-heads 32 \ - --ffn-hidden-size 8192 \ - --kv-channels 64 \ - --group-query-attention \ - --position-embedding-type rope \ - --attention-backend fused \ - --num-query-groups 8 \ - --normalization RMSNorm \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --seq-length 8192 \ - --max-position-embeddings 8192 \ - --micro-batch-size 1 \ - --global-batch-size 8 \ - --train-iters 10 \ - --mock-data \ - --tokenizer-type NullTokenizer \ - --vocab-size 131072 \ - --save-interval 5 \ - --eval-interval 5 \ - --eval-iters 4 \ - --load ${LOAD_DIR} \ - --save ${SAVE_DIR} \ - --ckpt-format torch_dist \ - --log-progress \ - --bf16 \ - --lr 4.5e-4 \ - --min-lr 4.5e-5 \ - --num-workers 2 \ - --tensorboard-dir /workspace/tb \ - --log-interval 1 \ - --log-throughput \ - --no-load-optim - -echo rm -rf ${LOAD_DIR} -echo rm -rf ${SAVE_DIR} diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py new file mode 100644 index 0000000000..3f4490e93d --- /dev/null +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py @@ -0,0 +1,136 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for LLaMA recipe configurations.""" + +import os +import pytest +import shutil +import sys + +from torch.distributed.run import main as torchrun_main + +from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +BASE_DIR = "/workspace/test_ckpts/nemotronh_4b" +MBRIDGE_CKPT = f"{BASE_DIR}/mbridge" +MCORE_CKPT = f"{BASE_DIR}/mcore" +TB_DIR = f"{BASE_DIR}/tb" + + +class TestNemotronhCkpt: + """Test class for Nempotron Hybrid checkpoint functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_nemotronh_4b_ckpt_mbridge(self): + """Functional test for Nemotron Hybrid MBridge checkpoint.""" + + config = nemotronh_4b_pretrain_config() + + config.model.num_layers = 26 + config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*" + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 4 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = MBRIDGE_CKPT + + pretrain(config=config, forward_step_func=forward_step) + + @pytest.mark.run_only_on("GPU") + def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): + """Functional test for Nemotron Hybrid MCore checkpoint.""" + + # Set environment variables + monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") + monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") + + # Set MLM script + 
monkeypatch.setattr( + sys, + "argv", + [ + "torchrun", + "--nproc_per_node=2", + "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py", + "--init-method-std", "0.014", + "--disable-bias-linear", + "--use-rope-scaling", + "--squared-relu", + "--qk-layernorm", + "--rotary-percent", "1.0", + "--rotary-base", "1000000", + "--use-rotary-position-embeddings", + "--hybrid-override-pattern", "M-M-M-M*-M-M-M-M*-M-M-M-M*", + "--spec", "megatron.core.models.mamba.mamba_layer_specs", "mamba_stack_spec", + "--num-layers", "26", + "--hidden-size", "3072", + "--num-attention-heads", "32", + "--mamba-num-heads", "112", + "--ffn-hidden-size", "12288", + "--kv-channels", "128", + "--group-query-attention", + "--position-embedding-type", "none", + "--attention-backend", "fused", + "--num-query-groups", "8", + "--normalization", "RMSNorm", + "--attention-dropout", "0.0", + "--hidden-dropout", "0.0", + "--tensor-model-parallel-size", "2", + "--pipeline-model-parallel-size", "1", + "--seq-length", "8192", + "--max-position-embeddings", "8192", + "--micro-batch-size", "1", + "--global-batch-size", "4", + "--train-iters", "10", + "--mock-data", + "--tokenizer-type", "NullTokenizer", + "--vocab-size", "151936", + "--save-interval", "5", + "--eval-interval", "5", + "--eval-iters", "4", + "--load", MBRIDGE_CKPT, + "--save", MCORE_CKPT, + "--ckpt-format", "torch_dist", + "--log-progress", + "--bf16", + "--lr", "4.5e-4", + "--min-lr", "4.5e-5", + "--num-workers", "2", + "--tensorboard-dir", TB_DIR, + "--log-interval", "1", + "--log-throughput", + "--no-load-optim", + "--no-load-rng", + ], + ) + + # Run MLM script + torchrun_main() + + def test_remove_artifacts(self): + """Removes model artifacts""" + shutil.rmtree(BASE_DIR) + + assert not os.path.exists(BASE_DIR) diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py deleted file mode 100644 index bfab5ab120..0000000000 --- 
a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Functional smoke tests for LLaMA recipe configurations.""" - -import pytest -import subprocess - -from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain - - -class TestNemotronhMBridgeCkpt: - """Test class for LLaMA recipe functional tests.""" - - @pytest.mark.run_only_on("GPU") - def test_nemotronh_4b_ckpt_mbridge(self): - """Functional test for LLaMA recipes with appropriate parallelism configurations.""" - - config = nemotronh_4b_pretrain_config() - - config.model.num_layers = 26 - config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*" - - config.train.train_iters = 5 - config.train.eval_iters = 5 - config.train.save_interval = 5 - config.train.global_batch_size = 4 - config.train.micro_batch_size = 1 - - config.scheduler.lr_warmup_iters = 2 - - config.logger.log_interval = 1 - - config.checkpoint.save = "/workspace/test_ckpts/nemotronh_4b_mbridge" - - pretrain(config=config, forward_step_func=forward_step) - diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh deleted file mode 100644 index 
cf3d80c4e5..0000000000 --- a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh +++ /dev/null @@ -1,56 +0,0 @@ -LOAD_DIR=/workspace/test_ckpts/nemotronh_4b_mbridge -SAVE_DIR=/workspace/test_ckpts/nemotronh_4b_mcore - -CUDA_VISIBLE_DEVICES=0,1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py \ - --init-method-std 0.014 \ - --disable-bias-linear \ - --use-rope-scaling \ - --squared-relu \ - --qk-layernorm \ - --rotary-percent 1.0 \ - --rotary-base 1000000 \ - --use-rotary-position-embeddings \ - --hybrid-override-pattern "M-M-M-M*-M-M-M-M*-M-M-M-M*" \ - --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ - --num-layers 26 \ - --hidden-size 3072 \ - --num-attention-heads 32 \ - --mamba-num-heads 112 \ - --ffn-hidden-size 12288 \ - --kv-channels 128 \ - --group-query-attention \ - --position-embedding-type none \ - --attention-backend fused \ - --num-query-groups 8 \ - --normalization RMSNorm \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --tensor-model-parallel-size 2 \ - --pipeline-model-parallel-size 1 \ - --seq-length 8192 \ - --max-position-embeddings 8192 \ - --micro-batch-size 1 \ - --global-batch-size 4 \ - --train-iters 10 \ - --mock-data \ - --tokenizer-type NullTokenizer \ - --vocab-size 151936 \ - --save-interval 5 \ - --eval-interval 5 \ - --eval-iters 4 \ - --load ${LOAD_DIR} \ - --save ${SAVE_DIR} \ - --ckpt-format torch_dist \ - --log-progress \ - --bf16 \ - --lr 4.5e-4 \ - --min-lr 4.5e-5 \ - --num-workers 2 \ - --tensorboard-dir /workspace/tb \ - --log-interval 1 \ - --log-throughput \ - --no-load-optim \ - --no-load-rng - -echo rm -rf ${LOAD_DIR} -echo rm -rf ${SAVE_DIR} diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py new file mode 100644 index 0000000000..00769a40dd --- /dev/null +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ 
-0,0 +1,132 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for Qwen checkpointing.""" + +import os +import pytest +import shutil +import sys + +from torch.distributed.run import main as torchrun_main + +from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +BASE_DIR = "/workspace/test_ckpts/qwen3_4b" +MBRIDGE_CKPT = f"{BASE_DIR}/mbridge" +MCORE_CKPT = f"{BASE_DIR}/mcore" +TB_DIR = f"{BASE_DIR}/tb" + + +class TestQwen3Ckpt: + """Test class for Qwen checkpoint functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_qwen3_4b_ckpt_mbridge(self): + """Functional test for Qwen MBridge checkpoint.""" + + config = qwen3_4b_pretrain_config() + + config.model.num_layers = 24 + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 4 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = MBRIDGE_CKPT + + pretrain(config=config, forward_step_func=forward_step) + + @pytest.mark.run_only_on("GPU") + def test_qwen3_4b_ckpt_mcore(self, monkeypatch): + """Functional test for Qwen MCore checkpoint.""" + + # Set environment variables + monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") + 
monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") + + # Set MLM script + monkeypatch.setattr( + sys, + "argv", + [ + "torchrun", + "--nproc_per_node=2", + "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py", + "--init-method-std", "0.014", + "--disable-bias-linear", + "--use-rope-scaling", + "--swiglu", + "--qk-layernorm", + "--rotary-percent", "1.0", + "--rotary-base", "1000000", + "--use-rotary-position-embeddings", + "--num-layers", "24", + "--hidden-size", "2560", + "--num-attention-heads", "32", + "--ffn-hidden-size", "9728", + "--kv-channels", "128", + "--group-query-attention", + "--position-embedding-type", "rope", + "--attention-backend", "fused", + "--num-query-groups", "8", + "--normalization", "RMSNorm", + "--attention-dropout", "0.0", + "--hidden-dropout", "0.0", + "--tensor-model-parallel-size", "2", + "--pipeline-model-parallel-size", "1", + "--seq-length", "4096", + "--max-position-embeddings", "4096", + "--micro-batch-size", "1", + "--global-batch-size", "4", + "--train-iters", "10", + "--mock-data", + "--tokenizer-type", "NullTokenizer", + "--vocab-size", "151936", + "--save-interval", "5", + "--eval-interval", "5", + "--eval-iters", "4", + "--load", MBRIDGE_CKPT, + "--save", MCORE_CKPT, + "--ckpt-format", "torch_dist", + "--log-progress", + "--bf16", + "--lr", "4.5e-4", + "--min-lr", "4.5e-5", + "--num-workers", "2", + "--tensorboard-dir", "/workspace/tb", + "--log-interval", "1", + "--log-throughput", + "--no-load-optim", + "--no-load-rng", + ], + ) + + # Run MLM script + torchrun_main() + + def test_remove_artifacts(self): + """Removes model artifacts""" + shutil.rmtree(BASE_DIR) + + assert not os.path.exists(BASE_DIR) \ No newline at end of file diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py deleted file mode 100644 index ac362ea2ed..0000000000 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py +++ /dev/null @@ -1,47 
+0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Functional smoke tests for LLaMA recipe configurations.""" - -import pytest -import subprocess - -from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain - - -class TestQwen3MBridgeCkpt: - """Test class for LLaMA recipe functional tests.""" - - @pytest.mark.run_only_on("GPU") - def test_qwen3_4b_ckpt_mbridge(self): - """Functional test for LLaMA recipes with appropriate parallelism configurations.""" - - config = qwen3_4b_pretrain_config() - - config.train.train_iters = 5 - config.train.eval_iters = 5 - config.train.save_interval = 5 - config.train.global_batch_size = 4 - config.train.micro_batch_size = 1 - - config.scheduler.lr_warmup_iters = 2 - - config.logger.log_interval = 1 - - config.checkpoint.save = "/workspace/test_ckpts/qwen3_4b_mbridge" - - pretrain(config=config, forward_step_func=forward_step) - diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh deleted file mode 100755 index 9aa93b06da..0000000000 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh +++ /dev/null @@ -1,53 +0,0 @@ -LOAD_DIR=/workspace/test_ckpts/qwen3_4b_mbridge -SAVE_DIR=/workspace/test_ckpts/qwen3_4b_mcore - -CUDA_VISIBLE_DEVICES=0,1,2,3 
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py \ - --init-method-std 0.014 \ - --disable-bias-linear \ - --use-rope-scaling \ - --swiglu \ - --qk-layernorm \ - --rotary-percent 1.0 \ - --rotary-base 1000000 \ - --use-rotary-position-embeddings \ - --num-layers 36 \ - --hidden-size 2560 \ - --num-attention-heads 32 \ - --ffn-hidden-size 9728 \ - --kv-channels 128 \ - --group-query-attention \ - --position-embedding-type rope \ - --attention-backend fused \ - --num-query-groups 8 \ - --normalization RMSNorm \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --tensor-model-parallel-size 2 \ - --pipeline-model-parallel-size 1 \ - --seq-length 4096 \ - --max-position-embeddings 4096 \ - --micro-batch-size 1 \ - --global-batch-size 4 \ - --train-iters 10 \ - --mock-data \ - --tokenizer-type NullTokenizer \ - --vocab-size 151936 \ - --save-interval 5 \ - --eval-interval 5 \ - --eval-iters 4 \ - --load ${LOAD_DIR} \ - --save ${SAVE_DIR} \ - --ckpt-format torch_dist \ - --log-progress \ - --bf16 \ - --lr 4.5e-4 \ - --min-lr 4.5e-5 \ - --num-workers 2 \ - --tensorboard-dir /workspace/tb \ - --log-interval 1 \ - --log-throughput \ - --no-load-optim \ - --no-load-rng - -echo rm -rf ${LOAD_DIR} -echo rm -rf ${SAVE_DIR} diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 new file mode 100644 index 0000000000000000000000000000000000000000..47455eb593a7985968b0da0f007801cec45ae4e1 GIT binary patch literal 88 zcmeZZfPjCKJmzxFc-v^5n|aGoiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-QuJ5iC;%>_Ain?r literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 new file mode 100644 index 0000000000000000000000000000000000000000..16cd1a614553ab5d1bb4a8f48af99b95d42cad04 GIT binary patch 
literal 88 zcmeZZfPjCKJmzw$GtaQj&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{ZVJVPo007Lw9y|a5 literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 new file mode 100644 index 0000000000000000000000000000000000000000..94b1235eb75983635cd55b40b1b53304b49e69db GIT binary patch literal 88 zcmeZZfPjCKJmzv*eoL#%&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{ZIX>x_2>>B$AeaCE literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 new file mode 100644 index 0000000000000000000000000000000000000000..8c76604d9172bde22402d34062c209dbe5da6726 GIT binary patch literal 88 zcmeZZfPjCKJmzv{olLIF&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{Zxs>!^EdU}iAqM~e literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 b/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 new file mode 100644 index 0000000000000000000000000000000000000000..0cbe458547164260d95245eee077c7cd5a239512 GIT binary patch literal 88 zcmeZZfPjCKJmzwiU7aqGmwC%kiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-@?qCcQ2-pXAmso6 literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 b/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 new file mode 100644 index 0000000000000000000000000000000000000000..71b6950ddc84bc0952de7b212ffadc31faae2c71 GIT binary patch literal 88 zcmeZZfPjCKJmzxh?Vll$mwC%kiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-qWL}E3jqEVADI9E literal 0 HcmV?d00001 From 591a58999911e9201edc065c48145fa15c7360ae Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 2 Mar 2026 09:45:11 -0800 Subject: [PATCH 05/11] remove extra files 
Signed-off-by: dimapihtar --- ${SAVE_DIR}/progress.txt | 1 - ...ts.out.tfevents.1772467438.017ddd03c11b.10202.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772467438.017ddd03c11b.10204.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772467705.017ddd03c11b.10985.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772467705.017ddd03c11b.10987.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772468322.017ddd03c11b.11735.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772468322.017ddd03c11b.11737.0 | Bin 88 -> 0 bytes 7 files changed, 1 deletion(-) delete mode 100644 ${SAVE_DIR}/progress.txt delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 diff --git a/${SAVE_DIR}/progress.txt b/${SAVE_DIR}/progress.txt deleted file mode 100644 index 0f93fe87b6..0000000000 --- a/${SAVE_DIR}/progress.txt +++ /dev/null @@ -1 +0,0 @@ -2026-03-02 17:36:47 Job ID: # GPUs: 2 Starting job diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 deleted file mode 100644 index 47455eb593a7985968b0da0f007801cec45ae4e1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzxFc-v^5n|aGoiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-QuJ5iC;%>_Ain?r diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 deleted file mode 100644 index 16cd1a614553ab5d1bb4a8f48af99b95d42cad04..0000000000000000000000000000000000000000 GIT binary 
patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzw$GtaQj&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{ZVJVPo007Lw9y|a5 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 deleted file mode 100644 index 94b1235eb75983635cd55b40b1b53304b49e69db..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzv*eoL#%&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{ZIX>x_2>>B$AeaCE diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 deleted file mode 100644 index 8c76604d9172bde22402d34062c209dbe5da6726..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzv{olLIF&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{Zxs>!^EdU}iAqM~e diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 b/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 deleted file mode 100644 index 0cbe458547164260d95245eee077c7cd5a239512..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzwiU7aqGmwC%kiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-@?qCcQ2-pXAmso6 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 b/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 deleted file mode 100644 index 71b6950ddc84bc0952de7b212ffadc31faae2c71..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzxh?Vll$mwC%kiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-qWL}E3jqEVADI9E From eb927cb2b87ef6bc6412ceceba85505a737f9d14 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 2 Mar 2026 13:17:18 -0800 Subject: [PATCH 
06/11] add mlm to mbridge ckpt tests Signed-off-by: dimapihtar --- ..._Launch_ckpts_mlm_to_mbridge_llama32_1b.sh | 28 ++++++++++++++++++ ...aunch_ckpts_mlm_to_mbridge_nemotronh_4b.sh | 29 +++++++++++++++++++ ...L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh | 28 ++++++++++++++++++ .../ckpts/llama32_1b/test_llama32_1b_ckpt.py | 15 ++++++---- .../nemotronh_4b/test_nemotronh_4b_ckpt.py | 15 ++++++---- .../ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 17 +++++++---- 6 files changed, 116 insertions(+), 16 deletions(-) create mode 100755 tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh create mode 100755 tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh create mode 100755 tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh new file mode 100755 index 0000000000..bbf174c4ee --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_core + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh new file mode 100755 index 0000000000..04f71056e4 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mcore + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mbridge +coverage combine -q + + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh new file mode 100755 index 0000000000..81e7a29ea2 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mcore + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py index ae5e11b83f..792ff7ca37 100644 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -41,9 +41,13 @@ def test_llama32_1B_ckpt_mbridge(self): config = llama32_1b_pretrain_config() + config.checkpoint.save = MBRIDGE_CKPT + config.checkpoint.load = MCORE_CKPT if 
os.path.exists(MCORE_CKPT) else None + config.checkpoint.load_optim = False + config.model.seq_length = 8192 - config.train.train_iters = 5 + config.train.train_iters = 10 if config.checkpoint.load else 5 config.train.eval_iters = 5 config.train.save_interval = 5 config.train.global_batch_size = 8 @@ -53,14 +57,15 @@ def test_llama32_1B_ckpt_mbridge(self): config.logger.log_interval = 1 - config.checkpoint.save = MBRIDGE_CKPT - pretrain(config=config, forward_step_func=forward_step) @pytest.mark.run_only_on("GPU") def test_llama32_1B_ckpt_core(self, monkeypatch): """Functional test for LLama MCore checkpoint.""" + load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None + train_iters = 10 if load_dir else 5 + # Set environment variables monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") @@ -98,14 +103,14 @@ def test_llama32_1B_ckpt_core(self, monkeypatch): "--max-position-embeddings", "8192", "--micro-batch-size", "1", "--global-batch-size", "8", - "--train-iters", "10", "--mock-data", "--tokenizer-type", "NullTokenizer", "--vocab-size", "131072", + "--train-iters", f"{train_iters}", "--save-interval", "5", "--eval-interval", "5", "--eval-iters", "5", - "--load", MBRIDGE_CKPT, + "--load", load_dir, "--save", MCORE_CKPT, "--ckpt-format", "torch_dist", "--log-progress", diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py index 3f4490e93d..2d049b3adb 100644 --- a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py @@ -41,10 +41,14 @@ def test_nemotronh_4b_ckpt_mbridge(self): config = nemotronh_4b_pretrain_config() + config.checkpoint.save = MBRIDGE_CKPT + config.checkpoint.load = MCORE_CKPT if os.path.exists(MCORE_CKPT) else None + config.checkpoint.load_optim = False + config.model.num_layers = 26 
config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*" - config.train.train_iters = 5 + config.train.train_iters = 10 if config.checkpoint.load else 5 config.train.eval_iters = 5 config.train.save_interval = 5 config.train.global_batch_size = 4 @@ -54,14 +58,15 @@ def test_nemotronh_4b_ckpt_mbridge(self): config.logger.log_interval = 1 - config.checkpoint.save = MBRIDGE_CKPT - pretrain(config=config, forward_step_func=forward_step) @pytest.mark.run_only_on("GPU") def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): """Functional test for Nemotron Hybrid MCore checkpoint.""" + load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None + train_iters = 10 if load_dir else 5 + # Set environment variables monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") @@ -103,14 +108,14 @@ def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): "--max-position-embeddings", "8192", "--micro-batch-size", "1", "--global-batch-size", "4", - "--train-iters", "10", "--mock-data", "--tokenizer-type", "NullTokenizer", "--vocab-size", "151936", + "--train-iters", f"{train_iters}", "--save-interval", "5", "--eval-interval", "5", "--eval-iters", "4", - "--load", MBRIDGE_CKPT, + "--load", load_dir, "--save", MCORE_CKPT, "--ckpt-format", "torch_dist", "--log-progress", diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py index 00769a40dd..5030d609da 100644 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ -40,10 +40,14 @@ def test_qwen3_4b_ckpt_mbridge(self): """Functional test for Qwen MBridge checkpoint.""" config = qwen3_4b_pretrain_config() - + + config.checkpoint.save = MBRIDGE_CKPT + config.checkpoint.load = MCORE_CKPT if os.path.exists(MCORE_CKPT) else None + config.checkpoint.load_optim = False + config.model.num_layers = 24 - config.train.train_iters = 
5 + config.train.train_iters = 10 if config.checkpoint.load else 5 config.train.eval_iters = 5 config.train.save_interval = 5 config.train.global_batch_size = 4 @@ -53,14 +57,15 @@ def test_qwen3_4b_ckpt_mbridge(self): config.logger.log_interval = 1 - config.checkpoint.save = MBRIDGE_CKPT - pretrain(config=config, forward_step_func=forward_step) @pytest.mark.run_only_on("GPU") def test_qwen3_4b_ckpt_mcore(self, monkeypatch): """Functional test for Qwen MCore checkpoint.""" + load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None + train_iters = 10 if load_dir else 5 + # Set environment variables monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") @@ -99,14 +104,14 @@ def test_qwen3_4b_ckpt_mcore(self, monkeypatch): "--max-position-embeddings", "4096", "--micro-batch-size", "1", "--global-batch-size", "4", - "--train-iters", "10", "--mock-data", "--tokenizer-type", "NullTokenizer", "--vocab-size", "151936", + "--train-iters", f"{train_iters}", "--save-interval", "5", "--eval-interval", "5", "--eval-iters", "4", - "--load", MBRIDGE_CKPT, + "--load", load_dir, "--save", MCORE_CKPT, "--ckpt-format", "torch_dist", "--log-progress", From 846de61db31bedbc0641b0b0fd410b9fbe095b95 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Mar 2026 06:15:39 -0800 Subject: [PATCH 07/11] fix code style Signed-off-by: dimapihtar --- .../ckpts/llama32_1b/test_llama32_1b_ckpt.py | 104 ++++++++++------ .../nemotronh_4b/test_nemotronh_4b_ckpt.py | 114 ++++++++++++------ .../ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 104 ++++++++++------ 3 files changed, 214 insertions(+), 108 deletions(-) diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py index 792ff7ca37..5a78ab6c0b 100644 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -15,10 +15,10 @@ 
"""Functional smoke tests for LLaMA checkpointing.""" import os -import pytest import shutil import sys +import pytest from torch.distributed.run import main as torchrun_main from megatron.bridge.recipes.llama import llama32_1b_pretrain_config @@ -78,48 +78,82 @@ def test_llama32_1B_ckpt_core(self, monkeypatch): "torchrun", "--nproc-per-node=2", "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py", - "--load", "/workspace/test_ckpts/llama32_1b_mbridge", - "--save", "/workspace/test_ckpts/llama32_1b_mcore", - "--init-method-std", "0.014", + "--load", + "/workspace/test_ckpts/llama32_1b_mbridge", + "--save", + "/workspace/test_ckpts/llama32_1b_mcore", + "--init-method-std", + "0.014", "--disable-bias-linear", "--use-rope-scaling", "--swiglu", "--use-rotary-position-embeddings", - "--num-layers", "16", - "--hidden-size", "2048", - "--num-attention-heads", "32", - "--ffn-hidden-size", "8192", - "--kv-channels", "64", + "--num-layers", + "16", + "--hidden-size", + "2048", + "--num-attention-heads", + "32", + "--ffn-hidden-size", + "8192", + "--kv-channels", + "64", "--group-query-attention", - "--position-embedding-type", "rope", - "--attention-backend", "fused", - "--num-query-groups", "8", - "--normalization", "RMSNorm", - "--attention-dropout", "0.0", - "--hidden-dropout", "0.0", - "--tensor-model-parallel-size", "1", - "--pipeline-model-parallel-size", "1", - "--seq-length", "8192", - "--max-position-embeddings", "8192", - "--micro-batch-size", "1", - "--global-batch-size", "8", + "--position-embedding-type", + "rope", + "--attention-backend", + "fused", + "--num-query-groups", + "8", + "--normalization", + "RMSNorm", + "--attention-dropout", + "0.0", + "--hidden-dropout", + "0.0", + "--tensor-model-parallel-size", + "1", + "--pipeline-model-parallel-size", + "1", + "--seq-length", + "8192", + "--max-position-embeddings", + "8192", + "--micro-batch-size", + "1", + "--global-batch-size", + "8", "--mock-data", - "--tokenizer-type", "NullTokenizer", - 
"--vocab-size", "131072", - "--train-iters", f"{train_iters}", - "--save-interval", "5", - "--eval-interval", "5", - "--eval-iters", "5", - "--load", load_dir, - "--save", MCORE_CKPT, - "--ckpt-format", "torch_dist", + "--tokenizer-type", + "NullTokenizer", + "--vocab-size", + "131072", + "--train-iters", + f"{train_iters}", + "--save-interval", + "5", + "--eval-interval", + "5", + "--eval-iters", + "5", + "--load", + load_dir, + "--save", + MCORE_CKPT, + "--ckpt-format", + "torch_dist", "--log-progress", "--bf16", - "--lr", "4.5e-4", - "--min-lr", "4.5e-5", - "--num-workers", "2", - "--tensorboard-dir", TB_DIR, - "--log-interval", "1", + "--lr", + "4.5e-4", + "--min-lr", + "4.5e-5", + "--num-workers", + "2", + "--tensorboard-dir", + TB_DIR, + "--log-interval", + "1", "--log-throughput", "--no-load-optim", ], diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py index 2d049b3adb..5a46e0e207 100644 --- a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py @@ -15,10 +15,10 @@ """Functional smoke tests for LLaMA recipe configurations.""" import os -import pytest import shutil import sys +import pytest from torch.distributed.run import main as torchrun_main from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config @@ -79,52 +79,90 @@ def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): "torchrun", "--nproc_per_node=2", "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py", - "--init-method-std", "0.014", + "--init-method-std", + "0.014", "--disable-bias-linear", "--use-rope-scaling", "--squared-relu", "--qk-layernorm", - "--rotary-percent", "1.0", - "--rotary-base", "1000000", + "--rotary-percent", + "1.0", + "--rotary-base", + "1000000", "--use-rotary-position-embeddings", - "--hybrid-override-pattern", "M-M-M-M*-M-M-M-M*-M-M-M-M*", - "--spec", 
"megatron.core.models.mamba.mamba_layer_specs", "mamba_stack_spec", - "--num-layers", "26", - "--hidden-size", "3072", - "--num-attention-heads", "32", - "--mamba-num-heads", "112", - "--ffn-hidden-size", "12288", - "--kv-channels", "128", + "--hybrid-override-pattern", + "M-M-M-M*-M-M-M-M*-M-M-M-M*", + "--spec", + "megatron.core.models.mamba.mamba_layer_specs", + "mamba_stack_spec", + "--num-layers", + "26", + "--hidden-size", + "3072", + "--num-attention-heads", + "32", + "--mamba-num-heads", + "112", + "--ffn-hidden-size", + "12288", + "--kv-channels", + "128", "--group-query-attention", - "--position-embedding-type", "none", - "--attention-backend", "fused", - "--num-query-groups", "8", - "--normalization", "RMSNorm", - "--attention-dropout", "0.0", - "--hidden-dropout", "0.0", - "--tensor-model-parallel-size", "2", - "--pipeline-model-parallel-size", "1", - "--seq-length", "8192", - "--max-position-embeddings", "8192", - "--micro-batch-size", "1", - "--global-batch-size", "4", + "--position-embedding-type", + "none", + "--attention-backend", + "fused", + "--num-query-groups", + "8", + "--normalization", + "RMSNorm", + "--attention-dropout", + "0.0", + "--hidden-dropout", + "0.0", + "--tensor-model-parallel-size", + "2", + "--pipeline-model-parallel-size", + "1", + "--seq-length", + "8192", + "--max-position-embeddings", + "8192", + "--micro-batch-size", + "1", + "--global-batch-size", + "4", "--mock-data", - "--tokenizer-type", "NullTokenizer", - "--vocab-size", "151936", - "--train-iters", f"{train_iters}", - "--save-interval", "5", - "--eval-interval", "5", - "--eval-iters", "4", - "--load", load_dir, - "--save", MCORE_CKPT, - "--ckpt-format", "torch_dist", + "--tokenizer-type", + "NullTokenizer", + "--vocab-size", + "151936", + "--train-iters", + f"{train_iters}", + "--save-interval", + "5", + "--eval-interval", + "5", + "--eval-iters", + "4", + "--load", + load_dir, + "--save", + MCORE_CKPT, + "--ckpt-format", + "torch_dist", "--log-progress", "--bf16", - 
"--lr", "4.5e-4", - "--min-lr", "4.5e-5", - "--num-workers", "2", - "--tensorboard-dir", TB_DIR, - "--log-interval", "1", + "--lr", + "4.5e-4", + "--min-lr", + "4.5e-5", + "--num-workers", + "2", + "--tensorboard-dir", + TB_DIR, + "--log-interval", + "1", "--log-throughput", "--no-load-optim", "--no-load-rng", diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py index 5030d609da..26282656ae 100644 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ -15,10 +15,10 @@ """Functional smoke tests for Qwen checkpointing.""" import os -import pytest import shutil import sys +import pytest from torch.distributed.run import main as torchrun_main from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config @@ -78,49 +78,83 @@ def test_qwen3_4b_ckpt_mcore(self, monkeypatch): "torchrun", "--nproc_per_node=2", "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py", - "--init-method-std", "0.014", + "--init-method-std", + "0.014", "--disable-bias-linear", "--use-rope-scaling", "--swiglu", "--qk-layernorm", - "--rotary-percent", "1.0", - "--rotary-base", "1000000", + "--rotary-percent", + "1.0", + "--rotary-base", + "1000000", "--use-rotary-position-embeddings", - "--num-layers", "24", - "--hidden-size", "2560", - "--num-attention-heads", "32", - "--ffn-hidden-size", "9728", - "--kv-channels", "128", + "--num-layers", + "24", + "--hidden-size", + "2560", + "--num-attention-heads", + "32", + "--ffn-hidden-size", + "9728", + "--kv-channels", + "128", "--group-query-attention", - "--position-embedding-type", "rope", - "--attention-backend", "fused", - "--num-query-groups", "8", - "--normalization", "RMSNorm", - "--attention-dropout", "0.0", - "--hidden-dropout", "0.0", - "--tensor-model-parallel-size", "2", - "--pipeline-model-parallel-size", "1", - "--seq-length", "4096", - "--max-position-embeddings", "4096", 
- "--micro-batch-size", "1", - "--global-batch-size", "4", + "--position-embedding-type", + "rope", + "--attention-backend", + "fused", + "--num-query-groups", + "8", + "--normalization", + "RMSNorm", + "--attention-dropout", + "0.0", + "--hidden-dropout", + "0.0", + "--tensor-model-parallel-size", + "2", + "--pipeline-model-parallel-size", + "1", + "--seq-length", + "4096", + "--max-position-embeddings", + "4096", + "--micro-batch-size", + "1", + "--global-batch-size", + "4", "--mock-data", - "--tokenizer-type", "NullTokenizer", - "--vocab-size", "151936", - "--train-iters", f"{train_iters}", - "--save-interval", "5", - "--eval-interval", "5", - "--eval-iters", "4", - "--load", load_dir, - "--save", MCORE_CKPT, - "--ckpt-format", "torch_dist", + "--tokenizer-type", + "NullTokenizer", + "--vocab-size", + "151936", + "--train-iters", + f"{train_iters}", + "--save-interval", + "5", + "--eval-interval", + "5", + "--eval-iters", + "4", + "--load", + load_dir, + "--save", + MCORE_CKPT, + "--ckpt-format", + "torch_dist", "--log-progress", "--bf16", - "--lr", "4.5e-4", - "--min-lr", "4.5e-5", - "--num-workers", "2", - "--tensorboard-dir", "/workspace/tb", - "--log-interval", "1", + "--lr", + "4.5e-4", + "--min-lr", + "4.5e-5", + "--num-workers", + "2", + "--tensorboard-dir", + "/workspace/tb", + "--log-interval", + "1", "--log-throughput", "--no-load-optim", "--no-load-rng", From 80dcef067d03914102cd096c0711a8932a11bb34 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Mar 2026 06:19:37 -0800 Subject: [PATCH 08/11] fix code style Signed-off-by: dimapihtar --- 3rdparty/Megatron-LM | 2 +- .../functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py | 1 - .../ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py | 4 ++-- tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM index 23dd639cf3..3d1a4ba71e 160000 --- a/3rdparty/Megatron-LM +++ 
b/3rdparty/Megatron-LM @@ -1 +1 @@ -Subproject commit 23dd639cf3de30f3b9d8d0fae71ee31180be9ddd +Subproject commit 3d1a4ba71ecc49f1a0c9480c90f819d2b00f9915 diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py index 5a78ab6c0b..2a1dd30bdf 100644 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -167,4 +167,3 @@ def test_remove_artifacts(self): shutil.rmtree(BASE_DIR) assert not os.path.exists(BASE_DIR) - diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py index 5a46e0e207..17510074dd 100644 --- a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py @@ -59,7 +59,7 @@ def test_nemotronh_4b_ckpt_mbridge(self): config.logger.log_interval = 1 pretrain(config=config, forward_step_func=forward_step) - + @pytest.mark.run_only_on("GPU") def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): """Functional test for Nemotron Hybrid MCore checkpoint.""" @@ -149,7 +149,7 @@ def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): load_dir, "--save", MCORE_CKPT, - "--ckpt-format", + "--ckpt-format", "torch_dist", "--log-progress", "--bf16", diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py index 26282656ae..9931c5bb05 100644 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ -168,4 +168,4 @@ def test_remove_artifacts(self): """Removes model artifacts""" shutil.rmtree(BASE_DIR) - assert not os.path.exists(BASE_DIR) \ No newline at end of file + assert not os.path.exists(BASE_DIR) From 038f18d762fb53ab5bba98f05c34b6611e9eaabf Mon Sep 17 00:00:00 2001 From: 
dimapihtar Date: Tue, 3 Mar 2026 06:24:55 -0800 Subject: [PATCH 09/11] revert mlm commit Signed-off-by: dimapihtar --- 3rdparty/Megatron-LM | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM index 3d1a4ba71e..23dd639cf3 160000 --- a/3rdparty/Megatron-LM +++ b/3rdparty/Megatron-LM @@ -1 +1 @@ -Subproject commit 3d1a4ba71ecc49f1a0c9480c90f819d2b00f9915 +Subproject commit 23dd639cf3de30f3b9d8d0fae71ee31180be9ddd From b96fda166394302a4ac2adef1e9d9afbb9b906fa Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Mar 2026 06:54:07 -0800 Subject: [PATCH 10/11] fix typi Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 6 +++--- tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ddccb0a5ff..c54a24163e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -380,9 +380,9 @@ jobs: # - script: L2_Launch_quantization_export - script: L2_Launch_recipes_llama_cuda_graphs - script: L2_Launch_utils - - script: L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh - - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh - - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh + - script: L2_Launch_ckpts_mbridge_to_mlm_llama32_1b + - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b + - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b needs: [pre-flight, cicd-unit-tests] runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2 if: | diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py index 9931c5bb05..9990b59679 100644 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ -45,8 +45,6 @@ def test_qwen3_4b_ckpt_mbridge(self): config.checkpoint.load = MCORE_CKPT if 
os.path.exists(MCORE_CKPT) else None config.checkpoint.load_optim = False - config.model.num_layers = 24 - config.train.train_iters = 10 if config.checkpoint.load else 5 config.train.eval_iters = 5 config.train.save_interval = 5 @@ -90,7 +88,7 @@ def test_qwen3_4b_ckpt_mcore(self, monkeypatch): "1000000", "--use-rotary-position-embeddings", "--num-layers", - "24", + "36", "--hidden-size", "2560", "--num-attention-heads", From 83fd74aa748fbd268227f12895be94070f7faae8 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Mar 2026 09:02:58 -0800 Subject: [PATCH 11/11] run mlm to mbridge ckpt tests Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index c54a24163e..a312690372 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -381,8 +381,11 @@ jobs: - script: L2_Launch_recipes_llama_cuda_graphs - script: L2_Launch_utils - script: L2_Launch_ckpts_mbridge_to_mlm_llama32_1b + - script: L2_Launch_ckpts_mlm_to_mbridge_llama32_1b - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b + - script: L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b + - script: L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b needs: [pre-flight, cicd-unit-tests] runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2 if: |