diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7257c75c46..a312690372 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -380,6 +380,12 @@ jobs: # - script: L2_Launch_quantization_export - script: L2_Launch_recipes_llama_cuda_graphs - script: L2_Launch_utils + - script: L2_Launch_ckpts_mbridge_to_mlm_llama32_1b + - script: L2_Launch_ckpts_mlm_to_mbridge_llama32_1b + - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b + - script: L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b + - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b + - script: L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b needs: [pre-flight, cicd-unit-tests] runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2 if: | diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh new file mode 100755 index 0000000000..aeadc54ddd --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_core + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh new file mode 100755 index 0000000000..c39b782f98 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mcore + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh new file mode 100755 index 0000000000..e392f76aad --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mcore + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh new file mode 100755 index 0000000000..bbf174c4ee --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_core + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh new file mode 100755 index 0000000000..04f71056e4 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA 
CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mcore + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mbridge +coverage combine -q + + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh new file mode 100755 index 0000000000..81e7a29ea2 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh @@ -0,0 +1,28 
@@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mcore + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py new file mode 100644 index 0000000000..2a1dd30bdf --- /dev/null +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -0,0 +1,169 @@ 
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functional smoke tests for LLaMA checkpointing."""
+
+import os
+import shutil
+import sys
+
+import pytest
+from torch.distributed.run import main as torchrun_main
+
+from megatron.bridge.recipes.llama import llama32_1b_pretrain_config
+from megatron.bridge.training.gpt_step import forward_step
+from megatron.bridge.training.pretrain import pretrain
+
+
+# Single artifact root so test_remove_artifacts can clean everything in one pass.
+BASE_DIR = "/workspace/test_ckpts/llama32_1b"
+MBRIDGE_CKPT = f"{BASE_DIR}/mbridge"  # checkpoint written by Megatron-Bridge
+MCORE_CKPT = f"{BASE_DIR}/mcore"  # checkpoint written by Megatron-LM (MCore)
+TB_DIR = f"{BASE_DIR}/tb"  # tensorboard logs
+
+
+class TestLlama32Ckpt:
+    """Test class for Llama checkpoint functional tests."""
+
+    @pytest.mark.run_only_on("GPU")
+    def test_llama32_1B_ckpt_mbridge(self):
+        """Functional test for Llama MBridge checkpoint.
+
+        Trains with Megatron-Bridge and saves an MBridge checkpoint; resumes
+        from the MCore checkpoint when one already exists (mlm -> mbridge).
+        """
+        config = llama32_1b_pretrain_config()
+
+        config.checkpoint.save = MBRIDGE_CKPT
+        config.checkpoint.load = MCORE_CKPT if os.path.exists(MCORE_CKPT) else None
+        config.checkpoint.load_optim = False
+
+        config.model.seq_length = 8192
+
+        # When resuming, run 5 more iters on top of the 5 already checkpointed.
+        config.train.train_iters = 10 if config.checkpoint.load else 5
+        config.train.eval_iters = 5
+        config.train.save_interval = 5
+        config.train.global_batch_size = 8
+        config.train.micro_batch_size = 1
+
+        config.scheduler.lr_warmup_iters = 2
+
+        config.logger.log_interval = 1
+
+        pretrain(config=config, forward_step_func=forward_step)
+
+    @pytest.mark.run_only_on("GPU")
+    def test_llama32_1B_ckpt_core(self, monkeypatch):
+        """Functional test for Llama MCore checkpoint.
+
+        Trains with Megatron-LM and saves an MCore checkpoint; resumes from
+        the MBridge checkpoint when one already exists (mbridge -> mlm).
+        """
+        load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None
+        train_iters = 10 if load_dir else 5
+
+        # Set environment variables
+        monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1")
+        monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1")
+
+        # Build the MLM command line.  FIX: the previous version passed a
+        # duplicated, hard-coded --load/--save pair pointing at stale paths
+        # (llama32_1b_mbridge / llama32_1b_mcore, which do not match the
+        # module constants), and passed load_dir unconditionally even when it
+        # is None -- sys.argv entries must all be strings, so --load is now
+        # appended only when a checkpoint actually exists.
+        argv = [
+            "torchrun",
+            "--nproc-per-node=2",
+            "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py",
+            "--init-method-std",
+            "0.014",
+            "--disable-bias-linear",
+            "--use-rope-scaling",
+            "--swiglu",
+            "--use-rotary-position-embeddings",
+            "--num-layers",
+            "16",
+            "--hidden-size",
+            "2048",
+            "--num-attention-heads",
+            "32",
+            "--ffn-hidden-size",
+            "8192",
+            "--kv-channels",
+            "64",
+            "--group-query-attention",
+            "--position-embedding-type",
+            "rope",
+            "--attention-backend",
+            "fused",
+            "--num-query-groups",
+            "8",
+            "--normalization",
+            "RMSNorm",
+            "--attention-dropout",
+            "0.0",
+            "--hidden-dropout",
+            "0.0",
+            "--tensor-model-parallel-size",
+            "1",
+            "--pipeline-model-parallel-size",
+            "1",
+            "--seq-length",
+            "8192",
+            "--max-position-embeddings",
+            "8192",
+            "--micro-batch-size",
+            "1",
+            "--global-batch-size",
+            "8",
+            "--mock-data",
+            "--tokenizer-type",
+            "NullTokenizer",
+            "--vocab-size",
+            "131072",
+            "--train-iters",
+            f"{train_iters}",
+            "--save-interval",
+            "5",
+            "--eval-interval",
+            "5",
+            "--eval-iters",
+            "5",
+            "--save",
+            MCORE_CKPT,
+            "--ckpt-format",
+            "torch_dist",
+            "--log-progress",
+            "--bf16",
+            "--lr",
+            "4.5e-4",
+            "--min-lr",
+            "4.5e-5",
+            "--num-workers",
+            "2",
+            "--tensorboard-dir",
+            TB_DIR,
+            "--log-interval",
+            "1",
+            "--log-throughput",
+            "--no-load-optim",
+        ]
+        if load_dir is not None:
+            argv += ["--load", load_dir]
+        monkeypatch.setattr(sys, "argv", argv)
+
+        # Run MLM script
+        torchrun_main()
+
+    def test_remove_artifacts(self):
+        """Removes model artifacts"""
+        shutil.rmtree(BASE_DIR)
+
+        assert not
os.path.exists(BASE_DIR)
diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py
new file mode 100644
index 0000000000..17510074dd
--- /dev/null
+++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functional smoke tests for Nemotron-H checkpointing."""
+
+import os
+import shutil
+import sys
+
+import pytest
+from torch.distributed.run import main as torchrun_main
+
+from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config
+from megatron.bridge.training.gpt_step import forward_step
+from megatron.bridge.training.pretrain import pretrain
+
+
+# Single artifact root so test_remove_artifacts can clean everything in one pass.
+BASE_DIR = "/workspace/test_ckpts/nemotronh_4b"
+MBRIDGE_CKPT = f"{BASE_DIR}/mbridge"  # checkpoint written by Megatron-Bridge
+MCORE_CKPT = f"{BASE_DIR}/mcore"  # checkpoint written by Megatron-LM (MCore)
+TB_DIR = f"{BASE_DIR}/tb"  # tensorboard logs
+
+
+class TestNemotronhCkpt:
+    """Test class for Nemotron Hybrid checkpoint functional tests."""
+
+    @pytest.mark.run_only_on("GPU")
+    def test_nemotronh_4b_ckpt_mbridge(self):
+        """Functional test for Nemotron Hybrid MBridge checkpoint.
+
+        Trains with Megatron-Bridge and saves an MBridge checkpoint; resumes
+        from the MCore checkpoint when one already exists (mlm -> mbridge).
+        """
+        config = nemotronh_4b_pretrain_config()
+
+        config.checkpoint.save = MBRIDGE_CKPT
+        config.checkpoint.load = MCORE_CKPT if os.path.exists(MCORE_CKPT) else None
+        config.checkpoint.load_optim = False
+
+        # Shrink the model; the override pattern must match the MLM args below.
+        config.model.num_layers = 26
+        config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*"
+
+        # When resuming, run 5 more iters on top of the 5 already checkpointed.
+        config.train.train_iters = 10 if config.checkpoint.load else 5
+        config.train.eval_iters = 5
+        config.train.save_interval = 5
+        config.train.global_batch_size = 4
+        config.train.micro_batch_size = 1
+
+        config.scheduler.lr_warmup_iters = 2
+
+        config.logger.log_interval = 1
+
+        pretrain(config=config, forward_step_func=forward_step)
+
+    @pytest.mark.run_only_on("GPU")
+    def test_nemotronh_4b_ckpt_mcore(self, monkeypatch):
+        """Functional test for Nemotron Hybrid MCore checkpoint.
+
+        Trains with Megatron-LM and saves an MCore checkpoint; resumes from
+        the MBridge checkpoint when one already exists (mbridge -> mlm).
+        """
+        load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None
+        train_iters = 10 if load_dir else 5
+
+        # Set environment variables
+        monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1")
+        monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1")
+
+        # Build the MLM command line.  FIX: load_dir was previously passed
+        # unconditionally even when None -- sys.argv entries must all be
+        # strings, so --load is now appended only when a checkpoint exists.
+        argv = [
+            "torchrun",
+            "--nproc_per_node=2",
+            "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py",
+            "--init-method-std",
+            "0.014",
+            "--disable-bias-linear",
+            "--use-rope-scaling",
+            "--squared-relu",
+            "--qk-layernorm",
+            "--rotary-percent",
+            "1.0",
+            "--rotary-base",
+            "1000000",
+            "--use-rotary-position-embeddings",
+            "--hybrid-override-pattern",
+            "M-M-M-M*-M-M-M-M*-M-M-M-M*",
+            "--spec",
+            "megatron.core.models.mamba.mamba_layer_specs",
+            "mamba_stack_spec",
+            "--num-layers",
+            "26",
+            "--hidden-size",
+            "3072",
+            "--num-attention-heads",
+            "32",
+            "--mamba-num-heads",
+            "112",
+            "--ffn-hidden-size",
+            "12288",
+            "--kv-channels",
+            "128",
+            "--group-query-attention",
+            "--position-embedding-type",
+            "none",
+            "--attention-backend",
+            "fused",
+            "--num-query-groups",
+            "8",
+            "--normalization",
+            "RMSNorm",
+            "--attention-dropout",
+            "0.0",
+            "--hidden-dropout",
+            "0.0",
+            "--tensor-model-parallel-size",
+            "2",
+            "--pipeline-model-parallel-size",
+            "1",
+            "--seq-length",
+            "8192",
+            "--max-position-embeddings",
+            "8192",
+            "--micro-batch-size",
+            "1",
+            "--global-batch-size",
+            "4",
+            "--mock-data",
+            "--tokenizer-type",
+            "NullTokenizer",
+            "--vocab-size",
+            "151936",
+            "--train-iters",
+            f"{train_iters}",
+            "--save-interval",
+            "5",
+            "--eval-interval",
+            "5",
+            "--eval-iters",
+            "4",
+            "--save",
+            MCORE_CKPT,
+            "--ckpt-format",
+            "torch_dist",
+            "--log-progress",
+            "--bf16",
+            "--lr",
+            "4.5e-4",
+            "--min-lr",
+            "4.5e-5",
+            "--num-workers",
+            "2",
+            "--tensorboard-dir",
+            TB_DIR,
+            "--log-interval",
+            "1",
+            "--log-throughput",
+            "--no-load-optim",
+            "--no-load-rng",
+        ]
+        if load_dir is not None:
+            argv += ["--load", load_dir]
+        monkeypatch.setattr(sys, "argv", argv)
+
+        # Run MLM script
+        torchrun_main()
+
+    def test_remove_artifacts(self):
+        """Removes model artifacts"""
+        shutil.rmtree(BASE_DIR)
+
+        assert not os.path.exists(BASE_DIR)
diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py
new file mode 100644
index 0000000000..9990b59679
--- /dev/null
+++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functional smoke tests for Qwen checkpointing."""
+
+import os
+import shutil
+import sys
+
+import pytest
+from torch.distributed.run import main as torchrun_main
+
+from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config
+from megatron.bridge.training.gpt_step import forward_step
+from megatron.bridge.training.pretrain import pretrain
+
+
+# Single artifact root so test_remove_artifacts can clean everything in one pass.
+BASE_DIR = "/workspace/test_ckpts/qwen3_4b"
+MBRIDGE_CKPT = f"{BASE_DIR}/mbridge"  # checkpoint written by Megatron-Bridge
+MCORE_CKPT = f"{BASE_DIR}/mcore"  # checkpoint written by Megatron-LM (MCore)
+TB_DIR = f"{BASE_DIR}/tb"  # tensorboard logs
+
+
+class TestQwen3Ckpt:
+    """Test class for Qwen checkpoint functional tests."""
+
+    @pytest.mark.run_only_on("GPU")
+    def test_qwen3_4b_ckpt_mbridge(self):
+        """Functional test for Qwen MBridge checkpoint.
+
+        Trains with Megatron-Bridge and saves an MBridge checkpoint; resumes
+        from the MCore checkpoint when one already exists (mlm -> mbridge).
+        """
+        config = qwen3_4b_pretrain_config()
+
+        config.checkpoint.save = MBRIDGE_CKPT
+        config.checkpoint.load = MCORE_CKPT if os.path.exists(MCORE_CKPT) else None
+        config.checkpoint.load_optim = False
+
+        # When resuming, run 5 more iters on top of the 5 already checkpointed.
+        config.train.train_iters = 10 if config.checkpoint.load else 5
+        config.train.eval_iters = 5
+        config.train.save_interval = 5
+        config.train.global_batch_size = 4
+        config.train.micro_batch_size = 1
+
+        config.scheduler.lr_warmup_iters = 2
+
+        config.logger.log_interval = 1
+
+        pretrain(config=config, forward_step_func=forward_step)
+
+    @pytest.mark.run_only_on("GPU")
+    def test_qwen3_4b_ckpt_mcore(self, monkeypatch):
+        """Functional test for Qwen MCore checkpoint.
+
+        Trains with Megatron-LM and saves an MCore checkpoint; resumes from
+        the MBridge checkpoint when one already exists (mbridge -> mlm).
+        """
+        load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None
+        train_iters = 10 if load_dir else 5
+
+        # Set environment variables
+        monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1")
+        monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1")
+
+        # Build the MLM command line.  FIX: load_dir was previously passed
+        # unconditionally even when None -- sys.argv entries must all be
+        # strings, so --load is now appended only when a checkpoint exists.
+        # FIX: --tensorboard-dir now uses TB_DIR (under BASE_DIR) instead of a
+        # hard-coded /workspace/tb that test_remove_artifacts never cleaned.
+        argv = [
+            "torchrun",
+            "--nproc_per_node=2",
+            "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py",
+            "--init-method-std",
+            "0.014",
+            "--disable-bias-linear",
+            "--use-rope-scaling",
+            "--swiglu",
+            "--qk-layernorm",
+            "--rotary-percent",
+            "1.0",
+            "--rotary-base",
+            "1000000",
+            "--use-rotary-position-embeddings",
+            "--num-layers",
+            "36",
+            "--hidden-size",
+            "2560",
+            "--num-attention-heads",
+            "32",
+            "--ffn-hidden-size",
+            "9728",
+            "--kv-channels",
+            "128",
+            "--group-query-attention",
+            "--position-embedding-type",
+            "rope",
+            "--attention-backend",
+            "fused",
+            "--num-query-groups",
+            "8",
+            "--normalization",
+            "RMSNorm",
+            "--attention-dropout",
+            "0.0",
+            "--hidden-dropout",
+            "0.0",
+            "--tensor-model-parallel-size",
+            "2",
+            "--pipeline-model-parallel-size",
+            "1",
+            "--seq-length",
+            "4096",
+            "--max-position-embeddings",
+            "4096",
+            "--micro-batch-size",
+            "1",
+            "--global-batch-size",
+            "4",
+            "--mock-data",
+            "--tokenizer-type",
+            "NullTokenizer",
+            "--vocab-size",
+            "151936",
+            "--train-iters",
+            f"{train_iters}",
+            "--save-interval",
+            "5",
+            "--eval-interval",
+            "5",
+            "--eval-iters",
+            "4",
+            "--save",
+            MCORE_CKPT,
+            "--ckpt-format",
+            "torch_dist",
+            "--log-progress",
+            "--bf16",
+            "--lr",
+            "4.5e-4",
+            "--min-lr",
+            "4.5e-5",
+            "--num-workers",
+            "2",
+            "--tensorboard-dir",
+            TB_DIR,
+            "--log-interval",
+            "1",
+            "--log-throughput",
+            "--no-load-optim",
+            "--no-load-rng",
+        ]
+        if load_dir is not None:
+            argv += ["--load", load_dir]
+        monkeypatch.setattr(sys, "argv", argv)
+
+        # Run MLM script
+        torchrun_main()
+
+    def test_remove_artifacts(self):
+        """Removes model artifacts"""
+        shutil.rmtree(BASE_DIR)
+
+        assert not os.path.exists(BASE_DIR)