From 0ab5f13d1043bfa45aa4828e4b7ab5a00906d9c1 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 23 Feb 2026 10:53:58 -0800 Subject: [PATCH 01/11] add ckpt tests Signed-off-by: dimapihtar --- ...2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh | 24 ++++++++++ .../ckpts/llama3_1b/test_llama3_1b_mbridge.py | 45 +++++++++++++++++++ .../ckpts/llama3_1b/test_llama3_1b_mcore.sh | 45 +++++++++++++++++++ 3 files changed, 114 insertions(+) create mode 100755 tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh create mode 100644 tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py create mode 100755 tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh new file mode 100755 index 0000000000..c988c7a766 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py +coverage combine -q diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py new file mode 100644 index 0000000000..466fd13d34 --- /dev/null +++ b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py @@ -0,0 +1,45 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional smoke tests for LLaMA recipe configurations.""" + +import pytest + +from megatron.bridge.recipes.llama import llama32_1b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +class TestLlama3MBridgeCkpt: + """Test class for LLaMA recipe functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_llama_pretrain_recipes(self): + """Functional test for LLaMA recipes with appropriate parallelism configurations.""" + + config = llama32_1b_pretrain_config() + + config.model.seq_length = 8192 + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 8 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + pretrain(config=config, forward_step_func=forward_step) \ No newline at end of file diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh new file mode 100755 index 0000000000..89ed67ba03 --- /dev/null +++ b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh @@ -0,0 +1,45 @@ +CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/megatron-lm/pretrain_gpt.py \ + --init-method-std 0.014 \ + --disable-bias-linear \ + --use-rope-scaling \ + --swiglu \ + --use-rotary-position-embeddings \ + --num-layers 16 \ + --hidden-size 2048 \ + --num-attention-heads 32 \ + --ffn-hidden-size 8192 \ + --kv-channels 64 \ + --group-query-attention \ + --position-embedding-type rope \ + --attention-backend fused \ + --num-query-groups 8 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --micro-batch-size 1 \ + --global-batch-size 8 \ + --train-iters 5 \ + --log-interval 1 \ + 
--tokenizer-type SentencePieceTokenizer \ + --tokenizer-model /opt/data/tokenizers/sentencepiece/tokenizer.model \ + --vocab-size 131072 \ + --save-interval 5 \ + --eval-interval 5 \ + --eval-iters 4 \ + --load /path/to/mbridge/ckpt \ + --save /path/to/save/ckpt \ + --ckpt-format torch_dist \ + --log-progress \ + --bf16 \ + --lr 4.5e-4 \ + --min-lr 4.5e-5 \ + --num-workers 2 \ + --tensorboard-dir /workspace/tb \ + --log-straggler \ + --log-interval 1 \ + --log-throughput \ + --no-load-optim From c352dbb86a263240d57ef944c4c6469fd4048464 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 26 Feb 2026 11:14:47 -0800 Subject: [PATCH 02/11] fix llama32_1b test Signed-off-by: dimapihtar --- ...2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh | 2 ++ .../ckpts/llama3_1b/test_llama3_1b_mbridge.py | 19 ++++++++++++++++-- .../ckpts/llama3_1b/test_llama3_1b_mcore.sh | 20 +++++++++++-------- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh index c988c7a766..7a178110b5 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh @@ -22,3 +22,5 @@ export CUDA_VISIBLE_DEVICES="0,1" # they can run basic training without crashes uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py coverage combine -q + +bash tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py index 466fd13d34..3d774b6473 100644 --- 
a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py +++ b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py @@ -15,6 +15,7 @@ """Functional smoke tests for LLaMA recipe configurations.""" import pytest +import subprocess from megatron.bridge.recipes.llama import llama32_1b_pretrain_config from megatron.bridge.training.gpt_step import forward_step @@ -25,7 +26,7 @@ class TestLlama3MBridgeCkpt: """Test class for LLaMA recipe functional tests.""" @pytest.mark.run_only_on("GPU") - def test_llama_pretrain_recipes(self): + def test_llama32_1B_ckpt_mbridge(self): """Functional test for LLaMA recipes with appropriate parallelism configurations.""" config = llama32_1b_pretrain_config() @@ -42,4 +43,18 @@ def test_llama_pretrain_recipes(self): config.logger.log_interval = 1 - pretrain(config=config, forward_step_func=forward_step) \ No newline at end of file + config.checkpoint.save = "/workspace/test_ckpts/llama32_1b_mbridge" + + pretrain(config=config, forward_step_func=forward_step) + + @pytest.mark.run_only_on("GPU") + def test_llama32_1B_ckpt_mcore(self): + """Functional test for LLaMA recipes with appropriate parallelism configurations.""" + + script_path = "test_llama3_1b_mcore.sh" + process = subprocess.Popen( + ["bash", script_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh index 89ed67ba03..16d686fd71 100755 --- a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh +++ b/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh @@ -1,4 +1,7 @@ -CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/megatron-lm/pretrain_gpt.py \ +LOAD_DIR=/workspace/test_ckpts/llama32_1b_mbridge +SAVE_DIR=/workspace/test_ckpts/llama32_1b_mcore + +CUDA_VISIBLE_DEVICES=0,1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 
/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py \ --init-method-std 0.014 \ --disable-bias-linear \ --use-rope-scaling \ @@ -22,16 +25,15 @@ CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/megatron-lm/pretr --max-position-embeddings 8192 \ --micro-batch-size 1 \ --global-batch-size 8 \ - --train-iters 5 \ - --log-interval 1 \ - --tokenizer-type SentencePieceTokenizer \ - --tokenizer-model /opt/data/tokenizers/sentencepiece/tokenizer.model \ + --train-iters 10 \ + --mock-data \ + --tokenizer-type NullTokenizer \ --vocab-size 131072 \ --save-interval 5 \ --eval-interval 5 \ --eval-iters 4 \ - --load /path/to/mbridge/ckpt \ - --save /path/to/save/ckpt \ + --load ${LOAD_DIR} \ + --save ${SAVE_DIR} \ --ckpt-format torch_dist \ --log-progress \ --bf16 \ @@ -39,7 +41,9 @@ CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/megatron-lm/pretr --min-lr 4.5e-5 \ --num-workers 2 \ --tensorboard-dir /workspace/tb \ - --log-straggler \ --log-interval 1 \ --log-throughput \ --no-load-optim + +echo rm -rf ${LOAD_DIR} +echo rm -rf ${SAVE_DIR} From 0c25e0bcdf14c169f0cb5a5c989a0c8d94d2b14e Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 27 Feb 2026 14:42:03 -0800 Subject: [PATCH 03/11] add qwen3_4b & nemotronh_4b ckpt tests Signed-off-by: dimapihtar --- ..._Launch_ckpts_mbridge_to_mlm_llama32_1b.sh | 26 +++++++++ ...aunch_ckpts_mbridge_to_mlm_nemotronh_4b.sh | 26 +++++++++ ...2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh} | 6 +- .../test_llama32_1b_mbridge.py} | 13 +---- .../test_llama32_1b_mcore.sh} | 0 .../nemotronh_4b/test_nemotronh_4b_mbridge.py | 50 +++++++++++++++++ .../nemotronh_4b/test_nemotronh_4b_mcore.sh | 56 +++++++++++++++++++ .../ckpts/qwen3_4b/test_qwen3_4b_mbridge.py | 47 ++++++++++++++++ .../ckpts/qwen3_4b/test_qwen3_4b_mcore.sh | 53 ++++++++++++++++++ 9 files changed, 262 insertions(+), 15 deletions(-) create mode 100755 tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh create mode 100755 
tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh rename tests/functional_tests/{L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh => L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh} (84%) rename tests/functional_tests/ckpts/{llama3_1b/test_llama3_1b_mbridge.py => llama32_1b/test_llama32_1b_mbridge.py} (79%) rename tests/functional_tests/ckpts/{llama3_1b/test_llama3_1b_mcore.sh => llama32_1b/test_llama32_1b_mcore.sh} (100%) create mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py create mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh create mode 100644 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py create mode 100755 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh new file mode 100755 index 0000000000..d28728b2f0 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py +coverage combine -q + +bash tests/functional_tests/ckpts/llama3_1b/test_llama32_1b_mcore.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh new file mode 100755 index 0000000000..f0f5c3bbaa --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py +coverage combine -q + +bash tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh similarity index 84% rename from tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh rename to tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh index 7a178110b5..21c5083c2c 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama3_1b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ export CUDA_VISIBLE_DEVICES="0,1" # Run recipe functional tests on 2 GPUs # This script tests recipe configurations with their default settings to ensure # they can run basic training without crashes -uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py coverage combine -q -bash tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh +bash tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py similarity index 79% rename from tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py rename to tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py index 3d774b6473..199f083140 100644 --- a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mbridge.py +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py @@ -22,7 +22,7 @@ from megatron.bridge.training.pretrain import pretrain -class TestLlama3MBridgeCkpt: +class TestLlama32MBridgeCkpt: """Test class for LLaMA recipe functional tests.""" @pytest.mark.run_only_on("GPU") @@ -47,14 +47,3 @@ def test_llama32_1B_ckpt_mbridge(self): pretrain(config=config, forward_step_func=forward_step) - @pytest.mark.run_only_on("GPU") - def test_llama32_1B_ckpt_mcore(self): - """Functional test for LLaMA recipes with 
appropriate parallelism configurations.""" - - script_path = "test_llama3_1b_mcore.sh" - process = subprocess.Popen( - ["bash", script_path], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) diff --git a/tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh similarity index 100% rename from tests/functional_tests/ckpts/llama3_1b/test_llama3_1b_mcore.sh rename to tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py new file mode 100644 index 0000000000..bfab5ab120 --- /dev/null +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py @@ -0,0 +1,50 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional smoke tests for LLaMA recipe configurations.""" + +import pytest +import subprocess + +from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +class TestNemotronhMBridgeCkpt: + """Test class for LLaMA recipe functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_nemotronh_4b_ckpt_mbridge(self): + """Functional test for LLaMA recipes with appropriate parallelism configurations.""" + + config = nemotronh_4b_pretrain_config() + + config.model.num_layers = 26 + config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*" + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 4 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = "/workspace/test_ckpts/nemotronh_4b_mbridge" + + pretrain(config=config, forward_step_func=forward_step) + diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh new file mode 100644 index 0000000000..cf3d80c4e5 --- /dev/null +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh @@ -0,0 +1,56 @@ +LOAD_DIR=/workspace/test_ckpts/nemotronh_4b_mbridge +SAVE_DIR=/workspace/test_ckpts/nemotronh_4b_mcore + +CUDA_VISIBLE_DEVICES=0,1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py \ + --init-method-std 0.014 \ + --disable-bias-linear \ + --use-rope-scaling \ + --squared-relu \ + --qk-layernorm \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --use-rotary-position-embeddings \ + --hybrid-override-pattern "M-M-M-M*-M-M-M-M*-M-M-M-M*" \ + --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ + --num-layers 26 \ + 
--hidden-size 3072 \ + --num-attention-heads 32 \ + --mamba-num-heads 112 \ + --ffn-hidden-size 12288 \ + --kv-channels 128 \ + --group-query-attention \ + --position-embedding-type none \ + --attention-backend fused \ + --num-query-groups 8 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size 2 \ + --pipeline-model-parallel-size 1 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --micro-batch-size 1 \ + --global-batch-size 4 \ + --train-iters 10 \ + --mock-data \ + --tokenizer-type NullTokenizer \ + --vocab-size 151936 \ + --save-interval 5 \ + --eval-interval 5 \ + --eval-iters 4 \ + --load ${LOAD_DIR} \ + --save ${SAVE_DIR} \ + --ckpt-format torch_dist \ + --log-progress \ + --bf16 \ + --lr 4.5e-4 \ + --min-lr 4.5e-5 \ + --num-workers 2 \ + --tensorboard-dir /workspace/tb \ + --log-interval 1 \ + --log-throughput \ + --no-load-optim \ + --no-load-rng + +echo rm -rf ${LOAD_DIR} +echo rm -rf ${SAVE_DIR} diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py new file mode 100644 index 0000000000..ac362ea2ed --- /dev/null +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional smoke tests for LLaMA recipe configurations.""" + +import pytest +import subprocess + +from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +class TestQwen3MBridgeCkpt: + """Test class for LLaMA recipe functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_qwen3_4b_ckpt_mbridge(self): + """Functional test for LLaMA recipes with appropriate parallelism configurations.""" + + config = qwen3_4b_pretrain_config() + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 4 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = "/workspace/test_ckpts/qwen3_4b_mbridge" + + pretrain(config=config, forward_step_func=forward_step) + diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh new file mode 100755 index 0000000000..9aa93b06da --- /dev/null +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh @@ -0,0 +1,53 @@ +LOAD_DIR=/workspace/test_ckpts/qwen3_4b_mbridge +SAVE_DIR=/workspace/test_ckpts/qwen3_4b_mcore + +CUDA_VISIBLE_DEVICES=0,1,2,3 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py \ + --init-method-std 0.014 \ + --disable-bias-linear \ + --use-rope-scaling \ + --swiglu \ + --qk-layernorm \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --use-rotary-position-embeddings \ + --num-layers 36 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --ffn-hidden-size 9728 \ + --kv-channels 128 \ + --group-query-attention \ + --position-embedding-type rope \ + --attention-backend fused \ + --num-query-groups 8 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 
\ + --tensor-model-parallel-size 2 \ + --pipeline-model-parallel-size 1 \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --micro-batch-size 1 \ + --global-batch-size 4 \ + --train-iters 10 \ + --mock-data \ + --tokenizer-type NullTokenizer \ + --vocab-size 151936 \ + --save-interval 5 \ + --eval-interval 5 \ + --eval-iters 4 \ + --load ${LOAD_DIR} \ + --save ${SAVE_DIR} \ + --ckpt-format torch_dist \ + --log-progress \ + --bf16 \ + --lr 4.5e-4 \ + --min-lr 4.5e-5 \ + --num-workers 2 \ + --tensorboard-dir /workspace/tb \ + --log-interval 1 \ + --log-throughput \ + --no-load-optim \ + --no-load-rng + +echo rm -rf ${LOAD_DIR} +echo rm -rf ${SAVE_DIR} From 5bbc5424a9a556f949ca02e3c9a587806d75e548 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 2 Mar 2026 09:44:15 -0800 Subject: [PATCH 04/11] refactor checkpointing tests Signed-off-by: dimapihtar --- ${SAVE_DIR}/progress.txt | 1 + .github/workflows/cicd-main.yml | 3 + ..._Launch_ckpts_mbridge_to_mlm_llama32_1b.sh | 6 +- ...aunch_ckpts_mbridge_to_mlm_nemotronh_4b.sh | 6 +- ...L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh | 6 +- .../ckpts/llama32_1b/test_llama32_1b_ckpt.py | 131 +++++++++++++++++ .../llama32_1b/test_llama32_1b_mbridge.py | 49 ------- .../ckpts/llama32_1b/test_llama32_1b_mcore.sh | 49 ------- .../nemotronh_4b/test_nemotronh_4b_ckpt.py | 136 ++++++++++++++++++ .../nemotronh_4b/test_nemotronh_4b_mbridge.py | 50 ------- .../nemotronh_4b/test_nemotronh_4b_mcore.sh | 56 -------- .../ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 132 +++++++++++++++++ .../ckpts/qwen3_4b/test_qwen3_4b_mbridge.py | 47 ------ .../ckpts/qwen3_4b/test_qwen3_4b_mcore.sh | 53 ------- ...t.tfevents.1772467438.017ddd03c11b.10202.0 | Bin 0 -> 88 bytes ...t.tfevents.1772467438.017ddd03c11b.10204.0 | Bin 0 -> 88 bytes ...t.tfevents.1772467705.017ddd03c11b.10985.0 | Bin 0 -> 88 bytes ...t.tfevents.1772467705.017ddd03c11b.10987.0 | Bin 0 -> 88 bytes ...t.tfevents.1772468322.017ddd03c11b.11735.0 | Bin 0 -> 88 bytes 
...t.tfevents.1772468322.017ddd03c11b.11737.0 | Bin 0 -> 88 bytes 19 files changed, 414 insertions(+), 310 deletions(-) create mode 100644 tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py delete mode 100644 tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py delete mode 100755 tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh create mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py delete mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py delete mode 100644 tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh create mode 100644 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py delete mode 100644 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py delete mode 100755 tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 create mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a53e0e92e1..191d0a57a9 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -376,6 +376,9 @@ jobs: - script: L2_Launch_quantization_export - script: L2_Launch_recipes_llama_cuda_graphs - script: L2_Launch_utils + - script:
L2_Launch_ckpts_mbridge_to_mlm_llama32_1b + - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b + - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b needs: [pre-flight, cicd-unit-tests] runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2 if: | diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh index d28728b2f0..aeadc54ddd 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh @@ -20,7 +20,9 @@ export CUDA_VISIBLE_DEVICES="0,1" # Run recipe functional tests on 2 GPUs # This script tests recipe configurations with their default settings to ensure # they can run basic training without crashes -uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_mbridge coverage combine -q -bash tests/functional_tests/ckpts/llama3_1b/test_llama32_1b_mcore.sh +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_core + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA
tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh index f0f5c3bbaa..c39b782f98 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh @@ -20,7 +20,9 @@ export CUDA_VISIBLE_DEVICES="0,1" # Run recipe functional tests on 2 GPUs # This script tests recipe configurations with their default settings to ensure # they can run basic training without crashes -uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mbridge coverage combine -q -bash tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mcore + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh 
b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh index 21c5083c2c..e392f76aad 100755 --- a/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh +++ b/tests/functional_tests/L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh @@ -20,7 +20,9 @@ export CUDA_VISIBLE_DEVICES="0,1" # Run recipe functional tests on 2 GPUs # This script tests recipe configurations with their default settings to ensure # they can run basic training without crashes -uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mbridge coverage combine -q -bash tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mcore + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py new file mode 100644 index 0000000000..ae5e11b83f --- /dev/null +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -0,0 +1,131 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for LLaMA checkpointing.""" + +import os +import pytest +import shutil +import sys + +from torch.distributed.run import main as torchrun_main + +from megatron.bridge.recipes.llama import llama32_1b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +BASE_DIR = "/workspace/test_ckpts/llama32_1b" +MBRIDGE_CKPT = f"{BASE_DIR}/mbridge" +MCORE_CKPT = f"{BASE_DIR}/mcore" +TB_DIR = f"{BASE_DIR}/tb" + + +class TestLlama32Ckpt: + """Test class for LLama checkpoint functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_llama32_1B_ckpt_mbridge(self): + """Functional test for LLama MBridge checkpoint.""" + + config = llama32_1b_pretrain_config() + + config.model.seq_length = 8192 + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 8 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = MBRIDGE_CKPT + + pretrain(config=config, forward_step_func=forward_step) + + @pytest.mark.run_only_on("GPU") + def test_llama32_1B_ckpt_core(self, monkeypatch): + """Functional test for LLama MCore checkpoint.""" + + # Set environment variables + monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") + monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") + + # Set MLM 
script + monkeypatch.setattr( + sys, + "argv", + [ + "torchrun", + "--nproc-per-node=2", + "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py", + "--load", "/workspace/test_ckpts/llama32_1b_mbridge", + "--save", "/workspace/test_ckpts/llama32_1b_mcore", + "--init-method-std", "0.014", + "--disable-bias-linear", + "--use-rope-scaling", + "--swiglu", + "--use-rotary-position-embeddings", + "--num-layers", "16", + "--hidden-size", "2048", + "--num-attention-heads", "32", + "--ffn-hidden-size", "8192", + "--kv-channels", "64", + "--group-query-attention", + "--position-embedding-type", "rope", + "--attention-backend", "fused", + "--num-query-groups", "8", + "--normalization", "RMSNorm", + "--attention-dropout", "0.0", + "--hidden-dropout", "0.0", + "--tensor-model-parallel-size", "1", + "--pipeline-model-parallel-size", "1", + "--seq-length", "8192", + "--max-position-embeddings", "8192", + "--micro-batch-size", "1", + "--global-batch-size", "8", + "--train-iters", "10", + "--mock-data", + "--tokenizer-type", "NullTokenizer", + "--vocab-size", "131072", + "--save-interval", "5", + "--eval-interval", "5", + "--eval-iters", "5", + "--load", MBRIDGE_CKPT, + "--save", MCORE_CKPT, + "--ckpt-format", "torch_dist", + "--log-progress", + "--bf16", + "--lr", "4.5e-4", + "--min-lr", "4.5e-5", + "--num-workers", "2", + "--tensorboard-dir", TB_DIR, + "--log-interval", "1", + "--log-throughput", + "--no-load-optim", + ], + ) + + # Run MLM script + torchrun_main() + + def test_remove_artifacts(self): + """Removes model artifacts""" + shutil.rmtree(BASE_DIR) + + assert not os.path.exists(BASE_DIR) + diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py deleted file mode 100644 index 199f083140..0000000000 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mbridge.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Functional smoke tests for LLaMA recipe configurations.""" - -import pytest -import subprocess - -from megatron.bridge.recipes.llama import llama32_1b_pretrain_config -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain - - -class TestLlama32MBridgeCkpt: - """Test class for LLaMA recipe functional tests.""" - - @pytest.mark.run_only_on("GPU") - def test_llama32_1B_ckpt_mbridge(self): - """Functional test for LLaMA recipes with appropriate parallelism configurations.""" - - config = llama32_1b_pretrain_config() - - config.model.seq_length = 8192 - - config.train.train_iters = 5 - config.train.eval_iters = 5 - config.train.save_interval = 5 - config.train.global_batch_size = 8 - config.train.micro_batch_size = 1 - - config.scheduler.lr_warmup_iters = 2 - - config.logger.log_interval = 1 - - config.checkpoint.save = "/workspace/test_ckpts/llama32_1b_mbridge" - - pretrain(config=config, forward_step_func=forward_step) - diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh deleted file mode 100755 index 16d686fd71..0000000000 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_mcore.sh +++ /dev/null @@ -1,49 +0,0 @@ -LOAD_DIR=/workspace/test_ckpts/llama32_1b_mbridge -SAVE_DIR=/workspace/test_ckpts/llama32_1b_mcore - -CUDA_VISIBLE_DEVICES=0,1 
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py \ - --init-method-std 0.014 \ - --disable-bias-linear \ - --use-rope-scaling \ - --swiglu \ - --use-rotary-position-embeddings \ - --num-layers 16 \ - --hidden-size 2048 \ - --num-attention-heads 32 \ - --ffn-hidden-size 8192 \ - --kv-channels 64 \ - --group-query-attention \ - --position-embedding-type rope \ - --attention-backend fused \ - --num-query-groups 8 \ - --normalization RMSNorm \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --seq-length 8192 \ - --max-position-embeddings 8192 \ - --micro-batch-size 1 \ - --global-batch-size 8 \ - --train-iters 10 \ - --mock-data \ - --tokenizer-type NullTokenizer \ - --vocab-size 131072 \ - --save-interval 5 \ - --eval-interval 5 \ - --eval-iters 4 \ - --load ${LOAD_DIR} \ - --save ${SAVE_DIR} \ - --ckpt-format torch_dist \ - --log-progress \ - --bf16 \ - --lr 4.5e-4 \ - --min-lr 4.5e-5 \ - --num-workers 2 \ - --tensorboard-dir /workspace/tb \ - --log-interval 1 \ - --log-throughput \ - --no-load-optim - -echo rm -rf ${LOAD_DIR} -echo rm -rf ${SAVE_DIR} diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py new file mode 100644 index 0000000000..3f4490e93d --- /dev/null +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py @@ -0,0 +1,136 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for LLaMA recipe configurations.""" + +import os +import pytest +import shutil +import sys + +from torch.distributed.run import main as torchrun_main + +from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +BASE_DIR = "/workspace/test_ckpts/nemotronh_4b" +MBRIDGE_CKPT = f"{BASE_DIR}/mbridge" +MCORE_CKPT = f"{BASE_DIR}/mcore" +TB_DIR = f"{BASE_DIR}/tb" + + +class TestNemotronhCkpt: + """Test class for Nempotron Hybrid checkpoint functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_nemotronh_4b_ckpt_mbridge(self): + """Functional test for Nemotron Hybrid MBridge checkpoint.""" + + config = nemotronh_4b_pretrain_config() + + config.model.num_layers = 26 + config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*" + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 4 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = MBRIDGE_CKPT + + pretrain(config=config, forward_step_func=forward_step) + + @pytest.mark.run_only_on("GPU") + def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): + """Functional test for Nemotron Hybrid MCore checkpoint.""" + + # Set environment variables + monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") + monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") + + # Set MLM script + 
monkeypatch.setattr( + sys, + "argv", + [ + "torchrun", + "--nproc_per_node=2", + "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py", + "--init-method-std", "0.014", + "--disable-bias-linear", + "--use-rope-scaling", + "--squared-relu", + "--qk-layernorm", + "--rotary-percent", "1.0", + "--rotary-base", "1000000", + "--use-rotary-position-embeddings", + "--hybrid-override-pattern", "M-M-M-M*-M-M-M-M*-M-M-M-M*", + "--spec", "megatron.core.models.mamba.mamba_layer_specs", "mamba_stack_spec", + "--num-layers", "26", + "--hidden-size", "3072", + "--num-attention-heads", "32", + "--mamba-num-heads", "112", + "--ffn-hidden-size", "12288", + "--kv-channels", "128", + "--group-query-attention", + "--position-embedding-type", "none", + "--attention-backend", "fused", + "--num-query-groups", "8", + "--normalization", "RMSNorm", + "--attention-dropout", "0.0", + "--hidden-dropout", "0.0", + "--tensor-model-parallel-size", "2", + "--pipeline-model-parallel-size", "1", + "--seq-length", "8192", + "--max-position-embeddings", "8192", + "--micro-batch-size", "1", + "--global-batch-size", "4", + "--train-iters", "10", + "--mock-data", + "--tokenizer-type", "NullTokenizer", + "--vocab-size", "151936", + "--save-interval", "5", + "--eval-interval", "5", + "--eval-iters", "4", + "--load", MBRIDGE_CKPT, + "--save", MCORE_CKPT, + "--ckpt-format", "torch_dist", + "--log-progress", + "--bf16", + "--lr", "4.5e-4", + "--min-lr", "4.5e-5", + "--num-workers", "2", + "--tensorboard-dir", TB_DIR, + "--log-interval", "1", + "--log-throughput", + "--no-load-optim", + "--no-load-rng", + ], + ) + + # Run MLM script + torchrun_main() + + def test_remove_artifacts(self): + """Removes model artifacts""" + shutil.rmtree(BASE_DIR) + + assert not os.path.exists(BASE_DIR) diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py deleted file mode 100644 index bfab5ab120..0000000000 --- 
a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mbridge.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Functional smoke tests for LLaMA recipe configurations.""" - -import pytest -import subprocess - -from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain - - -class TestNemotronhMBridgeCkpt: - """Test class for LLaMA recipe functional tests.""" - - @pytest.mark.run_only_on("GPU") - def test_nemotronh_4b_ckpt_mbridge(self): - """Functional test for LLaMA recipes with appropriate parallelism configurations.""" - - config = nemotronh_4b_pretrain_config() - - config.model.num_layers = 26 - config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*" - - config.train.train_iters = 5 - config.train.eval_iters = 5 - config.train.save_interval = 5 - config.train.global_batch_size = 4 - config.train.micro_batch_size = 1 - - config.scheduler.lr_warmup_iters = 2 - - config.logger.log_interval = 1 - - config.checkpoint.save = "/workspace/test_ckpts/nemotronh_4b_mbridge" - - pretrain(config=config, forward_step_func=forward_step) - diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh deleted file mode 100644 index 
cf3d80c4e5..0000000000 --- a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_mcore.sh +++ /dev/null @@ -1,56 +0,0 @@ -LOAD_DIR=/workspace/test_ckpts/nemotronh_4b_mbridge -SAVE_DIR=/workspace/test_ckpts/nemotronh_4b_mcore - -CUDA_VISIBLE_DEVICES=0,1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py \ - --init-method-std 0.014 \ - --disable-bias-linear \ - --use-rope-scaling \ - --squared-relu \ - --qk-layernorm \ - --rotary-percent 1.0 \ - --rotary-base 1000000 \ - --use-rotary-position-embeddings \ - --hybrid-override-pattern "M-M-M-M*-M-M-M-M*-M-M-M-M*" \ - --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ - --num-layers 26 \ - --hidden-size 3072 \ - --num-attention-heads 32 \ - --mamba-num-heads 112 \ - --ffn-hidden-size 12288 \ - --kv-channels 128 \ - --group-query-attention \ - --position-embedding-type none \ - --attention-backend fused \ - --num-query-groups 8 \ - --normalization RMSNorm \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --tensor-model-parallel-size 2 \ - --pipeline-model-parallel-size 1 \ - --seq-length 8192 \ - --max-position-embeddings 8192 \ - --micro-batch-size 1 \ - --global-batch-size 4 \ - --train-iters 10 \ - --mock-data \ - --tokenizer-type NullTokenizer \ - --vocab-size 151936 \ - --save-interval 5 \ - --eval-interval 5 \ - --eval-iters 4 \ - --load ${LOAD_DIR} \ - --save ${SAVE_DIR} \ - --ckpt-format torch_dist \ - --log-progress \ - --bf16 \ - --lr 4.5e-4 \ - --min-lr 4.5e-5 \ - --num-workers 2 \ - --tensorboard-dir /workspace/tb \ - --log-interval 1 \ - --log-throughput \ - --no-load-optim \ - --no-load-rng - -echo rm -rf ${LOAD_DIR} -echo rm -rf ${SAVE_DIR} diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py new file mode 100644 index 0000000000..00769a40dd --- /dev/null +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ 
-0,0 +1,132 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for Qwen checkpointing.""" + +import os +import pytest +import shutil +import sys + +from torch.distributed.run import main as torchrun_main + +from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +BASE_DIR = "/workspace/test_ckpts/qwen3_4b" +MBRIDGE_CKPT = f"{BASE_DIR}/mbridge" +MCORE_CKPT = f"{BASE_DIR}/mcore" +TB_DIR = f"{BASE_DIR}/tb" + + +class TestQwen3Ckpt: + """Test class for Qwen checkpoint functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_qwen3_4b_ckpt_mbridge(self): + """Functional test for Qwen MBridge checkpoint.""" + + config = qwen3_4b_pretrain_config() + + config.model.num_layers = 24 + + config.train.train_iters = 5 + config.train.eval_iters = 5 + config.train.save_interval = 5 + config.train.global_batch_size = 4 + config.train.micro_batch_size = 1 + + config.scheduler.lr_warmup_iters = 2 + + config.logger.log_interval = 1 + + config.checkpoint.save = MBRIDGE_CKPT + + pretrain(config=config, forward_step_func=forward_step) + + @pytest.mark.run_only_on("GPU") + def test_qwen3_4b_ckpt_mcore(self, monkeypatch): + """Functional test for Qwen MCore checkpoint.""" + + # Set environment variables + monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") + 
monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") + + # Set MLM script + monkeypatch.setattr( + sys, + "argv", + [ + "torchrun", + "--nproc_per_node=2", + "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py", + "--init-method-std", "0.014", + "--disable-bias-linear", + "--use-rope-scaling", + "--swiglu", + "--qk-layernorm", + "--rotary-percent", "1.0", + "--rotary-base", "1000000", + "--use-rotary-position-embeddings", + "--num-layers", "24", + "--hidden-size", "2560", + "--num-attention-heads", "32", + "--ffn-hidden-size", "9728", + "--kv-channels", "128", + "--group-query-attention", + "--position-embedding-type", "rope", + "--attention-backend", "fused", + "--num-query-groups", "8", + "--normalization", "RMSNorm", + "--attention-dropout", "0.0", + "--hidden-dropout", "0.0", + "--tensor-model-parallel-size", "2", + "--pipeline-model-parallel-size", "1", + "--seq-length", "4096", + "--max-position-embeddings", "4096", + "--micro-batch-size", "1", + "--global-batch-size", "4", + "--train-iters", "10", + "--mock-data", + "--tokenizer-type", "NullTokenizer", + "--vocab-size", "151936", + "--save-interval", "5", + "--eval-interval", "5", + "--eval-iters", "4", + "--load", MBRIDGE_CKPT, + "--save", MCORE_CKPT, + "--ckpt-format", "torch_dist", + "--log-progress", + "--bf16", + "--lr", "4.5e-4", + "--min-lr", "4.5e-5", + "--num-workers", "2", + "--tensorboard-dir", "/workspace/tb", + "--log-interval", "1", + "--log-throughput", + "--no-load-optim", + "--no-load-rng", + ], + ) + + # Run MLM script + torchrun_main() + + def test_remove_artifacts(self): + """Removes model artifacts""" + shutil.rmtree(BASE_DIR) + + assert not os.path.exists(BASE_DIR) \ No newline at end of file diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py deleted file mode 100644 index ac362ea2ed..0000000000 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mbridge.py +++ /dev/null @@ -1,47 
+0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Functional smoke tests for LLaMA recipe configurations.""" - -import pytest -import subprocess - -from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain - - -class TestQwen3MBridgeCkpt: - """Test class for LLaMA recipe functional tests.""" - - @pytest.mark.run_only_on("GPU") - def test_qwen3_4b_ckpt_mbridge(self): - """Functional test for LLaMA recipes with appropriate parallelism configurations.""" - - config = qwen3_4b_pretrain_config() - - config.train.train_iters = 5 - config.train.eval_iters = 5 - config.train.save_interval = 5 - config.train.global_batch_size = 4 - config.train.micro_batch_size = 1 - - config.scheduler.lr_warmup_iters = 2 - - config.logger.log_interval = 1 - - config.checkpoint.save = "/workspace/test_ckpts/qwen3_4b_mbridge" - - pretrain(config=config, forward_step_func=forward_step) - diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh deleted file mode 100755 index 9aa93b06da..0000000000 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_mcore.sh +++ /dev/null @@ -1,53 +0,0 @@ -LOAD_DIR=/workspace/test_ckpts/qwen3_4b_mbridge -SAVE_DIR=/workspace/test_ckpts/qwen3_4b_mcore - -CUDA_VISIBLE_DEVICES=0,1,2,3 
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 /opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py \ - --init-method-std 0.014 \ - --disable-bias-linear \ - --use-rope-scaling \ - --swiglu \ - --qk-layernorm \ - --rotary-percent 1.0 \ - --rotary-base 1000000 \ - --use-rotary-position-embeddings \ - --num-layers 36 \ - --hidden-size 2560 \ - --num-attention-heads 32 \ - --ffn-hidden-size 9728 \ - --kv-channels 128 \ - --group-query-attention \ - --position-embedding-type rope \ - --attention-backend fused \ - --num-query-groups 8 \ - --normalization RMSNorm \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --tensor-model-parallel-size 2 \ - --pipeline-model-parallel-size 1 \ - --seq-length 4096 \ - --max-position-embeddings 4096 \ - --micro-batch-size 1 \ - --global-batch-size 4 \ - --train-iters 10 \ - --mock-data \ - --tokenizer-type NullTokenizer \ - --vocab-size 151936 \ - --save-interval 5 \ - --eval-interval 5 \ - --eval-iters 4 \ - --load ${LOAD_DIR} \ - --save ${SAVE_DIR} \ - --ckpt-format torch_dist \ - --log-progress \ - --bf16 \ - --lr 4.5e-4 \ - --min-lr 4.5e-5 \ - --num-workers 2 \ - --tensorboard-dir /workspace/tb \ - --log-interval 1 \ - --log-throughput \ - --no-load-optim \ - --no-load-rng - -echo rm -rf ${LOAD_DIR} -echo rm -rf ${SAVE_DIR} diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 new file mode 100644 index 0000000000000000000000000000000000000000..47455eb593a7985968b0da0f007801cec45ae4e1 GIT binary patch literal 88 zcmeZZfPjCKJmzxFc-v^5n|aGoiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-QuJ5iC;%>_Ain?r literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 new file mode 100644 index 0000000000000000000000000000000000000000..16cd1a614553ab5d1bb4a8f48af99b95d42cad04 GIT binary patch 
literal 88 zcmeZZfPjCKJmzw$GtaQj&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{ZVJVPo007Lw9y|a5 literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 new file mode 100644 index 0000000000000000000000000000000000000000..94b1235eb75983635cd55b40b1b53304b49e69db GIT binary patch literal 88 zcmeZZfPjCKJmzv*eoL#%&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{ZIX>x_2>>B$AeaCE literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 new file mode 100644 index 0000000000000000000000000000000000000000..8c76604d9172bde22402d34062c209dbe5da6726 GIT binary patch literal 88 zcmeZZfPjCKJmzv{olLIF&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{Zxs>!^EdU}iAqM~e literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 b/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 new file mode 100644 index 0000000000000000000000000000000000000000..0cbe458547164260d95245eee077c7cd5a239512 GIT binary patch literal 88 zcmeZZfPjCKJmzwiU7aqGmwC%kiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-@?qCcQ2-pXAmso6 literal 0 HcmV?d00001 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 b/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 new file mode 100644 index 0000000000000000000000000000000000000000..71b6950ddc84bc0952de7b212ffadc31faae2c71 GIT binary patch literal 88 zcmeZZfPjCKJmzxh?Vll$mwC%kiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-qWL}E3jqEVADI9E literal 0 HcmV?d00001 From 591a58999911e9201edc065c48145fa15c7360ae Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 2 Mar 2026 09:45:11 -0800 Subject: [PATCH 05/11] remove extra files 
Signed-off-by: dimapihtar --- ${SAVE_DIR}/progress.txt | 1 - ...ts.out.tfevents.1772467438.017ddd03c11b.10202.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772467438.017ddd03c11b.10204.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772467705.017ddd03c11b.10985.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772467705.017ddd03c11b.10987.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772468322.017ddd03c11b.11735.0 | Bin 88 -> 0 bytes ...ts.out.tfevents.1772468322.017ddd03c11b.11737.0 | Bin 88 -> 0 bytes 7 files changed, 1 deletion(-) delete mode 100644 ${SAVE_DIR}/progress.txt delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 delete mode 100644 {BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 diff --git a/${SAVE_DIR}/progress.txt b/${SAVE_DIR}/progress.txt deleted file mode 100644 index 0f93fe87b6..0000000000 --- a/${SAVE_DIR}/progress.txt +++ /dev/null @@ -1 +0,0 @@ -2026-03-02 17:36:47 Job ID: # GPUs: 2 Starting job diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10202.0 deleted file mode 100644 index 47455eb593a7985968b0da0f007801cec45ae4e1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzxFc-v^5n|aGoiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-QuJ5iC;%>_Ain?r diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467438.017ddd03c11b.10204.0 deleted file mode 100644 index 16cd1a614553ab5d1bb4a8f48af99b95d42cad04..0000000000000000000000000000000000000000 GIT binary 
patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzw$GtaQj&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{ZVJVPo007Lw9y|a5 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10985.0 deleted file mode 100644 index 94b1235eb75983635cd55b40b1b53304b49e69db..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzv*eoL#%&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{ZIX>x_2>>B$AeaCE diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 b/{BASE_DIR}/tb/events.out.tfevents.1772467705.017ddd03c11b.10987.0 deleted file mode 100644 index 8c76604d9172bde22402d34062c209dbe5da6726..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzv{olLIF&AjC(#hX-=n3<>NT9%quVr3Mh$E8z}npd1(l$4)Xl%iK$ hnwy(gRH;{9lv$Emq?Za(6`z)wlNt{Zxs>!^EdU}iAqM~e diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 b/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11735.0 deleted file mode 100644 index 0cbe458547164260d95245eee077c7cd5a239512..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzwiU7aqGmwC%kiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-@?qCcQ2-pXAmso6 diff --git a/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 b/{BASE_DIR}/tb/events.out.tfevents.1772468322.017ddd03c11b.11737.0 deleted file mode 100644 index 71b6950ddc84bc0952de7b212ffadc31faae2c71..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmeZZfPjCKJmzxh?Vll$mwC%kiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-qWL}E3jqEVADI9E From eb927cb2b87ef6bc6412ceceba85505a737f9d14 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 2 Mar 2026 13:17:18 -0800 Subject: [PATCH 
06/11] add mlm to mbridge ckpt tests Signed-off-by: dimapihtar --- ..._Launch_ckpts_mlm_to_mbridge_llama32_1b.sh | 28 ++++++++++++++++++ ...aunch_ckpts_mlm_to_mbridge_nemotronh_4b.sh | 29 +++++++++++++++++++ ...L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh | 28 ++++++++++++++++++ .../ckpts/llama32_1b/test_llama32_1b_ckpt.py | 15 ++++++---- .../nemotronh_4b/test_nemotronh_4b_ckpt.py | 15 ++++++---- .../ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 17 +++++++---- 6 files changed, 116 insertions(+), 16 deletions(-) create mode 100755 tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh create mode 100755 tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh create mode 100755 tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh new file mode 100755 index 0000000000..bbf174c4ee --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_llama32_1b.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_core + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_llama32_1B_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py::TestLlama32Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh new file mode 100755 index 0000000000..04f71056e4 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mcore + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_nemotronh_4b_ckpt_mbridge +coverage combine -q + + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py::TestNemotronhCkpt::test_remove_artifacts diff --git a/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh new file mode 100755 index 0000000000..81e7a29ea2 --- /dev/null +++ b/tests/functional_tests/L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +export CUDA_VISIBLE_DEVICES="0,1" + +# Run recipe functional tests on 2 GPUs +# This script tests recipe configurations with their default settings to ensure +# they can run basic training without crashes +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mcore + +uv run python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_qwen3_4b_ckpt_mbridge +coverage combine -q + +pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py::TestQwen3Ckpt::test_remove_artifacts diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py index ae5e11b83f..792ff7ca37 100644 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -41,9 +41,13 @@ def test_llama32_1B_ckpt_mbridge(self): config = llama32_1b_pretrain_config() + config.checkpoint.save = MBRIDGE_CKPT + config.checkpoint.load = MCORE_CKPT if 
os.path.exists(MCORE_CKPT) else None + config.checkpoint.load_optim = False + config.model.seq_length = 8192 - config.train.train_iters = 5 + config.train.train_iters = 10 if config.checkpoint.load else 5 config.train.eval_iters = 5 config.train.save_interval = 5 config.train.global_batch_size = 8 @@ -53,14 +57,15 @@ def test_llama32_1B_ckpt_mbridge(self): config.logger.log_interval = 1 - config.checkpoint.save = MBRIDGE_CKPT - pretrain(config=config, forward_step_func=forward_step) @pytest.mark.run_only_on("GPU") def test_llama32_1B_ckpt_core(self, monkeypatch): """Functional test for LLama MCore checkpoint.""" + load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None + train_iters = 10 if load_dir else 5 + # Set environment variables monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") @@ -98,14 +103,14 @@ def test_llama32_1B_ckpt_core(self, monkeypatch): "--max-position-embeddings", "8192", "--micro-batch-size", "1", "--global-batch-size", "8", - "--train-iters", "10", "--mock-data", "--tokenizer-type", "NullTokenizer", "--vocab-size", "131072", + "--train-iters", f"{train_iters}", "--save-interval", "5", "--eval-interval", "5", "--eval-iters", "5", - "--load", MBRIDGE_CKPT, + "--load", load_dir, "--save", MCORE_CKPT, "--ckpt-format", "torch_dist", "--log-progress", diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py index 3f4490e93d..2d049b3adb 100644 --- a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py @@ -41,10 +41,14 @@ def test_nemotronh_4b_ckpt_mbridge(self): config = nemotronh_4b_pretrain_config() + config.checkpoint.save = MBRIDGE_CKPT + config.checkpoint.load = MCORE_CKPT if os.path.exists(MCORE_CKPT) else None + config.checkpoint.load_optim = False + config.model.num_layers = 26 
config.model.hybrid_override_pattern = "M-M-M-M*-M-M-M-M*-M-M-M-M*" - config.train.train_iters = 5 + config.train.train_iters = 10 if config.checkpoint.load else 5 config.train.eval_iters = 5 config.train.save_interval = 5 config.train.global_batch_size = 4 @@ -54,14 +58,15 @@ def test_nemotronh_4b_ckpt_mbridge(self): config.logger.log_interval = 1 - config.checkpoint.save = MBRIDGE_CKPT - pretrain(config=config, forward_step_func=forward_step) @pytest.mark.run_only_on("GPU") def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): """Functional test for Nemotron Hybrid MCore checkpoint.""" + load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None + train_iters = 10 if load_dir else 5 + # Set environment variables monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") @@ -103,14 +108,14 @@ def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): "--max-position-embeddings", "8192", "--micro-batch-size", "1", "--global-batch-size", "4", - "--train-iters", "10", "--mock-data", "--tokenizer-type", "NullTokenizer", "--vocab-size", "151936", + "--train-iters", f"{train_iters}", "--save-interval", "5", "--eval-interval", "5", "--eval-iters", "4", - "--load", MBRIDGE_CKPT, + "--load", load_dir, "--save", MCORE_CKPT, "--ckpt-format", "torch_dist", "--log-progress", diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py index 00769a40dd..5030d609da 100644 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ -40,10 +40,14 @@ def test_qwen3_4b_ckpt_mbridge(self): """Functional test for Qwen MBridge checkpoint.""" config = qwen3_4b_pretrain_config() - + + config.checkpoint.save = MBRIDGE_CKPT + config.checkpoint.load = MCORE_CKPT if os.path.exists(MCORE_CKPT) else None + config.checkpoint.load_optim = False + config.model.num_layers = 24 - config.train.train_iters = 
5 + config.train.train_iters = 10 if config.checkpoint.load else 5 config.train.eval_iters = 5 config.train.save_interval = 5 config.train.global_batch_size = 4 @@ -53,14 +57,15 @@ def test_qwen3_4b_ckpt_mbridge(self): config.logger.log_interval = 1 - config.checkpoint.save = MBRIDGE_CKPT - pretrain(config=config, forward_step_func=forward_step) @pytest.mark.run_only_on("GPU") def test_qwen3_4b_ckpt_mcore(self, monkeypatch): """Functional test for Qwen MCore checkpoint.""" + load_dir = MBRIDGE_CKPT if os.path.exists(MBRIDGE_CKPT) else None + train_iters = 10 if load_dir else 5 + # Set environment variables monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1") monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1") @@ -99,14 +104,14 @@ def test_qwen3_4b_ckpt_mcore(self, monkeypatch): "--max-position-embeddings", "4096", "--micro-batch-size", "1", "--global-batch-size", "4", - "--train-iters", "10", "--mock-data", "--tokenizer-type", "NullTokenizer", "--vocab-size", "151936", + "--train-iters", f"{train_iters}", "--save-interval", "5", "--eval-interval", "5", "--eval-iters", "4", - "--load", MBRIDGE_CKPT, + "--load", load_dir, "--save", MCORE_CKPT, "--ckpt-format", "torch_dist", "--log-progress", From 846de61db31bedbc0641b0b0fd410b9fbe095b95 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Mar 2026 06:15:39 -0800 Subject: [PATCH 07/11] fix code style Signed-off-by: dimapihtar --- .../ckpts/llama32_1b/test_llama32_1b_ckpt.py | 104 ++++++++++------ .../nemotronh_4b/test_nemotronh_4b_ckpt.py | 114 ++++++++++++------ .../ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 104 ++++++++++------ 3 files changed, 214 insertions(+), 108 deletions(-) diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py index 792ff7ca37..5a78ab6c0b 100644 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -15,10 +15,10 @@ 
"""Functional smoke tests for LLaMA checkpointing.""" import os -import pytest import shutil import sys +import pytest from torch.distributed.run import main as torchrun_main from megatron.bridge.recipes.llama import llama32_1b_pretrain_config @@ -78,48 +78,82 @@ def test_llama32_1B_ckpt_core(self, monkeypatch): "torchrun", "--nproc-per-node=2", "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py", - "--load", "/workspace/test_ckpts/llama32_1b_mbridge", - "--save", "/workspace/test_ckpts/llama32_1b_mcore", - "--init-method-std", "0.014", + "--load", + "/workspace/test_ckpts/llama32_1b_mbridge", + "--save", + "/workspace/test_ckpts/llama32_1b_mcore", + "--init-method-std", + "0.014", "--disable-bias-linear", "--use-rope-scaling", "--swiglu", "--use-rotary-position-embeddings", - "--num-layers", "16", - "--hidden-size", "2048", - "--num-attention-heads", "32", - "--ffn-hidden-size", "8192", - "--kv-channels", "64", + "--num-layers", + "16", + "--hidden-size", + "2048", + "--num-attention-heads", + "32", + "--ffn-hidden-size", + "8192", + "--kv-channels", + "64", "--group-query-attention", - "--position-embedding-type", "rope", - "--attention-backend", "fused", - "--num-query-groups", "8", - "--normalization", "RMSNorm", - "--attention-dropout", "0.0", - "--hidden-dropout", "0.0", - "--tensor-model-parallel-size", "1", - "--pipeline-model-parallel-size", "1", - "--seq-length", "8192", - "--max-position-embeddings", "8192", - "--micro-batch-size", "1", - "--global-batch-size", "8", + "--position-embedding-type", + "rope", + "--attention-backend", + "fused", + "--num-query-groups", + "8", + "--normalization", + "RMSNorm", + "--attention-dropout", + "0.0", + "--hidden-dropout", + "0.0", + "--tensor-model-parallel-size", + "1", + "--pipeline-model-parallel-size", + "1", + "--seq-length", + "8192", + "--max-position-embeddings", + "8192", + "--micro-batch-size", + "1", + "--global-batch-size", + "8", "--mock-data", - "--tokenizer-type", "NullTokenizer", - 
"--vocab-size", "131072", - "--train-iters", f"{train_iters}", - "--save-interval", "5", - "--eval-interval", "5", - "--eval-iters", "5", - "--load", load_dir, - "--save", MCORE_CKPT, - "--ckpt-format", "torch_dist", + "--tokenizer-type", + "NullTokenizer", + "--vocab-size", + "131072", + "--train-iters", + f"{train_iters}", + "--save-interval", + "5", + "--eval-interval", + "5", + "--eval-iters", + "5", + "--load", + load_dir, + "--save", + MCORE_CKPT, + "--ckpt-format", + "torch_dist", "--log-progress", "--bf16", - "--lr", "4.5e-4", - "--min-lr", "4.5e-5", - "--num-workers", "2", - "--tensorboard-dir", TB_DIR, - "--log-interval", "1", + "--lr", + "4.5e-4", + "--min-lr", + "4.5e-5", + "--num-workers", + "2", + "--tensorboard-dir", + TB_DIR, + "--log-interval", + "1", "--log-throughput", "--no-load-optim", ], diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py index 2d049b3adb..5a46e0e207 100644 --- a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py @@ -15,10 +15,10 @@ """Functional smoke tests for LLaMA recipe configurations.""" import os -import pytest import shutil import sys +import pytest from torch.distributed.run import main as torchrun_main from megatron.bridge.recipes.nemotronh import nemotronh_4b_pretrain_config @@ -79,52 +79,90 @@ def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): "torchrun", "--nproc_per_node=2", "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_mamba.py", - "--init-method-std", "0.014", + "--init-method-std", + "0.014", "--disable-bias-linear", "--use-rope-scaling", "--squared-relu", "--qk-layernorm", - "--rotary-percent", "1.0", - "--rotary-base", "1000000", + "--rotary-percent", + "1.0", + "--rotary-base", + "1000000", "--use-rotary-position-embeddings", - "--hybrid-override-pattern", "M-M-M-M*-M-M-M-M*-M-M-M-M*", - "--spec", 
"megatron.core.models.mamba.mamba_layer_specs", "mamba_stack_spec", - "--num-layers", "26", - "--hidden-size", "3072", - "--num-attention-heads", "32", - "--mamba-num-heads", "112", - "--ffn-hidden-size", "12288", - "--kv-channels", "128", + "--hybrid-override-pattern", + "M-M-M-M*-M-M-M-M*-M-M-M-M*", + "--spec", + "megatron.core.models.mamba.mamba_layer_specs", + "mamba_stack_spec", + "--num-layers", + "26", + "--hidden-size", + "3072", + "--num-attention-heads", + "32", + "--mamba-num-heads", + "112", + "--ffn-hidden-size", + "12288", + "--kv-channels", + "128", "--group-query-attention", - "--position-embedding-type", "none", - "--attention-backend", "fused", - "--num-query-groups", "8", - "--normalization", "RMSNorm", - "--attention-dropout", "0.0", - "--hidden-dropout", "0.0", - "--tensor-model-parallel-size", "2", - "--pipeline-model-parallel-size", "1", - "--seq-length", "8192", - "--max-position-embeddings", "8192", - "--micro-batch-size", "1", - "--global-batch-size", "4", + "--position-embedding-type", + "none", + "--attention-backend", + "fused", + "--num-query-groups", + "8", + "--normalization", + "RMSNorm", + "--attention-dropout", + "0.0", + "--hidden-dropout", + "0.0", + "--tensor-model-parallel-size", + "2", + "--pipeline-model-parallel-size", + "1", + "--seq-length", + "8192", + "--max-position-embeddings", + "8192", + "--micro-batch-size", + "1", + "--global-batch-size", + "4", "--mock-data", - "--tokenizer-type", "NullTokenizer", - "--vocab-size", "151936", - "--train-iters", f"{train_iters}", - "--save-interval", "5", - "--eval-interval", "5", - "--eval-iters", "4", - "--load", load_dir, - "--save", MCORE_CKPT, - "--ckpt-format", "torch_dist", + "--tokenizer-type", + "NullTokenizer", + "--vocab-size", + "151936", + "--train-iters", + f"{train_iters}", + "--save-interval", + "5", + "--eval-interval", + "5", + "--eval-iters", + "4", + "--load", + load_dir, + "--save", + MCORE_CKPT, + "--ckpt-format", + "torch_dist", "--log-progress", "--bf16", - 
"--lr", "4.5e-4", - "--min-lr", "4.5e-5", - "--num-workers", "2", - "--tensorboard-dir", TB_DIR, - "--log-interval", "1", + "--lr", + "4.5e-4", + "--min-lr", + "4.5e-5", + "--num-workers", + "2", + "--tensorboard-dir", + TB_DIR, + "--log-interval", + "1", "--log-throughput", "--no-load-optim", "--no-load-rng", diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py index 5030d609da..26282656ae 100644 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ -15,10 +15,10 @@ """Functional smoke tests for Qwen checkpointing.""" import os -import pytest import shutil import sys +import pytest from torch.distributed.run import main as torchrun_main from megatron.bridge.recipes.qwen import qwen3_4b_pretrain_config @@ -78,49 +78,83 @@ def test_qwen3_4b_ckpt_mcore(self, monkeypatch): "torchrun", "--nproc_per_node=2", "/opt/Megatron-Bridge/3rdparty/Megatron-LM/pretrain_gpt.py", - "--init-method-std", "0.014", + "--init-method-std", + "0.014", "--disable-bias-linear", "--use-rope-scaling", "--swiglu", "--qk-layernorm", - "--rotary-percent", "1.0", - "--rotary-base", "1000000", + "--rotary-percent", + "1.0", + "--rotary-base", + "1000000", "--use-rotary-position-embeddings", - "--num-layers", "24", - "--hidden-size", "2560", - "--num-attention-heads", "32", - "--ffn-hidden-size", "9728", - "--kv-channels", "128", + "--num-layers", + "24", + "--hidden-size", + "2560", + "--num-attention-heads", + "32", + "--ffn-hidden-size", + "9728", + "--kv-channels", + "128", "--group-query-attention", - "--position-embedding-type", "rope", - "--attention-backend", "fused", - "--num-query-groups", "8", - "--normalization", "RMSNorm", - "--attention-dropout", "0.0", - "--hidden-dropout", "0.0", - "--tensor-model-parallel-size", "2", - "--pipeline-model-parallel-size", "1", - "--seq-length", "4096", - "--max-position-embeddings", "4096", 
- "--micro-batch-size", "1", - "--global-batch-size", "4", + "--position-embedding-type", + "rope", + "--attention-backend", + "fused", + "--num-query-groups", + "8", + "--normalization", + "RMSNorm", + "--attention-dropout", + "0.0", + "--hidden-dropout", + "0.0", + "--tensor-model-parallel-size", + "2", + "--pipeline-model-parallel-size", + "1", + "--seq-length", + "4096", + "--max-position-embeddings", + "4096", + "--micro-batch-size", + "1", + "--global-batch-size", + "4", "--mock-data", - "--tokenizer-type", "NullTokenizer", - "--vocab-size", "151936", - "--train-iters", f"{train_iters}", - "--save-interval", "5", - "--eval-interval", "5", - "--eval-iters", "4", - "--load", load_dir, - "--save", MCORE_CKPT, - "--ckpt-format", "torch_dist", + "--tokenizer-type", + "NullTokenizer", + "--vocab-size", + "151936", + "--train-iters", + f"{train_iters}", + "--save-interval", + "5", + "--eval-interval", + "5", + "--eval-iters", + "4", + "--load", + load_dir, + "--save", + MCORE_CKPT, + "--ckpt-format", + "torch_dist", "--log-progress", "--bf16", - "--lr", "4.5e-4", - "--min-lr", "4.5e-5", - "--num-workers", "2", - "--tensorboard-dir", "/workspace/tb", - "--log-interval", "1", + "--lr", + "4.5e-4", + "--min-lr", + "4.5e-5", + "--num-workers", + "2", + "--tensorboard-dir", + "/workspace/tb", + "--log-interval", + "1", "--log-throughput", "--no-load-optim", "--no-load-rng", From 80dcef067d03914102cd096c0711a8932a11bb34 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Mar 2026 06:19:37 -0800 Subject: [PATCH 08/11] fix code style Signed-off-by: dimapihtar --- 3rdparty/Megatron-LM | 2 +- .../functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py | 1 - .../ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py | 4 ++-- tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM index 23dd639cf3..3d1a4ba71e 160000 --- a/3rdparty/Megatron-LM +++ 
b/3rdparty/Megatron-LM @@ -1 +1 @@ -Subproject commit 23dd639cf3de30f3b9d8d0fae71ee31180be9ddd +Subproject commit 3d1a4ba71ecc49f1a0c9480c90f819d2b00f9915 diff --git a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py index 5a78ab6c0b..2a1dd30bdf 100644 --- a/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py +++ b/tests/functional_tests/ckpts/llama32_1b/test_llama32_1b_ckpt.py @@ -167,4 +167,3 @@ def test_remove_artifacts(self): shutil.rmtree(BASE_DIR) assert not os.path.exists(BASE_DIR) - diff --git a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py index 5a46e0e207..17510074dd 100644 --- a/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py +++ b/tests/functional_tests/ckpts/nemotronh_4b/test_nemotronh_4b_ckpt.py @@ -59,7 +59,7 @@ def test_nemotronh_4b_ckpt_mbridge(self): config.logger.log_interval = 1 pretrain(config=config, forward_step_func=forward_step) - + @pytest.mark.run_only_on("GPU") def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): """Functional test for Nemotron Hybrid MCore checkpoint.""" @@ -149,7 +149,7 @@ def test_nemotronh_4b_ckpt_mcore(self, monkeypatch): load_dir, "--save", MCORE_CKPT, - "--ckpt-format", + "--ckpt-format", "torch_dist", "--log-progress", "--bf16", diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py index 26282656ae..9931c5bb05 100644 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ -168,4 +168,4 @@ def test_remove_artifacts(self): """Removes model artifacts""" shutil.rmtree(BASE_DIR) - assert not os.path.exists(BASE_DIR) \ No newline at end of file + assert not os.path.exists(BASE_DIR) From 038f18d762fb53ab5bba98f05c34b6611e9eaabf Mon Sep 17 00:00:00 2001 From: 
dimapihtar Date: Tue, 3 Mar 2026 06:24:55 -0800 Subject: [PATCH 09/11] revert mlm commit Signed-off-by: dimapihtar --- 3rdparty/Megatron-LM | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM index 3d1a4ba71e..23dd639cf3 160000 --- a/3rdparty/Megatron-LM +++ b/3rdparty/Megatron-LM @@ -1 +1 @@ -Subproject commit 3d1a4ba71ecc49f1a0c9480c90f819d2b00f9915 +Subproject commit 23dd639cf3de30f3b9d8d0fae71ee31180be9ddd From b96fda166394302a4ac2adef1e9d9afbb9b906fa Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Mar 2026 06:54:07 -0800 Subject: [PATCH 10/11] fix typi Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 6 +++--- tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ddccb0a5ff..c54a24163e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -380,9 +380,9 @@ jobs: # - script: L2_Launch_quantization_export - script: L2_Launch_recipes_llama_cuda_graphs - script: L2_Launch_utils - - script: L2_Launch_ckpts_mbridge_to_mlm_llama32_1b.sh - - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b.sh - - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b.sh + - script: L2_Launch_ckpts_mbridge_to_mlm_llama32_1b + - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b + - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b needs: [pre-flight, cicd-unit-tests] runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2 if: | diff --git a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py index 9931c5bb05..9990b59679 100644 --- a/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py +++ b/tests/functional_tests/ckpts/qwen3_4b/test_qwen3_4b_ckpt.py @@ -45,8 +45,6 @@ def test_qwen3_4b_ckpt_mbridge(self): config.checkpoint.load = MCORE_CKPT if 
os.path.exists(MCORE_CKPT) else None config.checkpoint.load_optim = False - config.model.num_layers = 24 - config.train.train_iters = 10 if config.checkpoint.load else 5 config.train.eval_iters = 5 config.train.save_interval = 5 @@ -90,7 +88,7 @@ def test_qwen3_4b_ckpt_mcore(self, monkeypatch): "1000000", "--use-rotary-position-embeddings", "--num-layers", - "24", + "36", "--hidden-size", "2560", "--num-attention-heads", From 83fd74aa748fbd268227f12895be94070f7faae8 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Mar 2026 09:02:58 -0800 Subject: [PATCH 11/11] run mlm to mbridge ckpt tests Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index c54a24163e..a312690372 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -381,8 +381,11 @@ jobs: - script: L2_Launch_recipes_llama_cuda_graphs - script: L2_Launch_utils - script: L2_Launch_ckpts_mbridge_to_mlm_llama32_1b + - script: L2_Launch_ckpts_mlm_to_mbridge_llama32_1b - script: L2_Launch_ckpts_mbridge_to_mlm_qwen3_4b + - script: L2_Launch_ckpts_mlm_to_mbridge_qwen3_4b - script: L2_Launch_ckpts_mbridge_to_mlm_nemotronh_4b + - script: L2_Launch_ckpts_mlm_to_mbridge_nemotronh_4b needs: [pre-flight, cicd-unit-tests] runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2 if: |