NVIDIA-NeMo · ko3n1g · Feb 6, 2026 · Feb 5, 2026 · coderabbitai · Feb 5, 2026
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 name: Build, test, and publish a PyPi wheel (to testpypi).
 
 on:
@@ -35,55 +34,62 @@ concurrency:
 
 jobs:
   pre-flight:  # Calls the shared pre-flight reusable workflow; later jobs read its results through needs.pre-flight.outputs.
-    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.1  # template pin raised from v0.64.2
+    with:  # runner prefixes and test-data paths are taken from configuration variables (vars context)
+      default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
+      non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
+      default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
+      non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
+    secrets:  # passed explicitly to the called workflow
+      NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
 
-  # build-test-publish-wheel:
-  #   needs: [pre-flight]
-  #   if: |
-  #     !(needs.pre-flight.outputs.docs_only == 'true'
-  #     || needs.pre-flight.outputs.is_deployment_workflow == 'true')
-  #   uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.65.1
-  #   with:
-  #     dry-run: true
-  #     python-package: megatron.bridge
-  #     python-version: "3.10"
-  #     packaging: uv
-  #     no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
-  #     has-src-dir: true
-  #     skip-test-wheel: true
-  #     custom-container: nvcr.io/nvidia/pytorch:25.05-py3
-  #     runner: self-hosted-nemo
-  #     no-build-isolation: true
-  #     submodules: recursive
-  #     container-options: "--gpus all --runtime=nvidia"
-  #   secrets:
-  #     TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
-  #     TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
-  #     SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
-  #     SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
-  #     GH_TOKEN: ${{ secrets.PAT }}
+  build-test-publish-wheel:  # Builds the megatron.bridge wheel through the shared FW-CI reusable workflow.
+    needs: [pre-flight]  # the condition below skips this job for docs-only changes and deployment workflows
+    if: |
+      !(needs.pre-flight.outputs.docs_only == 'true'
+      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.70.1
+    with:
+      dry-run: true
+      python-package: megatron.bridge
+      python-version: "3.10"  # quoted so YAML does not read it as the float 3.1
+      packaging: uv
+      no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}  # true unless the ref is main or a branch starting with "r"
+      has-src-dir: true
+      skip-test-wheel: true
+      custom-container: nvcr.io/nvidia/pytorch:25.11-py3
+      runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2-container  # label built from the pre-flight runner_prefix output
+      no-build-isolation: true
+      submodules: recursive
+      container-options: "--gpus all --runtime=nvidia"
+    secrets:  # credentials and webhooks handed to the called workflow
+      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+      GH_TOKEN: ${{ secrets.PAT }}
 
-  # build-test-publish-wheel-summary:
-  #   needs: [pre-flight, build-test-publish-wheel]
-  #   if: |
-  #     (
-  #       needs.pre-flight.outputs.docs_only == 'true'
-  #       || needs.pre-flight.outputs.is_deployment_workflow == 'true'
-  #       || always()
-  #     )
-  #     && !cancelled()
-  #   runs-on: ubuntu-latest
-  #   steps:
-  #     - name: Result
-  #       run: |
-  #         FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
+  build-test-publish-wheel-summary:  # Folds the results of the preceding jobs into one pass/fail status.
+    needs: [pre-flight, build-test-publish-wheel]
+    if: |
+      (
+        needs.pre-flight.outputs.docs_only == 'true'
+        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
+        || always()
+      )
+      && !cancelled()
+    runs-on: ubuntu-latest  # only calls the gh CLI, so a GitHub-hosted runner is used
+    steps:
+      - name: Result  # NOTE(review): no env block is visible in this hunk; confirm GH_TOKEN and SKIPPING_IS_ALLOWED are provided to this step
+        run: |
+          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0  # assign the fallback instead of printing a stray 0
 
-  #         if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
-  #             echo "✅ All previous jobs completed successfully"
-  #             exit 0
-  #         else
-  #             echo "❌ Found $FAILED_JOBS failed job(s)"
-  #             # Show which jobs failed
-  #             gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
-  #             exit 1
-  #         fi
+          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
+              echo "✅ All previous jobs completed successfully"
+              exit 0
+          else
+              echo "❌ Found $FAILED_JOBS failed job(s)"
+              # Show which jobs failed
+              gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
+              exit 1
+          fi
@@ -81,9 +81,9 @@ dependencies = [
     "hydra-core>1.3,<=1.3.2",
     "megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
     "qwen-vl-utils",
-    "transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
+    "transformer-engine[pytorch,core_cu13]>=2.10.0a0,<2.13.0",
     "mamba-ssm",
-    "nvidia-resiliency-ext",
+    "nvidia-resiliency-ext~=0.4.1",
     "causal-conv1d",
     "flash-linear-attention",
     "timm",
@@ -108,21 +108,17 @@ no-build-isolation-package = [
 ]
 prerelease = "allow"
 override-dependencies = [
-    "nvidia-modelopt[torch]>=0.37.0",
     "torch; sys_platform == 'never'",
     "torchvision; sys_platform == 'never'",
     "triton; sys_platform == 'never'",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
+    "transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@6a34b6574fa6c29d9d07fdcddf9812cbb1488878",
+
 ]
 
-# uv.sources allows us to override dependencies with VCS commits.
-# Lets use this only for debugging purposes, but not for production (main).
 [tool.uv.sources]
-transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "6a34b6574fa6c29d9d07fdcddf9812cbb1488878" }
 megatron-core = { path = "3rdparty/Megatron-LM/" }
-nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "54f85fe422d296cf04ea524130014bd3a2c3add1" }
 nvidia-modelopt = { git = "https://github.com/NVIDIA/TensorRT-Model-Optimizer.git", rev = "0a4f0a8b933121f7af080261a0a5a7717f2c5d49" }
-# mamba-ssm = { git = "https://github.com/yfw/mamba", branch = "general_stride_fix" }
+nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "v0.4.1" } # Source install required to compile CUPTI for CUDA 13
 
 [project.optional-dependencies]
 recipes = [

diff --git a/src/megatron/bridge/training/mlm_compat/model.py b/src/megatron/bridge/training/mlm_compat/model.py
@@ -51,7 +51,7 @@ def _get_transformer_layer_spec(args: argparse.Namespace, use_te: bool, use_kitc
         use_kitchen: Whether to use kitchen extension
 
     Returns:
-        transformer_layer_spec: The transformer layer specification
+        ModuleSpec: The transformer layer specification
     """
     if use_te:
         return get_gpt_layer_with_transformer_engine_spec(