NVIDIA · danielkorzekwa · Mar 20, 2026 · Mar 4, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/.github/workflows/_example_tests_runner.yml b/.github/workflows/_example_tests_runner.yml
@@ -51,14 +51,15 @@ jobs:
           apt-get update && apt-get install -y git-lfs
           git lfs install --system
 
-          pip install ".${{ inputs.pip_install_extras }}"
+          # use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers
+          python -m pip install ".${{ inputs.pip_install_extras }}"
 
           if [[ "${{ inputs.example }}" == *"diffusers"* ]]; then
             echo "Uninstalling apex for diffusers: T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391"
-            pip uninstall -y apex || true
+            python -m pip uninstall -y apex || true
           fi
 
-          find examples/${{ inputs.example }} -name "requirements.txt" | while read req_file; do pip install -r "$req_file" || exit 1; done
+          find examples/${{ inputs.example }} -name "requirements.txt" | while read req_file; do python -m pip install -r "$req_file" || exit 1; done
       - name: Run tests
         run: |
           echo "Running tests for: ${{ inputs.example }}"

diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
@@ -56,8 +56,8 @@ jobs:
       match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
 
-  ##### TensorRT-LLM Example Tests #####
-  trtllm-pr:
+  ##### NeMo Example Tests #####
+  nemo-pr:
     needs: [check-file-changes, wait-checks]
     if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
     strategy:
@@ -67,7 +67,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
+      docker_image: "nvcr.io/nvidia/nemo:26.02"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,puzzletron,dev-test]"
       runner: linux-amd64-gpu-rtxpro6000-latest-2
@@ -76,13 +76,13 @@ jobs:
   example-pr-required-check:
     # Run even if example tests are skipped
     if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
-    needs: [check-file-changes, trtllm-pr]
+    needs: [check-file-changes, nemo-pr]
     runs-on: ubuntu-latest
     steps:
       - name: Required GPU tests did not succeed
         if: |
           needs.check-file-changes.result != 'success' ||
           (needs.check-file-changes.outputs.any_changed == 'true' && (
-            needs.trtllm-pr.result != 'success'
+            needs.nemo-pr.result != 'success'
           ))
         run: exit 1
diff --git a/examples/puzzletron/mbridge_distillation/README.md b/examples/puzzletron/mbridge_distillation/README.md
@@ -90,7 +90,7 @@ torchrun --nproc_per_node=8 examples/puzzletron/mbridge_distillation/distill_hf.
 
 - Add `--trust_remote_code` if student or teacher checkpoints need HuggingFace custom modeling code.
 - The distilled Megatron-Bridge checkpoint will be saved to `--output_dir/checkpoints/iter_<train_iters>`.
-- Add `--hf-export-path` to automatically export the final checkpoint to HuggingFace format after distillation. When using `--hf-export-path`, you must also provide `--hf-model` to specify the HuggingFace model ID to use as a template for export (e.g., `meta-llama/Llama-3.1-8B-Instruct`). The `--hf-model` should match the base architecture of the student model. The exported model can be evaluated for accuracy using the evaluation tools described in the main [README.md](../README.md#evaluation).
+- Add `--hf-export-path` (or `--hf_export_path`) to automatically export the final checkpoint to HuggingFace format after distillation. `--hf-model` / `--hf_model` (a required argument of the script) supplies the HuggingFace model ID used as the export template (e.g., `meta-llama/Llama-3.1-8B-Instruct`); it should match the base architecture of the student model. The exported model can be evaluated for accuracy using the evaluation tools described in the main [README.md](../README.md#evaluation).
 - For production use, use larger datasets like [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) and train for more iterations. See the [Megatron-Bridge distillation tutorial](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge#distillation) for best practices.
 
 ## MMLU Evaluation Results

diff --git a/examples/puzzletron/mbridge_distillation/distill_hf.py b/examples/puzzletron/mbridge_distillation/distill_hf.py
@@ -144,6 +144,7 @@ def get_args():
     parser.add_argument("--wandb_exp_name", type=str, help="Wandb experiment name (optional)")
     # Export arguments
     parser.add_argument(
+        "--hf_export_path",
         "--hf-export-path",
         type=str,
         default=None,
@@ -153,6 +154,7 @@ def get_args():
         ),
     )
     parser.add_argument(
+        "--hf_model",
         "--hf-model",
         type=str,
         required=True,
@@ -307,6 +309,7 @@ def _build_model_provider(hf_path):
                     train_iters=args.train_iters,
                     hf_export_path=args.hf_export_path,
                     hf_model=args.hf_model,
+                    trust_remote_code=args.trust_remote_code,
                 )
             except Exception as e:
                 print(f"⚠️  Export failed: {e}")

diff --git a/modelopt/torch/puzzletron/export/mbridge/__init__.py b/modelopt/torch/puzzletron/export/mbridge/__init__.py
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Megatron-Bridge adapters for Puzzletron AnyModel checkpoints.
+
+This module provides bridges for converting Puzzletron AnyModel checkpoints
+(heterogeneous layer architectures) to Megatron-Core format via Megatron-Bridge.
+"""
+
+# Import to register bridges (side effect)
+from modelopt.torch.puzzletron.export.mbridge.base import HeterogeneousBridgeMixin
+from modelopt.torch.puzzletron.export.mbridge.llama import (  # noqa: F401
+    PuzzletronLlamaAnyModelBridge,
+)
+from modelopt.torch.puzzletron.export.mbridge.qwen3 import (  # noqa: F401
+    PuzzletronQwen3AnyModelBridge,
+)
+
+__all__ = [
+    "HeterogeneousBridgeMixin",
+    "PuzzletronLlamaAnyModelBridge",
+    "PuzzletronQwen3AnyModelBridge",
+]
diff --git a/modelopt/torch/puzzletron/export/mbridge/base.py b/modelopt/torch/puzzletron/export/mbridge/base.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Mixin class for bridges that support heterogeneous layer architectures.
+
+This module provides a mixin class for converting models with block_configs
+(heterogeneous layer configurations) to Megatron-Core format via Megatron-Bridge.
+"""
+
+import dataclasses
+import json
+from collections.abc import Callable
+from dataclasses import dataclass, fields
+
+from megatron.bridge.models.gpt_provider import GPTModelProvider
+from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM
+from megatron.bridge.models.transformer_config import HeterogeneousTransformerConfig
+from megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specs import (
+    get_gpt_heterogeneous_layer_spec,
+)
+from megatron.core.transformer.spec_utils import ModuleSpec
+
+
+def heterogeneous_layer_spec(config) -> ModuleSpec:
+    """Return the GPT heterogeneous layer spec for `config`, built with `use_te=True`."""
+    return get_gpt_heterogeneous_layer_spec(config, use_te=True)
+
+
+@dataclass
+class GenericHeterogeneousProvider(GPTModelProvider, HeterogeneousTransformerConfig):
+    """Generic provider for AnyModel checkpoints with block_configs."""
+
+    # Heterogeneous configuration fields; the layer spec defaults to heterogeneous_layer_spec
+    heterogeneous_layers_config_path: str | None = None
+    heterogeneous_layers_config_encoded_json: str = ""
+    transformer_layer_spec: ModuleSpec | Callable = heterogeneous_layer_spec
+
+    def __getattr__(self, name: str):
+        """Handle missing attributes for OmegaConf compatibility.
+
+        Invoked only after normal lookup fails. For per_block_parameters (not set until
+        finalize()) it returns [] so OmegaConf can serialize/deserialize the config without
+        errors; call finalize() before real use. Other missing names raise AttributeError.
+        """
+        if name == "per_block_parameters":
+            # Defensive re-check: normal lookup has already failed here, so this yields []
+            try:
+                return object.__getattribute__(self, name)
+            except AttributeError:
+                return []
+        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
+
+class HeterogeneousBridgeMixin:
+    """Mixin for bridges supporting heterogeneous layer architectures (block_configs).
+
+    Must be used with multiple inheritance alongside a model-specific bridge.
+    Example: class PuzzletronLlamaAnyModelBridge(HeterogeneousBridgeMixin, LlamaBridge)
+    """
+
+    def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> GPTModelProvider:
+        """Convert HF AnyModel config to Megatron GPTModelProvider.
+
+        This method:
+        1. Calls the parent bridge's provider_bridge() to get a GPTModelProvider with all
+           model-specific settings (e.g., LlamaBridge sets normalization="RMSNorm", etc.)
+        2. Converts the provider to a dict and filters to only fields accepted by
+           GenericHeterogeneousProvider (which inherits from GPTModelProvider, so all valid
+           GPTModelProvider fields are preserved)
+        3. Adds heterogeneous configuration and returns GenericHeterogeneousProvider
+
+        Parent-bridge settings (e.g., from LlamaBridge) carry over only for fields that
+        GenericHeterogeneousProvider declares or inherits; other fields are dropped by the
+        filter below, and dataclasses.asdict() recurses into nested dataclass values.
+        """
+
+        parent_provider = super().provider_bridge(hf_pretrained)  # type: ignore[misc]
+
+        provider_kwargs = dataclasses.asdict(parent_provider)
+
+        # Keep only the fields GenericHeterogeneousProvider declares (its own plus those it
+        # inherits). Fields that exist only on a model-specific provider subclass (e.g.,
+        # MistralModelProvider, GPTOSSModelProvider) are dropped here, so bridges such as
+        # MistralBridge or GPTOSSBridge that rely on them (e.g., scale_factor,
+        # yarn_rotary_scaling_factor, moe_* parameters) may need a heterogeneous provider
+        # derived from the model-specific provider instead.
+        #
+        # NOTE(review): a parent-supplied transformer_layer_spec also passes this filter and
+        # would replace the heterogeneous_layer_spec default -- confirm that is intended
+        # (assumes the parent provider exposes that field).
+        valid_fields = {f.name for f in fields(GenericHeterogeneousProvider)}
+
+        # Drop every kwarg whose name is not a declared dataclass field
+        provider_kwargs = {k: v for k, v in provider_kwargs.items() if k in valid_fields}
+
+        provider_kwargs["heterogeneous_layers_config_encoded_json"] = (
+            self._build_heterogeneous_config_json(hf_pretrained.config)
+        )
+        return GenericHeterogeneousProvider(**provider_kwargs)
+
+    def _build_heterogeneous_config_json(self, hf_config) -> str:
+        """Return the HF config's block_configs, converted to MCore format, as a JSON string."""
+
+        hf_config_dict = json.loads(hf_config.to_json_string())
+
+        mcore_block_configs = [
+            self._convert_block_config(block) for block in hf_config_dict["block_configs"]
+        ]
+        return json.dumps({"block_configs": mcore_block_configs}, ensure_ascii=False)
+
+    def _convert_block_config(self, block: dict) -> dict:
+        """Convert one HF block config to MCore format, keeping only `attention` and `ffn`."""
+        return {
+            "attention": self._convert_attention_config(block["attention"]),
+            "ffn": self._convert_ffn_config(block["ffn"]),
+        }
+
+    def _convert_attention_config(self, attention_config: dict) -> dict:
+        """Copy the attention config, renaming `num_key_value_heads` to `num_query_groups`."""
+        attention_config = attention_config.copy()
+        attention_config["num_query_groups"] = attention_config.pop("num_key_value_heads")
+        return attention_config
+
+    def _convert_ffn_config(self, ffn_config: dict) -> dict:
+        """Copy the FFN/MLP config, renaming `intermediate_size` to `ffn_hidden_size`."""
+        ffn_config = ffn_config.copy()
+        ffn_config["ffn_hidden_size"] = ffn_config.pop("intermediate_size")
+        return ffn_config