Closed

34 commits
1e63871
chore: Update docs (#2178)
ko3n1g Feb 2, 2026
7f61c05
cp: `Dsv3 Recipe Update (2152)` into `r0.3.0` (#2186)
ko3n1g Feb 3, 2026
bf5ee44
cp: `Revert packed seq extra checks (2180)` into `r0.3.0` (#2196)
ko3n1g Feb 3, 2026
48a27fa
cp: `DSv3 EP=8 for B200, PP8-VP2 for B300 BF16, Lm3.1 405B TP4-CP1 GB…
ko3n1g Feb 3, 2026
77f5503
cp: `[docs] add MTP guide (2138)` into `r0.3.0` (#2202)
ko3n1g Feb 5, 2026
c22f858
cp: `add peft to recipe qwen3vl (2023)` into `r0.3.0` (#2220)
ko3n1g Feb 5, 2026
3cb4dee
cp: `[doc, model] feat: Add GLM-4.5V VL examples and update Gemma 3 V…
ko3n1g Feb 5, 2026
f91a086
cp: `[docs, model] Add Ministral 3 Examples (2139)` into `r0.3.0` (#2…
ko3n1g Feb 5, 2026
94af2ed
cp: `ci(fix): Wheel build (2192)` into `r0.3.0` (#2238)
ko3n1g Feb 6, 2026
98762f1
cp: `chore: Expose custom bash cmds (2237)` into `r0.3.0` (#2243)
ko3n1g Feb 6, 2026
37ba134
cp: `Fix Qwen2.5-VL huggingface conversion issue (#2107) (2156)` into…
ko3n1g Feb 6, 2026
ae58d30
cp: `fix: Use nargs for `custom_bash_cmds` (2261)` into `r0.3.0` (#2262)
ko3n1g Feb 6, 2026
b6661ea
cp: `gb300 lm3.1 495b nvfp4 fix (2258)` into `r0.3.0` (#2259)
ko3n1g Feb 6, 2026
241572b
cp: `Fix: perf script ddp nccl-ub (2158)` into `r0.3.0` (#2217)
ko3n1g Feb 6, 2026
d7a13b1
cp: `Update Qwen3 235B A22B MXFP8 GB200/300 recipe and resolve NaN gr…
ko3n1g Feb 6, 2026
78a5eba
cp: `b300 dsv3 bf16 hang fix (2260)` into `r0.3.0` (#2270)
ko3n1g Feb 7, 2026
98506a7
chore: Change submodule pointer for release (#2191)
ko3n1g Feb 7, 2026
8ae972e
cp: `feat: Add dataset compile helper (#2236)` (#2249)
ko3n1g Feb 7, 2026
843c2d7
Revert "cp: `Dsv3 Recipe Update (2152)` into `r0.3.0` (#2186)"
ko3n1g Feb 7, 2026
6d665f8
fix no submodule checkout
ko3n1g Feb 7, 2026
34aec47
Revert "cp: `Update Qwen3 235B A22B MXFP8 GB200/300 recipe and resolv…
ko3n1g Feb 7, 2026
861bbdd
Merge pull request #2271 from NVIDIA-NeMo/ko3n1g/fix/r030
ko3n1g Feb 7, 2026
f2fee27
Reapply "cp: `Update Qwen3 235B A22B MXFP8 GB200/300 recipe and resol…
ko3n1g Feb 8, 2026
595e767
Reapply "cp: `Dsv3 Recipe Update (2152)` into `r0.3.0` (#2186)"
ko3n1g Feb 8, 2026
a7a840d
Merge pull request #2273 from NVIDIA-NeMo/ko3n1g/chore/reapply-2152-a…
ko3n1g Feb 8, 2026
1db8398
cp: `dsv3_gb300_revert- BF16 & FP8-MX scale (#2277)` (#2286)
ko3n1g Feb 9, 2026
b39bd94
cp: mlflow upgrade (#2281)
ko3n1g Feb 9, 2026
be11e50
cp: `build: Address CVE-2025-68973` (#2290)
ko3n1g Feb 9, 2026
b10d7e3
cp: `docs: Update callback code snippets to include all imports neede…
ko3n1g Feb 10, 2026
669ad62
cp: `build: Bump modelopt and TE (2304)` into `r0.3.0` (#2314)
ko3n1g Feb 10, 2026
0a5db7e
cp: `Enabling TP Comm Overlap and Packed Sequencing Configs for LLAMA…
ko3n1g Feb 10, 2026
c9dcdb6
cp: `Updating Configs for LLAMA3 70B LoRa (2292)` into `r0.3.0` (#2311)
ko3n1g Feb 10, 2026
800e3ba
cp: `LLAMA3 70B: LoRa enabled in all modules instead of only LinearQK…
ko3n1g Feb 10, 2026
acc61ed
cp: `[training] fix: Add cu_seqlens_argmin to vlm packed sequence (22…
ko3n1g Feb 11, 2026
106 changes: 56 additions & 50 deletions .github/workflows/build-test-publish-wheel.yml
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Build, test, and publish a PyPi wheel (to testpypi).

on:
@@ -35,55 +34,62 @@ concurrency:

jobs:
pre-flight:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.1
with:
default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
secrets:
NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}

# build-test-publish-wheel:
# needs: [pre-flight]
# if: |
# !(needs.pre-flight.outputs.docs_only == 'true'
# || needs.pre-flight.outputs.is_deployment_workflow == 'true')
# uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.65.1
# with:
# dry-run: true
# python-package: megatron.bridge
# python-version: "3.10"
# packaging: uv
# no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
# has-src-dir: true
# skip-test-wheel: true
# custom-container: nvcr.io/nvidia/pytorch:25.05-py3
# runner: self-hosted-nemo
# no-build-isolation: true
# submodules: recursive
# container-options: "--gpus all --runtime=nvidia"
# secrets:
# TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
# TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
# SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
# SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
# GH_TOKEN: ${{ secrets.PAT }}
build-test-publish-wheel:
needs: [pre-flight]
if: |
!(needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true')
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.70.1
with:
dry-run: true
python-package: megatron.bridge
python-version: "3.10"
packaging: uv
no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
has-src-dir: true
skip-test-wheel: true
custom-container: nvcr.io/nvidia/pytorch:25.11-py3
runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2-container
no-build-isolation: true
submodules: recursive
container-options: "--gpus all --runtime=nvidia"
secrets:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
GH_TOKEN: ${{ secrets.PAT }}

# build-test-publish-wheel-summary:
# needs: [pre-flight, build-test-publish-wheel]
# if: |
# (
# needs.pre-flight.outputs.docs_only == 'true'
# || needs.pre-flight.outputs.is_deployment_workflow == 'true'
# || always()
# )
# && !cancelled()
# runs-on: ubuntu-latest
# steps:
# - name: Result
# run: |
# FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
build-test-publish-wheel-summary:
needs: [pre-flight, build-test-publish-wheel]
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| always()
)
&& !cancelled()
runs-on: ubuntu-latest
steps:
- name: Result
run: |
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0

# if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
# echo "✅ All previous jobs completed successfully"
# exit 0
# else
# echo "❌ Found $FAILED_JOBS failed job(s)"
# # Show which jobs failed
# gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
# exit 1
# fi
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
echo "✅ All previous jobs completed successfully"
exit 0
else
echo "❌ Found $FAILED_JOBS failed job(s)"
# Show which jobs failed
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
exit 1
fi
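An aside on the un-commented summary step above: the failed-job count comes from a single `jq` filter over the JSON that `gh run view --json jobs` returns. A minimal sketch of that filter, run against a stand-in payload (the sample JSON below is an assumption for illustration, not real CI output):

```shell
# Stand-in for `gh run view $GITHUB_RUN_ID --json jobs` (hypothetical sample payload).
PAYLOAD='{"jobs":[
  {"name":"pre-flight","status":"completed","conclusion":"success"},
  {"name":"build-test-publish-wheel","status":"completed","conclusion":"failure"},
  {"name":"build-test-publish-wheel-summary","status":"in_progress","conclusion":null}
]}'

# Same filter as the workflow: collect completed jobs whose conclusion is not
# "success", then take the array length. In-progress jobs are ignored.
FAILED_JOBS=$(echo "$PAYLOAD" | jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length')
echo "$FAILED_JOBS"   # → 1
```

Note the `|| echo 0` fallback in the workflow only guards the `gh` call failing; with a healthy payload the count itself decides between `exit 0` and `exit 1`.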
25 changes: 13 additions & 12 deletions .github/workflows/cicd-main.yml
@@ -25,32 +25,32 @@ on:
workflow_dispatch:
inputs:
mcore_commit:
description: 'MCore commit SHA to test against'
description: "MCore commit SHA to test against"
required: false
type: string
mcore_branch:
description: 'MCore branch name (for reference)'
description: "MCore branch name (for reference)"
required: false
type: string
mcore_repo:
description: 'MCore repository URL (for fetching from forks)'
description: "MCore repository URL (for fetching from forks)"
required: false
type: string
default: 'https://github.com/NVIDIA/Megatron-LM.git'
default: "https://github.com/NVIDIA/Megatron-LM.git"
test_suite:
description: 'Test suite to run'
description: "Test suite to run"
required: false
type: choice
options:
- 'all'
- 'unit-only'
- 'functional-only'
default: 'all'
- "all"
- "unit-only"
- "functional-only"
default: "all"
triggered_by:
description: 'Trigger source (for tracking)'
description: "Trigger source (for tracking)"
required: false
type: string
default: 'manual'
default: "manual"

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
@@ -393,8 +393,9 @@ jobs:
- script: L2_Launch_models_nemotron_vl
- script: L2_Launch_models_olmoe
- script: L2_Launch_models_qwen
- script: L2_Launch_models_qwen_quantization
# - script: L2_Launch_models_qwen_quantization
- script: L2_Launch_models_qwen_vl
- script: L2_Launch_recipes_gemma_vl
- script: L2_Launch_recipes_gpt_oss
- script: L2_Launch_recipes_llama_1b
- script: L2_Launch_recipes_llama_3b
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM
Submodule Megatron-LM updated 448 files
6 changes: 5 additions & 1 deletion docker/Dockerfile.ci
@@ -24,7 +24,11 @@ ENV UV_LINK_MODE=copy
ENV UV_VERSION="0.7.2"

RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages && \
# Address CVE-2025-68973
apt-get update && apt install -y --only-upgrade gnupg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

COPY pyproject.toml uv.lock /opt/Megatron-Bridge/
COPY src/megatron/bridge/__init__.py src/megatron/bridge/package_info.py /opt/Megatron-Bridge/src/megatron/bridge/
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -27,7 +27,7 @@
project = "Megatron Bridge"
copyright = "2025, NVIDIA Corporation"
author = "NVIDIA Corporation"
release = "latest"
release = "0.3.0"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Binary file added docs/images/mtp_loss.png
Binary file added docs/images/mtp_loss_comparison.png
1 change: 1 addition & 0 deletions docs/index.md
@@ -49,6 +49,7 @@ training/activation-recomputation.md
training/cpu-offloading.md
training/peft.md
training/packed-sequences.md
training/multi-token-prediction.md
training/distillation.md
training/callbacks.md
```
2 changes: 1 addition & 1 deletion docs/models/llm/gemma3.md
@@ -180,7 +180,7 @@ torchrun --nproc-per-node=8 run/run_recipe.py \
- Gemma 3 1B: https://huggingface.co/google/gemma-3-1b-it

## Related Docs
- Gemma3 Vision-Language Models: [Gemma 3 VL](../vlm/gemma3-vl.md)
- Gemma3 Vision-Language Models: [Gemma 3 VL](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/vlm/gemma3_vl/README.md)
- Recipe usage: [Recipe usage](../../recipe-usage.md)
- Customizing the training recipe configuration: [Configuration overview](../../training/config-container-overview.md)
- Training entry points: [Entry points](../../training/entry-points.md)
1 change: 1 addition & 0 deletions docs/models/vlm/README.md
@@ -9,6 +9,7 @@ Megatron Bridge supports the following VLM families:
| Model | Documentation | Description |
|-------|---------------|-------------|
| **Gemma 3 VL** | [gemma3-vl.md](gemma3-vl.md) | Google Gemma 3 Vision Language model |
| **Ministral 3** | [ministral3.md](ministral3.md) | Ministral 3 Vision Language model |
| **Nemotron Nano V2 VL** | [nemotron-nano-v2-vl.md](nemotron-nano-v2-vl.md) | NVIDIA Nemotron Nano V2 Vision Language model |
| **Qwen2.5 VL** | [qwen2.5-vl.md](qwen2.5-vl.md) | Alibaba Cloud Qwen2.5 Vision Language model |
| **Qwen3 VL** | [qwen3-vl.md](qwen3-vl.md) | Alibaba Cloud Qwen3 Vision Language model |
159 changes: 2 additions & 157 deletions docs/models/vlm/gemma3-vl.md
@@ -44,163 +44,9 @@ Gemma 3 VL builds on the Gemma 3 architecture with additional multimodal capabilities:
- **Multimodal Integration**: Seamless integration of visual and textual information through learned projection layers
- **Flexible Image Handling**: Supports variable resolution images and multiple images per conversation

## Conversion with 🤗 Hugging Face

### Import HF → Megatron
To import the HF VL model to your desired Megatron path:
```bash
python examples/conversion/convert_checkpoints.py import \
--hf-model google/gemma-3-4b-it \
--megatron-path /models/gemma-3-4b-it
```

### Export Megatron → HF
```bash
python examples/conversion/convert_checkpoints.py export \
--hf-model google/gemma-3-4b-it \
--megatron-path /results/gemma3_vl_4b/checkpoints/iter_00001000 \
--hf-path ./gemma3-vl-hf-export
```

### Run Inference on Converted Checkpoint

```bash
python examples/conversion/hf_to_megatron_generate_vlm.py \
--hf_model_path google/gemma-3-4b-it \
--megatron_model_path /models/gemma-3-4b-it \
--image_path <example image path> \
--prompt "Describe this image." \
--max_new_tokens 100
```

Note:
- `--megatron_model_path` is optional. If not specified, the script will convert the model and then run forward.
- You can also use image URLs: `--image_path="https://example.com/image.jpg"`

## Finetune Recipes

- See: [bridge.recipes.gemma3_vl](../../apidocs/bridge/bridge.recipes.gemma3_vl.md)
- Available recipes:
- `gemma3_vl_4b_finetune_config`: Finetuning for 4B VL model with PEFT support
- `gemma3_vl_12b_finetune_config`: Finetuning for 12B VL model with PEFT support
- `gemma3_vl_27b_finetune_config`: Finetuning for 27B VL model with PEFT support

Before training, ensure the following environment variables are set:
1. `SAVE_DIR`: checkpoint and log saving directory
2. `HF_TOKEN`: to download models from HF Hub (if required)
3. `HF_HOME`: (optional) to avoid re-downloading models and datasets
4. `WANDB_API_KEY`: (optional) to enable WandB logging

### Full Finetuning

```bash
torchrun --nproc-per-node=8 run/run_vlm_recipe.py \
--pretrained-checkpoint /models/gemma-3-4b-it \
--recipe gemma3_vl_4b_finetune_config \
--dataset-type hf \
dataset.maker_name=make_cord_v2_dataset \
train.global_batch_size=64 \
train.train_iters=1000 \
checkpoint.save=$SAVE_DIR/gemma3_vl_4b_finetune
```

Or programmatically:
```python
from megatron.bridge.recipes.gemma3_vl import gemma3_vl_4b_finetune_config

# Full finetuning
config = gemma3_vl_4b_finetune_config(
name="gemma3_vl_4b_full_finetune",
pretrained_checkpoint="/models/gemma-3-4b-it",
dataset_type="hf",
peft=None,
train_iters=1000,
global_batch_size=64,
)
```

### Parameter-Efficient Finetuning (PEFT) with LoRA

```bash
torchrun --nproc-per-node=8 run/run_vlm_recipe.py \
--pretrained-checkpoint /models/gemma-3-4b-it \
--recipe gemma3_vl_4b_finetune_config \
--peft_scheme lora \
--dataset-type hf \
dataset.maker_name=make_cord_v2_dataset \
train.global_batch_size=128 \
checkpoint.save=$SAVE_DIR/gemma3_vl_4b_lora
```

PEFT options:
- `--peft_scheme`: Set to `lora` for LoRA or `dora` for DoRA. Omit for full finetuning.

You can also combine PEFT with freeze options:
- `model.freeze_language_model=True`: Freeze the language model
- `model.freeze_vision_model=True`: Freeze the vision encoder
- `model.freeze_vision_projection=True`: Freeze the vision projection layer

Example with freeze options:
```bash
torchrun --nproc-per-node=8 run/run_vlm_recipe.py \
--pretrained-checkpoint /models/gemma-3-4b-it \
--recipe gemma3_vl_4b_finetune_config \
--peft_scheme lora \
model.freeze_language_model=True \
model.freeze_vision_model=False \
checkpoint.save=$SAVE_DIR/gemma3_vl_4b_lora_vision
```

Programmatic configuration:
```python
from megatron.bridge.recipes.gemma3_vl import gemma3_vl_4b_finetune_config

# LoRA finetuning
config = gemma3_vl_4b_finetune_config(
name="gemma3_vl_4b_lora_finetune",
pretrained_checkpoint="/models/gemma-3-4b-it",
dataset_type="hf",
peft="lora", # or "dora"
train_iters=1000,
global_batch_size=128,
)

# LoRA with vision model frozen
config = gemma3_vl_4b_finetune_config(
name="gemma3_vl_4b_lora_language_only",
pretrained_checkpoint="/models/gemma-3-4b-it",
peft="lora",
freeze_vision_model=True,
freeze_vision_projection=True,
)
```

### Recommended Configurations

| Model | Mode | TP | PP | Global Batch Size | Learning Rate | Hardware |
|-------|------|----|----|-------------------|---------------|----------|
| Gemma 3 VL 4B | Full SFT | 1 | 1 | 32-64 | 5e-6 | 8 GPUs |
| Gemma 3 VL 4B | LoRA/DoRA | 1 | 1 | 64-128 | 1e-4 | 8 GPUs |
| Gemma 3 VL 12B | Full SFT | 4 | 1 | 32-64 | 5e-6 | 8 GPUs |
| Gemma 3 VL 12B | LoRA/DoRA | 1 | 1 | 64-128 | 1e-4 | 8 GPUs |
| Gemma 3 VL 27B | Full SFT | 8 | 2 | 16-32 | 5e-6 | 16 GPUs |
| Gemma 3 VL 27B | LoRA/DoRA | 4 | 1 | 32-64 | 1e-4 | 16 GPUs |

**Note:** LoRA/DoRA significantly reduces memory requirements, allowing for larger batch sizes and fewer GPUs.

## Example Datasets

| Dataset | Maker Name | Description |
|---------|------------|-------------|
| [cord-v2](https://huggingface.co/datasets/naver-clova-ix/cord-v2) | `make_cord_v2_dataset` | OCR receipts: Single-image-text dataset for receipt understanding |
| [MedPix-VQA](https://huggingface.co/datasets/mmoukouba/MedPix-VQA) | `make_medpix_dataset` | Medical VQA: Single-image Q&A for clinical images |
| [The Cauldron (Raven subset)](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron) | `make_raven_dataset` | Visual reasoning: Multi-image analogical reasoning |

To change the dataset, specify `dataset.maker_name=<maker_name>` in your command.

## Examples
- Checkpoint import/export: [examples/conversion/convert_checkpoints.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py)
- Generate with VLM (HF→Megatron): [examples/conversion/hf_to_megatron_generate_vlm.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/hf_to_megatron_generate_vlm.py)

For checkpoint conversion, inference, finetuning recipes, and step-by-step training guides, see the [Gemma 3 VL Examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/vlm/gemma3_vl/README.md).

## Hugging Face Model Cards

@@ -213,4 +59,3 @@ To change the dataset, specify `dataset.maker_name=<maker_name>` in your command
- Recipe usage: [Recipe usage](../../recipe-usage.md)
- Customizing the training recipe configuration: [Configuration overview](../../training/config-container-overview.md)
- Training entry points: [Entry points](../../training/entry-points.md)
