NVIDIA-NeMo · chtruong814 · Feb 4, 2026 · Jan 28, 2026 · Jan 28, 2026 · Jan 29, 2026
diff --git a/examples/models/vlm/ministral3/README.md b/examples/models/vlm/ministral3/README.md
@@ -0,0 +1,72 @@
+# Ministral 3 - Vision Language Model
+
+This directory contains examples for the Ministral 3 Vision Language Model, including checkpoint conversion, inference, and fine-tuning.
+
+## Workspace Configuration
+
+All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. You can override it:
+
+```bash
+export WORKSPACE=/your/custom/path
+```
+
+Directory structure:
+- `${WORKSPACE}/models/` - Converted checkpoints
+- `${WORKSPACE}/results/` - Training outputs and experiment results
+
+## Checkpoint Conversion
+
+See the [conversion.sh](conversion.sh) script for commands to:
+- Import Hugging Face checkpoints to Megatron format
+- Export Megatron checkpoints back to Hugging Face format
+- Run multi-GPU round-trip validation between formats
+
+
+## Inference
+
+See the [inference.sh](inference.sh) script for commands to:
+- Run inference with Hugging Face checkpoints
+- Run inference with imported Megatron checkpoints
+- Run inference with exported Hugging Face checkpoints
+
+**Expected output:**
+```
+...
+Generation step 46
+Generation step 47
+Generation step 48
+Generation step 49
+======== GENERATED TEXT OUTPUT ========
+Image: https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png
+Prompt: Describe this image.
+Generated: <bos><bos><start_of_turn>user
+...
+Describe this image.<end_of_turn>
+<start_of_turn>model
+Here's a description of the image you sent, breaking down the technical specifications of the H100 SXM and H100 NVL server cards:
+
+**Overall:**
+
+The image is a table comparing the technical specifications of two
+=======================================
+```
+
+## Pretrain
+
+Pretraining is not verified for this model.
+
+## Supervised Fine-Tuning (SFT)
+
+See the [sft.sh](sft.sh) script for full-parameter fine-tuning with configurable tensor and pipeline parallelism.
+
+[W&B Report](TODO)
+
+## Parameter-Efficient Fine-Tuning (PEFT)
+
+See the [peft.sh](peft.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism.
+
+[W&B Report](TODO)
+
+## Evaluation
+
+TBD
diff --git a/examples/models/vlm/ministral3/conversion.sh b/examples/models/vlm/ministral3/conversion.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Workspace directory for checkpoints and results
+WORKSPACE=${WORKSPACE:-/workspace}
+
+# Import HF → Megatron
+uv run python examples/conversion/convert_checkpoints.py import \
+    --hf-model mistralai/Ministral-3-3B-Instruct-2512-BF16 \
+    --megatron-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16
+
+# Export Megatron → HF
+uv run python examples/conversion/convert_checkpoints.py export \
+    --hf-model mistralai/Ministral-3-3B-Instruct-2512 \
+    --megatron-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16/iter_0000000 \
+    --hf-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16-hf-export \
+    --not-strict # To avoid "*.extra_state" warnings
+
+# Round-trip validation
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
+    --hf-model-id mistralai/Ministral-3-3B-Instruct-2512-BF16 --tp 2 --pp 2
diff --git a/examples/models/vlm/ministral3/inference.sh b/examples/models/vlm/ministral3/inference.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Workspace directory for checkpoints and results
+WORKSPACE=${WORKSPACE:-/workspace}
+
+# Inference with Hugging Face checkpoints
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/conversion/hf_to_megatron_generate_vlm.py \
+    --hf_model_path mistralai/Ministral-3-3B-Instruct-2512-BF16 \
+    --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \
+    --prompt "Describe this image." \
+    --max_new_tokens 100 \
+    --tp 2 \
+    --pp 2
+
+# Inference with imported Megatron checkpoints
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/conversion/hf_to_megatron_generate_vlm.py \
+    --hf_model_path mistralai/Ministral-3-3B-Instruct-2512-BF16 \
+    --megatron_model_path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16/iter_0000000 \
+    --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \
+    --prompt "Describe this image." \
+    --max_new_tokens 100 \
+    --tp 2 \
+    --pp 2
+
+# Inference with exported HF checkpoints
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/conversion/hf_to_megatron_generate_vlm.py \
+    --hf_model_path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16-hf-export \
+    --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \
+    --prompt "Describe this image." \
+    --max_new_tokens 100 \
+    --tp 2 \
+    --pp 2
diff --git a/examples/models/vlm/ministral3/peft.sh b/examples/models/vlm/ministral3/peft.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Workspace directory for checkpoints and results
+WORKSPACE=${WORKSPACE:-/workspace}
+
+# Common configurations
+PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16
+MODEL_NAME=ministral3_3b
+DATASET_NAME=cord_v2
+SEQ_LENGTH=4096
+TRAIN_ITERS=50
+GLOBAL_BATCH_SIZE=32
+MICRO_BATCH_SIZE=1
+EVAL_ITERS=10
+LR=0.0002
+MIN_LR=0.00002
+LR_WARMUP_ITERS=10
+LOG_INTERVAL=1
+WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
+
+# TP/PP combinations: "TP,PP"
+PARALLELISM_CONFIGS=("2,1" "1,2")
+
+for config in "${PARALLELISM_CONFIGS[@]}"; do
+    IFS=',' read -r TP PP <<< "$config"
+
+    echo "Running LoRA finetuning with TP=$TP, PP=$PP"
+    uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
+        --recipe ${MODEL_NAME}_finetune_config \
+        --step_func vlm_step \
+        --peft_scheme lora \
+        checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
+        model.seq_length=$SEQ_LENGTH \
+        train.train_iters=$TRAIN_ITERS \
+        train.global_batch_size=$GLOBAL_BATCH_SIZE \
+        train.micro_batch_size=$MICRO_BATCH_SIZE \
+        train.eval_iters=$EVAL_ITERS \
+        optimizer.lr=$LR \
+        optimizer.min_lr=$MIN_LR \
+        scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
+        checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_tp${TP}_pp${PP} \
+        logger.log_interval=$LOG_INTERVAL \
+        logger.wandb_project=$WANDB_PROJECT \
+        logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_tp${TP}_pp${PP} \
+        dataset.maker_name=make_${DATASET_NAME}_dataset \
+        dataset.seq_length=$SEQ_LENGTH \
+        model.tensor_model_parallel_size=$TP \
+        model.pipeline_model_parallel_size=$PP
+done
diff --git a/examples/models/vlm/ministral3/sft.sh b/examples/models/vlm/ministral3/sft.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Workspace directory for checkpoints and results
+WORKSPACE=${WORKSPACE:-/workspace}
+
+# Common configurations
+PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16
+MODEL_NAME=ministral3_3b
+DATASET_NAME=cord_v2
+SEQ_LENGTH=4096
+TRAIN_ITERS=50
+GLOBAL_BATCH_SIZE=32
+MICRO_BATCH_SIZE=1
+EVAL_ITERS=10
+LR=0.00005
+MIN_LR=0.000005
+LR_WARMUP_ITERS=10
+LOG_INTERVAL=1
+WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
+
+# TP/PP combinations: "TP,PP"
+PARALLELISM_CONFIGS=("2,1" "1,2")
+
+for config in "${PARALLELISM_CONFIGS[@]}"; do
+    IFS=',' read -r TP PP <<< "$config"
+
+    echo "Running full finetuning with TP=$TP, PP=$PP"
+    uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
+        --recipe ${MODEL_NAME}_finetune_config \
+        --step_func vlm_step \
+        checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
+        model.seq_length=$SEQ_LENGTH \
+        train.train_iters=$TRAIN_ITERS \
+        train.global_batch_size=$GLOBAL_BATCH_SIZE \
+        train.micro_batch_size=$MICRO_BATCH_SIZE \
+        train.eval_iters=$EVAL_ITERS \
+        optimizer.lr=$LR \
+        optimizer.min_lr=$MIN_LR \
+        scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
+        checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_tp${TP}_pp${PP} \
+        logger.log_interval=$LOG_INTERVAL \
+        logger.wandb_project=$WANDB_PROJECT \
+        logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_tp${TP}_pp${PP} \
+        dataset.maker_name=make_${DATASET_NAME}_dataset \
+        dataset.seq_length=$SEQ_LENGTH \
+        model.tensor_model_parallel_size=$TP \
+        model.pipeline_model_parallel_size=$PP
+done
diff --git a/src/megatron/bridge/models/ministral3/modeling_ministral3.py b/src/megatron/bridge/models/ministral3/modeling_ministral3.py
@@ -225,7 +225,9 @@ def forward(
 
             if pixel_values is not None:
                 # Get image features using HF's method (monkey-patched)
-                image_features = self.get_image_features(pixel_values.to(inputs_embeds.dtype), image_sizes=image_sizes)
+                image_features = self.get_image_features(
+                    pixel_values.to(inputs_embeds.dtype), image_sizes=image_sizes
+                ).pooler_output
                 image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
 
                 # Replace image tokens in text embeddings with image features

diff --git a/src/megatron/bridge/recipes/__init__.py b/src/megatron/bridge/recipes/__init__.py
@@ -24,6 +24,7 @@
 from megatron.bridge.recipes.gpt import *
 from megatron.bridge.recipes.gpt_oss import *
 from megatron.bridge.recipes.llama import *
+from megatron.bridge.recipes.ministral3 import *
 from megatron.bridge.recipes.moonlight import *
 from megatron.bridge.recipes.nemotronh import *
 from megatron.bridge.recipes.olmoe import *