diff --git a/docs/models/vlm/README.md b/docs/models/vlm/README.md index 86ab3cb955..71f9d1dd48 100644 --- a/docs/models/vlm/README.md +++ b/docs/models/vlm/README.md @@ -9,6 +9,7 @@ Megatron Bridge supports the following VLM families: | Model | Documentation | Description | |-------|---------------|-------------| | **Gemma 3 VL** | [gemma3-vl.md](gemma3-vl.md) | Google Gemma 3 Vision Language model | +| **Ministral 3** | [ministral3.md](ministral3.md) | Ministral 3 Vision Language model | | **Nemotron Nano V2 VL** | [nemotron-nano-v2-vl.md](nemotron-nano-v2-vl.md) | NVIDIA Nemotron Nano V2 Vision Language model | | **Qwen2.5 VL** | [qwen2.5-vl.md](qwen2.5-vl.md) | Alibaba Cloud Qwen2.5 Vision Language model | | **Qwen3 VL** | [qwen3-vl.md](qwen3-vl.md) | Alibaba Cloud Qwen3 Vision Language model | diff --git a/examples/conversion/hf_to_megatron_generate_vlm.py b/examples/conversion/hf_to_megatron_generate_vlm.py index 71583afb88..5658b7dfef 100644 --- a/examples/conversion/hf_to_megatron_generate_vlm.py +++ b/examples/conversion/hf_to_megatron_generate_vlm.py @@ -115,7 +115,13 @@ def vlm_forward_step(data_iterator, model, **kwargs) -> torch.Tensor: def loss_func(x, **kwargs): return x - return model(**forward_args), loss_func + model_output = model(**forward_args) + if isinstance(model_output, tuple): + output_tensor, _ = model_output + else: + output_tensor = model_output + + return output_tensor, loss_func def load_image(image_path: str) -> Image.Image: diff --git a/examples/models/vlm/ministral3/README.md b/examples/models/vlm/ministral3/README.md new file mode 100644 index 0000000000..16b3b7dbe7 --- /dev/null +++ b/examples/models/vlm/ministral3/README.md @@ -0,0 +1,190 @@ +# Ministral 3 - Vision Language Model + +[Mistral AI's Ministral 3](https://huggingface.co/collections/mistralai/ministral-3) is a family of edge-optimized vision-language models designed for deployment across various hardware configurations. 
The Ministral 3 architecture combines a powerful language model with a vision encoder for multimodal understanding. + +Ministral 3 models support multimodal tasks including image captioning, visual question answering, OCR, and general vision-language understanding. Despite their compact size, these models deliver strong performance for on-device and edge deployment scenarios. + +Ministral family models are supported via the Bridge system with auto-detected configuration and weight mapping. + +```{important} +Please upgrade to `transformers` v5 and upgrade `mistral-common` in order to use the Ministral 3 models. +``` + +## Available Models + +### Vision-Language Models +- **Ministral 3 3B** (`mistralai/Ministral-3-3B-Base-2512`): 3.4B parameter vision-language model + - 26 layers, 3072 hidden size + - 32 attention heads, 8 query groups (GQA) + - Vision encoder: ~0.4B parameters + - Recommended: 1 node, 8 GPUs + +- **Ministral 3 8B** (`mistralai/Ministral-3-8B-Base-2512`): 8.4B parameter vision-language model + - 34 layers, 4096 hidden size + - 32 attention heads, 8 query groups (GQA) + - Vision encoder: ~0.4B parameters + - Recommended: 1 node, 8 GPUs + +- **Ministral 3 14B** (`mistralai/Ministral-3-14B-Base-2512`): ~14B parameter vision-language model + - 40 layers, 5120 hidden size + - 32 attention heads, 8 query groups (GQA) + - Vision encoder: ~0.4B parameters + - Recommended: 1 node, 8 GPUs + +All models support extended context lengths up to 256K tokens using YaRN RoPE scaling. 
+ +## Model Architecture Features + +Ministral 3 combines efficient language modeling with multimodal capabilities: + +**Language Model Features:** +- **YaRN RoPE Scaling**: Advanced RoPE scaling for extended context lengths (up to 256K tokens) +- **Grouped Query Attention (GQA)**: Memory-efficient attention mechanism with 8 query groups +- **SwiGLU Activation**: Gated linear units with SiLU activation for improved performance +- **RMSNorm**: Layer normalization without mean centering for faster computation +- **Llama 4 Attention Scaling**: Position-dependent attention scaling for improved long-context handling + +**Vision-Language Features:** +- **Vision Encoder**: Pre-trained vision encoder for robust visual understanding +- **Multimodal Projector**: Projects vision features to language model space +- **Flexible Image Handling**: Supports variable-resolution images and multiple images per conversation + +## Workspace Configuration + +All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. 
You can override it: + +```bash +export WORKSPACE=/your/custom/path +``` + +Directory structure: +- `${WORKSPACE}/models/` - Converted checkpoints +- `${WORKSPACE}/results/` - Training outputs and experiment results + +## Checkpoint Conversion + +### Import HF → Megatron +To import the HF VL model to your desired Megatron path: +```bash +python examples/conversion/convert_checkpoints.py import \ + --hf-model mistralai/Ministral-3-3B-Instruct-2512-BF16 \ + --megatron-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16 +``` + +### Export Megatron → HF +```bash +python examples/conversion/convert_checkpoints.py export \ + --hf-model mistralai/Ministral-3-3B-Instruct-2512-BF16 \ + --megatron-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16/iter_0000000 \ + --hf-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16-hf-export +``` + +## Inference + +### Run Inference on Converted Checkpoint + +```bash +python examples/conversion/hf_to_megatron_generate_vlm.py \ + --hf_model_path mistralai/Ministral-3-3B-Instruct-2512-BF16 \ + --megatron_model_path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16/iter_0000000 \ + --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \ + --prompt "Describe this image." \ + --max_new_tokens 100 +``` + +Note: +- `--megatron_model_path` is optional. If it is not specified, the script converts the model first and then runs the forward pass. +- `--image_path` can point to any image URL, for example: `--image_path="https://example.com/image.jpg"` + +See the [inference.sh](inference.sh) script for commands to: +- Run inference with Hugging Face checkpoints +- Run inference with imported Megatron checkpoints +- Run inference with exported Hugging Face checkpoints + +**Expected output:** +``` +... 
+Generation step 46 +Generation step 47 +Generation step 48 +Generation step 49 +======== GENERATED TEXT OUTPUT ======== +Image: https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png +Prompt: Describe this image. +Generated: [SYSTEM_PROMPT]You are Ministral-3-3B-Instruct-2512, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris. +You power an AI assistant called Le Chat. +Your knowledge base was last updated on 2023-10-01. +The current date is {today}. +... +[IMG_END]Describe this image.[/INST]The image presents a comparison table of technical specifications between two NVIDIA GPUs: the **H100 SXM** and the **H100 NVL**. + +### **FPU Performance (Floating-Point Operations Per Second)** +- **FP64**: + - H100 SXM: 34 teraFLOPS + - H100 NVL: 30 teraFLOPS +- **FP64 Tensor +======================================= +``` + +## Finetune Recipes + +- See: [bridge.recipes.ministral3](../../apidocs/bridge/bridge.recipes.ministral3.md) +- Available recipes: + - `ministral3_3b_finetune_config`: Finetuning for 3B VL model with PEFT support + - `ministral3_8b_finetune_config`: Finetuning for 8B VL model with PEFT support + - `ministral3_14b_finetune_config`: Finetuning for 14B VL model with PEFT support + +Before training, ensure the following environment variables are set: +1. `WORKSPACE`: (optional) base directory for checkpoints and results; defaults to `/workspace` +2. `HF_TOKEN`: to download models from HF Hub (if required) +3. `HF_HOME`: (optional) to avoid re-downloading models and datasets +4. `WANDB_API_KEY`: (optional) to enable W&B logging + +### Pretrain + +Pretraining is not verified for this model. + +### Supervised Fine-Tuning (SFT) + +See the [sft.sh](sft.sh) script for full-parameter fine-tuning with configurable tensor and pipeline parallelism. + +W&B report coming soon. + +### Parameter-Efficient Fine-Tuning (PEFT) with LoRA + +See the [peft.sh](peft.sh) script for LoRA fine-tuning with configurable tensor and pipeline parallelism. 
+ +W&B report coming soon. + +### Recommended Configurations + +| Model | Mode | TP | PP | Global Batch Size | Learning Rate | Hardware | +|-------|------|----|----|-------------------|---------------|----------| +| Ministral 3 3B | Full SFT | 1 | 1 | 32-64 | 5e-6 | 8 GPUs | +| Ministral 3 3B | LoRA/DoRA | 1 | 1 | 64-128 | 1e-4 | 8 GPUs | +| Ministral 3 8B | Full SFT | 2 | 1 | 32-64 | 5e-6 | 8 GPUs | +| Ministral 3 8B | LoRA/DoRA | 1 | 1 | 64-128 | 1e-4 | 8 GPUs | +| Ministral 3 14B | Full SFT | 4 | 1 | 16-32 | 5e-6 | 8 GPUs | +| Ministral 3 14B | LoRA/DoRA | 2 | 1 | 32-64 | 1e-4 | 8 GPUs | + +**Note:** LoRA/DoRA significantly reduces memory requirements, allowing for larger batch sizes and fewer GPUs. + +## Evaluation + +Coming soon. + +## Hugging Face Model Cards + +- Ministral 3 3B Base: https://huggingface.co/mistralai/Ministral-3-3B-Base-2512 +- Ministral 3 3B Instruct: https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512 +- Ministral 3 8B Base: https://huggingface.co/mistralai/Ministral-3-8B-Base-2512 +- Ministral 3 8B Instruct: https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512 +- Ministral 3 14B Base: https://huggingface.co/mistralai/Ministral-3-14B-Base-2512 +- Ministral 3 14B Instruct: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512 + +## Related Docs +- Related LLM: [Mistral](../llm/mistral.md) +- Recipe usage: [Recipe usage](../../recipe-usage.md) +- Customizing the training recipe configuration: [Configuration overview](../../training/config-container-overview.md) +- Training entry points: [Entry points](../../training/entry-points.md) + diff --git a/examples/models/vlm/ministral3/conversion.sh b/examples/models/vlm/ministral3/conversion.sh new file mode 100755 index 0000000000..296af05d3c --- /dev/null +++ b/examples/models/vlm/ministral3/conversion.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +# Import HF → Megatron +uv run python examples/conversion/convert_checkpoints.py import \ + --hf-model mistralai/Ministral-3-3B-Instruct-2512-BF16 \ + --megatron-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16 + +# Export Megatron → HF +uv run python examples/conversion/convert_checkpoints.py export \ + --hf-model mistralai/Ministral-3-3B-Instruct-2512-BF16 \ + --megatron-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16/iter_0000000 \ + --hf-path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16-hf-export + +# Round-trip validation +uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ + --hf-model-id mistralai/Ministral-3-3B-Instruct-2512-BF16 --tp 2 --pp 2 diff --git a/examples/models/vlm/ministral3/inference.sh b/examples/models/vlm/ministral3/inference.sh new file mode 100755 index 0000000000..98e20c2050 --- /dev/null +++ b/examples/models/vlm/ministral3/inference.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +# Inference with Hugging Face checkpoints +uv run python -m torch.distributed.run --nproc_per_node=4 examples/conversion/hf_to_megatron_generate_vlm.py \ + --hf_model_path mistralai/Ministral-3-3B-Instruct-2512-BF16 \ + --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \ + --prompt "Describe this image." \ + --max_new_tokens 100 \ + --tp 2 \ + --pp 2 + +# Inference with imported Megatron checkpoints +uv run python -m torch.distributed.run --nproc_per_node=4 examples/conversion/hf_to_megatron_generate_vlm.py \ + --hf_model_path mistralai/Ministral-3-3B-Instruct-2512-BF16 \ + --megatron_model_path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16/iter_0000000 \ + --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \ + --prompt "Describe this image." \ + --max_new_tokens 100 \ + --tp 2 \ + --pp 2 + +# Inference with exported HF checkpoints +uv run python -m torch.distributed.run --nproc_per_node=4 examples/conversion/hf_to_megatron_generate_vlm.py \ + --hf_model_path ${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16-hf-export \ + --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \ + --prompt "Describe this image." 
\ + --max_new_tokens 100 \ + --tp 2 \ + --pp 2 diff --git a/examples/models/vlm/ministral3/peft.sh b/examples/models/vlm/ministral3/peft.sh new file mode 100755 index 0000000000..0fb8e1b38e --- /dev/null +++ b/examples/models/vlm/ministral3/peft.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +# Common configurations +PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16 +MODEL_NAME=ministral3_3b +DATASET_NAME=cord_v2 +SEQ_LENGTH=4096 +TRAIN_ITERS=50 +GLOBAL_BATCH_SIZE=32 +MICRO_BATCH_SIZE=1 +EVAL_ITERS=10 +LR=0.0002 +MIN_LR=0.00002 +LR_WARMUP_ITERS=10 +LOG_INTERVAL=1 +WANDB_PROJECT=megatron-bridge-${DATASET_NAME} + +# TP/PP combinations: "TP,PP" +PARALLELISM_CONFIGS=("2,1" "1,2") + +for config in "${PARALLELISM_CONFIGS[@]}"; do + IFS=',' read -r TP PP <<< "$config" + + echo "Running LoRA finetuning with TP=$TP, PP=$PP" + uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe ${MODEL_NAME}_finetune_config \ + --step_func vlm_step \ + --peft_scheme lora \ + checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ + model.seq_length=$SEQ_LENGTH \ + train.train_iters=$TRAIN_ITERS \ + train.global_batch_size=$GLOBAL_BATCH_SIZE \ + train.micro_batch_size=$MICRO_BATCH_SIZE \ + train.eval_iters=$EVAL_ITERS \ + 
optimizer.lr=$LR \ + optimizer.min_lr=$MIN_LR \ + scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ + checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_tp${TP}_pp${PP} \ + logger.log_interval=$LOG_INTERVAL \ + logger.wandb_project=$WANDB_PROJECT \ + logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_tp${TP}_pp${PP} \ + dataset.maker_name=make_${DATASET_NAME}_dataset \ + dataset.seq_length=$SEQ_LENGTH \ + model.tensor_model_parallel_size=$TP \ + model.pipeline_model_parallel_size=$PP +done diff --git a/examples/models/vlm/ministral3/sft.sh b/examples/models/vlm/ministral3/sft.sh new file mode 100755 index 0000000000..193afaf10e --- /dev/null +++ b/examples/models/vlm/ministral3/sft.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +# Common configurations +PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Ministral-3-3B-Instruct-2512-BF16 +MODEL_NAME=ministral3_3b +DATASET_NAME=cord_v2 +SEQ_LENGTH=4096 +TRAIN_ITERS=50 +GLOBAL_BATCH_SIZE=32 +MICRO_BATCH_SIZE=1 +EVAL_ITERS=10 +LR=0.00005 +MIN_LR=0.000005 +LR_WARMUP_ITERS=10 +LOG_INTERVAL=1 +WANDB_PROJECT=megatron-bridge-${DATASET_NAME} + +# TP/PP combinations: "TP,PP" +PARALLELISM_CONFIGS=("2,1" "1,2") + +for config in "${PARALLELISM_CONFIGS[@]}"; do + IFS=',' read -r TP PP <<< "$config" + + echo "Running full finetuning with TP=$TP, PP=$PP" + uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe ${MODEL_NAME}_finetune_config \ + --step_func vlm_step \ + checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ + model.seq_length=$SEQ_LENGTH \ + train.train_iters=$TRAIN_ITERS \ + train.global_batch_size=$GLOBAL_BATCH_SIZE \ + train.micro_batch_size=$MICRO_BATCH_SIZE \ + train.eval_iters=$EVAL_ITERS \ + optimizer.lr=$LR \ + optimizer.min_lr=$MIN_LR \ + scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ + checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_sft_tp${TP}_pp${PP} \ + logger.log_interval=$LOG_INTERVAL \ + logger.wandb_project=$WANDB_PROJECT \ + logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_sft_tp${TP}_pp${PP} \ + dataset.maker_name=make_${DATASET_NAME}_dataset \ + dataset.seq_length=$SEQ_LENGTH \ + model.tensor_model_parallel_size=$TP \ + model.pipeline_model_parallel_size=$PP +done diff --git a/src/megatron/bridge/models/ministral3/modeling_ministral3.py b/src/megatron/bridge/models/ministral3/modeling_ministral3.py index e541059e7a..eafe361dd0 100644 --- a/src/megatron/bridge/models/ministral3/modeling_ministral3.py +++ b/src/megatron/bridge/models/ministral3/modeling_ministral3.py @@ -225,7 +225,9 @@ def forward( if pixel_values is not None: # Get image features using HF's method (monkey-patched) - 
image_features = self.get_image_features(pixel_values.to(inputs_embeds.dtype), image_sizes=image_sizes) + image_features = self.get_image_features( + pixel_values.to(inputs_embeds.dtype), image_sizes=image_sizes + ).pooler_output image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) # Replace image tokens in text embeddings with image features diff --git a/src/megatron/bridge/recipes/__init__.py b/src/megatron/bridge/recipes/__init__.py index 10618ae372..ba816ad650 100644 --- a/src/megatron/bridge/recipes/__init__.py +++ b/src/megatron/bridge/recipes/__init__.py @@ -24,6 +24,7 @@ from megatron.bridge.recipes.gpt import * from megatron.bridge.recipes.gpt_oss import * from megatron.bridge.recipes.llama import * +from megatron.bridge.recipes.ministral3 import * from megatron.bridge.recipes.moonlight import * from megatron.bridge.recipes.nemotronh import * from megatron.bridge.recipes.olmoe import *