<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Mistral3

## Overview

Building upon Mistral Small 3 (2501), Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and enhances long context capabilities up to 128k tokens without compromising text performance. With 24 billion parameters, this model achieves top-tier capabilities in both text and vision tasks.

It is ideal for:
- Fast-response conversational agents.
- Low-latency function calling.
- Subject matter experts via fine-tuning.
- Local inference for hobbyists and organizations handling sensitive data.
- Programming and math reasoning.
- Long document understanding.
- Visual understanding.

This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).

The original code can be found [here](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/pixtral.py) and [here](https://github.com/mistralai/mistral-common).

## Usage example

### Inference with Pipeline

Here is how you can use the `image-text-to-text` pipeline to perform inference with the `Mistral3` models in just a few lines of code:
```python
>>> from transformers import pipeline
>>> import torch

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {
...                 "type": "image",
...                 "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
...             },
...             {"type": "text", "text": "Describe this image."},
...         ],
...     },
... ]

>>> pipe = pipeline("image-text-to-text", model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", torch_dtype=torch.bfloat16)
>>> outputs = pipe(text=messages, max_new_tokens=50, return_full_text=False)
>>> outputs[0]["generated_text"]
'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
```

### Inference on a single image

This example demonstrates how to perform inference on a single image with the Mistral3 models using chat templates.

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> import torch

>>> torch_device = "cuda"
>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
...             {"type": "text", "text": "Describe this image"},
...         ],
...     }
... ]

>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

>>> generate_ids = model.generate(**inputs, max_new_tokens=20)
>>> decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)

>>> decoded_output
"The image depicts two cats lying on a pink blanket. The larger cat, which appears to be an"...
```

### Text-only generation

This example shows how to generate text using the Mistral3 model without providing any image input.

````python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> import torch

>>> torch_device = "cuda"
>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

>>> SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
>>> user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."

>>> messages = [
...     {"role": "system", "content": SYSTEM_PROMPT},
...     {"role": "user", "content": user_prompt},
... ]

>>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
>>> inputs = processor(text=text, return_tensors="pt").to(model.device)
>>> generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
>>> decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]

>>> print(decoded_output)
"1. À plus tard!
2. Salut, à plus!
3. À toute!
4. À la prochaine!
5. Je me casse, à plus!

```
 /\_/\
( o.o )
 > ^ <
```"
````

### Batched image and text inputs

Mistral3 models also support batched image and text inputs.

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> import torch

>>> torch_device = "cuda"
>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

>>> messages = [
...     [
...         {
...             "role": "user",
...             "content": [
...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
...                 {"type": "text", "text": "Write a haiku for this image"},
...             ],
...         },
...     ],
...     [
...         {
...             "role": "user",
...             "content": [
...                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
...                 {"type": "text", "text": "Describe this image"},
...             ],
...         },
...     ],
... ]

>>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

>>> output = model.generate(**inputs, max_new_tokens=25)

>>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
>>> decoded_outputs
["Write a haiku for this imageCalm waters reflect\nWhispers of the forest's breath\nPeace on wooden path",
 "Describe this imageThe image depicts a vibrant street scene in what appears to be a Chinatown district. The focal point is a traditional Chinese"]
```

### Batched multi-image input and quantization with BitsAndBytes

This implementation of the Mistral3 models supports batched text and image inputs with a different number of images per prompt.
This example also shows how to use `BitsAndBytes` to load the model in 4-bit quantization.

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
>>> import torch

>>> torch_device = "cuda"
>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
>>> quantization_config = BitsAndBytesConfig(load_in_4bit=True)
>>> model = AutoModelForImageTextToText.from_pretrained(
...     model_checkpoint, quantization_config=quantization_config
... )

>>> messages = [
...     [
...         {
...             "role": "user",
...             "content": [
...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
...                 {"type": "text", "text": "Write a haiku for this image"},
...             ],
...         },
...     ],
...     [
...         {
...             "role": "user",
...             "content": [
...                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
...                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
...                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
...             ],
...         },
...     ],
... ]

>>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

>>> output = model.generate(**inputs, max_new_tokens=25)

>>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
>>> decoded_outputs
["Write a haiku for this imageSure, here is a haiku inspired by the image:\n\nCalm lake's wooden path\nSilent forest stands guard\n", "These images depict two different landmarks. Can you identify them? Certainly! The images depict two iconic landmarks:\n\n1. The first image shows the Statue of Liberty in New York City."]
```
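
If you want more control over the numerical precision used by the 4-bit kernels, `BitsAndBytesConfig` also exposes a `bnb_4bit_compute_dtype` argument. The snippet below is a minimal, optional variant of the loading step above; it is not part of the original example and assumes `bitsandbytes` is installed:

```python
>>> from transformers import AutoModelForImageTextToText, BitsAndBytesConfig
>>> import torch

>>> # Optional: keep the weights in 4-bit but run the quantized matmuls in bfloat16
>>> quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
>>> model = AutoModelForImageTextToText.from_pretrained(
...     "mistralai/Mistral-Small-3.1-24B-Instruct-2503", quantization_config=quantization_config
... )
```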

## Mistral3Config

[[autodoc]] Mistral3Config

## Mistral3ForConditionalGeneration

[[autodoc]] Mistral3ForConditionalGeneration
    - forward