huggingface · ydshieh · Aug 19, 2025 · Sep 30, 2024 · Sep 30, 2024 · Oct 2, 2024
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -1039,6 +1039,8 @@
         title: Janus
       - local: model_doc/kosmos-2
         title: KOSMOS-2
+      - local: model_doc/kosmos2_5
+        title: KOSMOS-2.5
       - local: model_doc/layoutlm
         title: LayoutLM
       - local: model_doc/layoutlmv2

diff --git a/docs/source/en/model_doc/kosmos2_5.md b/docs/source/en/model_doc/kosmos2_5.md
@@ -0,0 +1,196 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+
+# KOSMOS-2.5
+
+The Kosmos-2.5 model was proposed in [KOSMOS-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419/) by Microsoft.
+
+The abstract from the paper is the following:
+
+*We present Kosmos-2.5, a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos2_5_ocr.png"
+alt="drawing" width="600"/>
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos2_5_md.png"
+alt="drawing" width="600"/>
+
+<small> Overview of tasks that KOSMOS-2.5 can handle. Taken from the <a href="https://arxiv.org/abs/2309.11419">original paper</a>. </small>
+
+The examples below demonstrates how to generate with [`AutoModel`], for both Markdown and OCR tasks. 
+
+<hfoptions id="usage">
+<hfoption id="AutoModel - Markdown Task">
+
+```py
+import re
+import torch
+import requests
+from PIL import Image, ImageDraw
+from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
+
+repo = "ydshieh/kosmos-2.5"
+device = "cuda:0"
+dtype = torch.bfloat16
+model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
+processor = AutoProcessor.from_pretrained(repo)
+
+# sample image
+url = "https://huggingface.co/ydshieh/kosmos-2.5/resolve/main/receipt_00008.png"
+image = Image.open(requests.get(url, stream=True).raw)
+
+prompt = "<md>"
+inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+height, width = inputs.pop("height"), inputs.pop("width")
+raw_width, raw_height = image.size
+scale_height = raw_height / height
+scale_width = raw_width / width
+
+inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
+inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
+generated_ids = model.generate(
+    **inputs,
+    max_new_tokens=1024,
+)
+
+generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+print(generated_text[0])
+```
+
+</hfoption>
+<hfoption id="AutoModel - OCR Task">
+
+```py
+import re
+import torch
+import requests
+from PIL import Image, ImageDraw
+from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
+
+repo = "ydshieh/kosmos-2.5"
+device = "cuda:0"
+dtype = torch.bfloat16
+model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
+processor = AutoProcessor.from_pretrained(repo)
+
+# sample image
+url = "https://huggingface.co/ydshieh/kosmos-2.5/resolve/main/receipt_00008.png"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# bs = 1
+prompt = "<ocr>"
+inputs = processor(text=prompt, images=image, return_tensors="pt")
+height, width = inputs.pop("height"), inputs.pop("width")
+raw_width, raw_height = image.size
+scale_height = raw_height / height
+scale_width = raw_width / width
+
+# bs > 1, batch generation
+# inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
+# height, width = inputs.pop("height"), inputs.pop("width")
+# raw_width, raw_height = image.size
+# scale_height = raw_height / height[0]
+# scale_width = raw_width / width[0]
+
+inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
+inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
+generated_ids = model.generate(
+    **inputs,
+    max_new_tokens=1024,
+)
+
+generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+def post_process(y, scale_height, scale_width):
+    y = y.replace(prompt, "")
+    if "<md>" in prompt:
+        return y
+    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
+    bboxs_raw = re.findall(pattern, y)
+    lines = re.split(pattern, y)[1:]
+    bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
+    bboxs = [[int(j) for j in i] for i in bboxs]
+    info = ""
+    for i in range(len(lines)):
+        box = bboxs[i]
+        x0, y0, x1, y1 = box
+        if not (x0 >= x1 or y0 >= y1):
+            x0 = int(x0 * scale_width)
+            y0 = int(y0 * scale_height)
+            x1 = int(x1 * scale_width)
+            y1 = int(y1 * scale_height)
+            info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
+    return info
+
+output_text = post_process(generated_text[0], scale_height, scale_width)
+print(output_text)
+
+draw = ImageDraw.Draw(image)
+lines = output_text.split("\n")
+for line in lines:
+    # draw the bounding box
+    line = list(line.split(","))
+    if len(line) < 8:
+        continue
+    line = list(map(int, line[:8]))
+    draw.polygon(line, outline="red")
+image.save("output.png")
+```
+
+</hfoption>
+</hfoptions>
+
+
+## Example
+**Markdown Task:** For usage instructions, please refer to [md.py](https://huggingface.co/ydshieh/kosmos-2.5/blob/main/md.py).
+
+**OCR Task:** For usage instructions, please refer to [ocr.py](https://huggingface.co/ydshieh/kosmos-2.5/blob/main/ocr.py).
+
+
+
+## Kosmos2_5Config
+
+[[autodoc]] Kosmos2_5Config
+
+## Kosmos2_5ImageProcessor
+
+[[autodoc]] Kosmos2_5ImageProcessor
+    - preprocess
+
+## Kosmos2_5ImageProcessorFast
+
+[[autodoc]] Kosmos2_5ImageProcessorFast
+    - preprocess
+
+## Kosmos2_5Processor
+
+[[autodoc]] Kosmos2_5Processor
+
+## Kosmos2_5Model
+
+[[autodoc]] Kosmos2_5Model
+    - forward
+
+## Kosmos2_5ForConditionalGeneration
+
+[[autodoc]] Kosmos2_5ForConditionalGeneration
+    - forward
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -209,6 +209,7 @@
         ("jetmoe", "JetMoeConfig"),
         ("jukebox", "JukeboxConfig"),
         ("kosmos-2", "Kosmos2Config"),
+        ("kosmos-2.5", "Kosmos2_5Config"),
         ("kyutai_speech_to_text", "KyutaiSpeechToTextConfig"),
         ("layoutlm", "LayoutLMConfig"),
         ("layoutlmv2", "LayoutLMv2Config"),
@@ -625,6 +626,7 @@
         ("jetmoe", "JetMoe"),
         ("jukebox", "Jukebox"),
         ("kosmos-2", "KOSMOS-2"),
+        ("kosmos-2.5", "KOSMOS-2.5"),
         ("kyutai_speech_to_text", "KyutaiSpeechToText"),
         ("layoutlm", "LayoutLM"),
         ("layoutlmv2", "LayoutLMv2"),
@@ -906,6 +908,7 @@
         ("data2vec-vision", "data2vec"),
         ("donut-swin", "donut"),
         ("kosmos-2", "kosmos2"),
+        ("kosmos-2.5", "kosmos2_5"),
         ("maskformer-swin", "maskformer"),
         ("xclip", "x_clip"),
         ("clip_vision_model", "clip"),

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -116,6 +116,7 @@
             ("instructblipvideo", ("InstructBlipVideoImageProcessor", None)),
             ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")),
             ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+            ("kosmos-2.5", ("Kosmos2_5ImageProcessor", "Kosmos2_5ImageProcessorFast")),
             ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")),
             ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
             ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -209,6 +209,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("jetmoe", "JetMoeModel"),
         ("jukebox", "JukeboxModel"),
         ("kosmos-2", "Kosmos2Model"),
+        ("kosmos-2.5", "Kosmos2_5Model"),
         ("kyutai_speech_to_text", "KyutaiSpeechToTextModel"),
         ("layoutlm", "LayoutLMModel"),
         ("layoutlmv2", "LayoutLMv2Model"),
@@ -942,6 +943,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("instructblip", "InstructBlipForConditionalGeneration"),
         ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
+        ("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
         ("llava", "LlavaForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),
         ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
@@ -990,6 +992,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("internvl", "InternVLForConditionalGeneration"),
         ("janus", "JanusForConditionalGeneration"),
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
+        ("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
         ("llama4", "Llama4ForConditionalGeneration"),
         ("llava", "LlavaForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -88,6 +88,7 @@
         ("internvl", "InternVLProcessor"),
         ("janus", "JanusProcessor"),
         ("kosmos-2", "Kosmos2Processor"),
+        ("kosmos-2.5", "Kosmos2_5Processor"),
         ("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"),
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -343,6 +343,7 @@
                 "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
             ),
         ),
+        ("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
         ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
         ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
         ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),

diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py
@@ -1819,6 +1819,7 @@ def forward(
             vision_model_output=vision_model_output,
         )
 
+    @torch.no_grad()
     def generate(
         self,
         pixel_values: Optional[torch.Tensor] = None,

diff --git a/src/transformers/models/kosmos2_5/__init__.py b/src/transformers/models/kosmos2_5/__init__.py
@@ -0,0 +1,31 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_kosmos2_5 import *
+    from .image_processing_kosmos2_5 import *
+    from .image_processing_kosmos2_5_fast import *
+    from .modeling_kosmos2_5 import *
+    from .processing_kosmos2_5 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)