From a8bd2ded74df10e75f3e64b42b2e2457fe2efa78 Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Sun, 28 Dec 2025 21:10:29 +0100 Subject: [PATCH 01/20] model: Add SauerkrautLM-ColPali visual document retrieval models Add inference code and requirements for SauerkrautLM-ColPali visual document retrieval models. These are multi-vector embedding models based on the ColPali architecture: - ColQwen3 (Qwen3-VL backbone): 1.7B Turbo, 2B, 4B, 8B variants - ColLFM2 (LFM2-VL backbone): 450M variant - ColMinistral3 (Ministral3 backbone): 3B variant All models produce 128-dimensional embeddings per text/image token and use MaxSim (late interaction) for retrieval scoring. Model checkpoints: - https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1 - https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1 - https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1 - https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1 - https://huggingface.co/VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1 - https://huggingface.co/VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1 --- .../model_implementations/slm_models.py | 484 ++++++++++++++++++ pyproject.toml | 4 + 2 files changed, 488 insertions(+) create mode 100644 mteb/models/model_implementations/slm_models.py diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py new file mode 100644 index 0000000000..70230aab89 --- /dev/null +++ b/mteb/models/model_implementations/slm_models.py @@ -0,0 +1,484 @@ +""" +SauerkrautLM Visual Document Retrieval Models - MTEB Integration + +This module provides MTEB wrappers for SauerkrautLM ColPali-style models: +- SLM-ColQwen3 (Qwen3-VL backbone) +- SLM-ColLFM2 (LFM2 backbone) +- SLM-ColMinistral3 (Ministral3 backbone) + +Based on: +- MTEB ColPali implementation: mteb/models/model_implementations/colpali_models.py +""" + +from __future__ import annotations + +import logging +from functools import partial +from typing import Any + +import torch +from PIL import Image +from torch.utils.data import DataLoader +from tqdm.auto import tqdm + +from mteb._requires_package import ( + requires_image_dependencies, + requires_package, +) +from mteb.abstasks.task_metadata import TaskMetadata +from mteb.models.abs_encoder import AbsEncoder +from mteb.models.model_meta import ModelMeta, ScoringFunction +from mteb.types import Array, BatchedInput, PromptType + +logger = logging.getLogger(__name__) + + +# ============================================================================= +# Supported Languages +# ============================================================================= + +SUPPORTED_LANGUAGES = [ + "eng-Latn", # English + "deu-Latn", # German + "fra-Latn", # French + "spa-Latn", # Spanish + "ita-Latn", # Italian + "por-Latn", # Portuguese +] + + +# ============================================================================= +# Base Wrapper Class +# ============================================================================= + +class SLMBaseWrapper(AbsEncoder): + """ + Base wrapper for SauerkrautLM multi-vector embedding models. + + All our models use late interaction (MaxSim) for retrieval scoring. + """ + + model_class = None + processor_class = None + model_name_prefix = "SLM" + + def __init__( + self, + model_name: str, + revision: str | None = None, + device: str | None = None, + use_flash_attn: bool = True, + **kwargs, + ): + requires_image_dependencies() + requires_package( + self, "sauerkrautlm_colpali", model_name, "pip install sauerkrautlm-colpali" + ) + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self._load_model_and_processor(model_name, revision, use_flash_attn, **kwargs) + + def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): + """Override in subclasses to load specific model/processor.""" + raise NotImplementedError + + def encode( + self, + inputs: DataLoader[BatchedInput], + *, + task_metadata: TaskMetadata, + hf_split: str, + hf_subset: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> Array: + text_embeddings = None + image_embeddings = None + + if "text" in inputs.dataset.features: + text_embeddings = self.get_text_embeddings(inputs, **kwargs) + if "image" in inputs.dataset.features: + image_embeddings = self.get_image_embeddings(inputs, **kwargs) + + if text_embeddings is not None and image_embeddings is not None: + if len(text_embeddings) != len(image_embeddings): + raise ValueError( + "The number of texts and images must have the same length" + ) + fused_embeddings = text_embeddings + image_embeddings + return fused_embeddings + elif text_embeddings is not None: + return text_embeddings + elif image_embeddings is not None: + return image_embeddings + raise ValueError("No text or image features found in inputs") + + def encode_input(self, inputs): + """Forward pass through the model.""" + return self.mdl(**inputs) + + def _move_to_device(self, inputs: dict) -> dict: + """Move all tensor inputs to the model's device.""" + result = {} + for k, v in inputs.items(): + if isinstance(v, torch.Tensor): + result[k] = v.to(self.device) + else: + result[k] = v + return result + + def get_image_embeddings( + self, + images: DataLoader, + batch_size: int = 32, + **kwargs, + ) -> torch.Tensor: + import torchvision.transforms.functional as F + + all_embeds = [] + + with torch.no_grad(): + for batch in tqdm(images, desc="Encoding images"): + imgs = [ + F.to_pil_image(b) + if not isinstance(b, Image.Image) + else b + for b in batch["image"] + ] + inputs = self.processor.process_images(imgs) + inputs = self._move_to_device(inputs) + outs = self.encode_input(inputs) + all_embeds.extend(outs.cpu().to(torch.float32)) + + padded = torch.nn.utils.rnn.pad_sequence( + all_embeds, batch_first=True, padding_value=0 + ) + return padded + + def get_text_embeddings( + self, + texts: DataLoader, + batch_size: int = 32, + **kwargs, + ) -> torch.Tensor: + all_embeds = [] + + with torch.no_grad(): + for batch in tqdm(texts, desc="Encoding texts"): + inputs = self.processor.process_queries(batch["text"]) + inputs = self._move_to_device(inputs) + outs = self.encode_input(inputs) + all_embeds.extend(outs.cpu().to(torch.float32)) + + padded = torch.nn.utils.rnn.pad_sequence( + all_embeds, batch_first=True, padding_value=0 + ) + return padded + + def get_fused_embeddings( + self, + texts: list[str] | None = None, + images: list[Image.Image] | DataLoader | None = None, + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + batch_size: int = 32, + fusion_mode: str = "sum", + **kwargs: Any, + ): + raise NotImplementedError( + "Fused embeddings are not supported. " + "Please use get_text_embeddings or get_image_embeddings." + ) + + def calculate_probs( + self, + text_embeddings: torch.Tensor, + image_embeddings: torch.Tensor, + ) -> torch.Tensor: + scores = self.similarity(text_embeddings, image_embeddings).T + return scores.softmax(dim=-1) + + def similarity( + self, + a: torch.Tensor | list, + b: torch.Tensor | list, + ) -> torch.Tensor: + return self.processor.score(a, b, device=self.device) + + +# ============================================================================= +# ColQwen3 Wrapper +# ============================================================================= + +class SLMColQwen3Wrapper(SLMBaseWrapper): + """Wrapper for SLM-ColQwen3 models (Qwen3-VL backbone).""" + + def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): + from sauerkrautlm_colpali.models.qwen3.colqwen3 import ColQwen3, ColQwen3Processor + + self.mdl = ColQwen3.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2" if use_flash_attn else "eager", + revision=revision, + **kwargs, + ) + # Explicitly move to device + self.mdl = self.mdl.to(self.device) + self.mdl.eval() + + self.processor = ColQwen3Processor.from_pretrained( + model_name, + revision=revision, + ) + + logger.info(f"SLM-ColQwen3 loaded: dim={self.mdl.dim}, device={self.device}") + + +# ============================================================================= +# ColLFM2 Wrapper +# ============================================================================= + +class SLMColLFM2Wrapper(SLMBaseWrapper): + """Wrapper for SLM-ColLFM2 models (LFM2 backbone).""" + + def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): + from sauerkrautlm_colpali.models.lfm2.collfm2 import ColLFM2, ColLFM2Processor + + self.mdl = ColLFM2.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + revision=revision, + **kwargs, + ) + # Explicitly move to device + self.mdl = self.mdl.to(self.device) + self.mdl.eval() + + self.processor = ColLFM2Processor.from_pretrained( + model_name, + revision=revision, + ) + + logger.info(f"SLM-ColLFM2 loaded: dim={self.mdl.dim}, device={self.device}") + + +# ============================================================================= +# ColMinistral3 Wrapper +# ============================================================================= + +class SLMColMinistral3Wrapper(SLMBaseWrapper): + """Wrapper for SLM-ColMinistral3 models (Ministral3 backbone).""" + + def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): + from sauerkrautlm_colpali.models.ministral3.colministral3 import ColMinistral3, ColMinistral3Processor + + # ColMinistral3.__init__ doesn't accept extra kwargs - only pass model_name + self.mdl = ColMinistral3.from_pretrained(model_name) + # Explicitly move to device and convert to bfloat16 + self.mdl = self.mdl.to(dtype=torch.bfloat16, device=self.device) + self.mdl.eval() + + self.processor = ColMinistral3Processor.from_pretrained(model_name) + + logger.info(f"SLM-ColMinistral3 loaded: dim={self.mdl.dim}, device={self.device}") + + +# ============================================================================= +# Loader Functions +# ============================================================================= + +def slm_colqwen3_loader(model_name: str, revision: str | None = None, device: str | None = None, **kwargs) -> SLMColQwen3Wrapper: + return SLMColQwen3Wrapper(model_name=model_name, revision=revision, device=device, **kwargs) + +def slm_collfm2_loader(model_name: str, revision: str | None = None, device: str | None = None, **kwargs) -> SLMColLFM2Wrapper: + return SLMColLFM2Wrapper(model_name=model_name, revision=revision, device=device, **kwargs) + +def slm_colministral3_loader(model_name: str, revision: str | None = None, device: str | None = None, **kwargs) -> SLMColMinistral3Wrapper: + return SLMColMinistral3Wrapper(model_name=model_name, revision=revision, device=device, **kwargs) + + +# ============================================================================= +# Citations +# ============================================================================= + +SAUERKRAUTLM_CITATION = """ +@misc{sauerkrautlm-colpali-2025, + title={SauerkrautLM-ColPali: Multi-Vector Vision Retrieval Models}, + author={David Golchinfar}, + organization={VAGO Solutions}, + year={2025}, + url={https://github.com/VAGOsolutions/sauerkrautlm-colpali} +} +""" + +COLPALI_CITATION = """ +@misc{faysse2024colpali, + title={ColPali: Efficient Document Retrieval with Vision Language Models}, + author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C\\'eline and Colombo, Pierre}, + year={2024}, + eprint={2407.01449}, + archivePrefix={arXiv}, + primaryClass={cs.IR} +} +""" + + +# ============================================================================= +# ColQwen3 Model Metadata +# ============================================================================= + +# ColQwen3-1.7B Turbo: ~1.7B params → 3.4 GB VRAM in bfloat16 +slm_colqwen3_1_7b_turbo = ModelMeta( + loader=partial(slm_colqwen3_loader), + name="VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1", + languages=SUPPORTED_LANGUAGES, + revision="main", + release_date="2025-01-01", + modalities=["image", "text"], + n_parameters=1_700_000_000, + memory_usage_mb=3400, + max_tokens=262144, + embed_dim=128, + license="apache-2.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["ColPali"], + reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1", + similarity_fn_name=ScoringFunction.MAX_SIM, + use_instructions=True, + training_datasets=None, + citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, +) + +# ColQwen3-2B: ~2.2B params → 4.4 GB VRAM in bfloat16 +slm_colqwen3_2b = ModelMeta( + loader=partial(slm_colqwen3_loader), + name="VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1", + languages=SUPPORTED_LANGUAGES, + revision="main", + release_date="2025-01-01", + modalities=["image", "text"], + n_parameters=2_200_000_000, + memory_usage_mb=4400, + max_tokens=262144, + embed_dim=128, + license="apache-2.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["ColPali"], + reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1", + similarity_fn_name=ScoringFunction.MAX_SIM, + use_instructions=True, + training_datasets=None, + citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, +) + +# ColQwen3-4B: ~4B params → 8 GB VRAM in bfloat16 +slm_colqwen3_4b = ModelMeta( + loader=partial(slm_colqwen3_loader), + name="VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1", + languages=SUPPORTED_LANGUAGES, + revision="main", + release_date="2025-01-01", + modalities=["image", "text"], + n_parameters=4_000_000_000, + memory_usage_mb=8000, + max_tokens=262144, + embed_dim=128, + license="apache-2.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["ColPali"], + reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1", + similarity_fn_name=ScoringFunction.MAX_SIM, + use_instructions=True, + training_datasets=None, + citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, +) + +# ColQwen3-8B: ~8B params → 16 GB VRAM in bfloat16 +slm_colqwen3_8b = ModelMeta( + loader=partial(slm_colqwen3_loader), + name="VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1", + languages=SUPPORTED_LANGUAGES, + revision="main", + release_date="2025-01-01", + modalities=["image", "text"], + n_parameters=8_000_000_000, + memory_usage_mb=16000, + max_tokens=262144, + embed_dim=128, + license="apache-2.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["ColPali"], + reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1", + similarity_fn_name=ScoringFunction.MAX_SIM, + use_instructions=True, + training_datasets=None, + citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, +) + + +# ============================================================================= +# ColLFM2 Model Metadata +# ============================================================================= + +# ColLFM2-450M: ~450M params → 900 MB VRAM in bfloat16 +slm_collfm2_450m = ModelMeta( + loader=partial(slm_collfm2_loader), + name="VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1", + languages=SUPPORTED_LANGUAGES, + revision="main", + release_date="2025-01-01", + modalities=["image", "text"], + n_parameters=450_000_000, + memory_usage_mb=900, + max_tokens=32768, + embed_dim=128, + license="https://huggingface.co/LiquidAI/LFM2-VL-450M/blob/main/LICENSE", # LiquidAI LFM 1.0 License + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["ColPali"], + reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1", + similarity_fn_name=ScoringFunction.MAX_SIM, + use_instructions=True, + training_datasets=None, + citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, +) + + +# ============================================================================= +# ColMinistral3 Model Metadata +# ============================================================================= + +# ColMinistral3-3B: ~3B params → 6 GB VRAM in bfloat16 +slm_colministral3_3b = ModelMeta( + loader=partial(slm_colministral3_loader), + name="VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1", + languages=SUPPORTED_LANGUAGES, + revision="main", + release_date="2025-01-01", + modalities=["image", "text"], + n_parameters=3_000_000_000, + memory_usage_mb=6000, + max_tokens=262144, + embed_dim=128, + license="apache-2.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["ColPali"], + reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1", + similarity_fn_name=ScoringFunction.MAX_SIM, + use_instructions=True, + training_datasets=None, + citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, +) diff --git a/pyproject.toml b/pyproject.toml index 6fe28affd7..8d6fb01e99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,7 @@ nomic = ["einops>=0.8.1"] ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] colpali_engine = ["colpali_engine>=0.3.12"] colqwen3 = ["transformers>=4.57", "torchvision>=0.22.1"] +sauerkrautlm-colpali = ["transformers>=4.47.0", "torch>=2.0.0", "sauerkrautlm-colpali @ git+https://github.com/VAGOsolutions/sauerkrautlm-colpali.git"] xet = ["huggingface_hub>=0.32.0"] youtu = ["tencentcloud-sdk-python-common>=3.0.1454", "tencentcloud-sdk-python-lkeap>=3.0.1451"] llama-embed-nemotron = ["transformers==4.51.0"] @@ -327,6 +328,9 @@ conflicts = [ [{ extra = "colqwen3" }, { extra = "llama-embed-nemotron" }], # conflicting versions of transformers [{ extra = "jina-v4" }, { extra = "llm2vec" }], [{ extra = "jina-v4" }, { extra = "llama-embed-nemotron" }], # conflicting versions of transformers + [{ extra = "sauerkrautlm-colpali" }, { extra = "pylate" }], + [{ extra = "sauerkrautlm-colpali" }, { extra = "llm2vec" }], + [{ extra = "sauerkrautlm-colpali" }, { extra = "llama-embed-nemotron" }], ] [tool.uv.extra-build-dependencies] From 952543b107195f6cce460a698b77b40e29bfd426 Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Sun, 28 Dec 2025 21:46:53 +0100 Subject: [PATCH 02/20] fix: Address review comments - Remove loader functions, use classes directly in ModelMeta - Remove unused get_fused_embeddings method - Move model.to(device) and model.eval() to base class __init__ - Pass torch_dtype directly to ColMinistral3.from_pretrained --- .../model_implementations/slm_models.py | 59 ++++--------------- 1 file changed, 12 insertions(+), 47 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 70230aab89..658001258f 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -77,6 +77,8 @@ def __init__( self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") self._load_model_and_processor(model_name, revision, use_flash_attn, **kwargs) + self.mdl = self.mdl.to(self.device) + self.mdl.eval() def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): """Override in subclasses to load specific model/processor.""" @@ -175,22 +177,6 @@ def get_text_embeddings( ) return padded - def get_fused_embeddings( - self, - texts: list[str] | None = None, - images: list[Image.Image] | DataLoader | None = None, - *, - task_name: str | None = None, - prompt_type: PromptType | None = None, - batch_size: int = 32, - fusion_mode: str = "sum", - **kwargs: Any, - ): - raise NotImplementedError( - "Fused embeddings are not supported. " - "Please use get_text_embeddings or get_image_embeddings." - ) - def calculate_probs( self, text_embeddings: torch.Tensor, @@ -224,9 +210,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar revision=revision, **kwargs, ) - # Explicitly move to device - self.mdl = self.mdl.to(self.device) - self.mdl.eval() self.processor = ColQwen3Processor.from_pretrained( model_name, @@ -252,9 +235,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar revision=revision, **kwargs, ) - # Explicitly move to device - self.mdl = self.mdl.to(self.device) - self.mdl.eval() self.processor = ColLFM2Processor.from_pretrained( model_name, @@ -274,31 +254,16 @@ class SLMColMinistral3Wrapper(SLMBaseWrapper): def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): from sauerkrautlm_colpali.models.ministral3.colministral3 import ColMinistral3, ColMinistral3Processor - # ColMinistral3.__init__ doesn't accept extra kwargs - only pass model_name - self.mdl = ColMinistral3.from_pretrained(model_name) - # Explicitly move to device and convert to bfloat16 - self.mdl = self.mdl.to(dtype=torch.bfloat16, device=self.device) - self.mdl.eval() + self.mdl = ColMinistral3.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + ) self.processor = ColMinistral3Processor.from_pretrained(model_name) logger.info(f"SLM-ColMinistral3 loaded: dim={self.mdl.dim}, device={self.device}") -# ============================================================================= -# Loader Functions -# ============================================================================= - -def slm_colqwen3_loader(model_name: str, revision: str | None = None, device: str | None = None, **kwargs) -> SLMColQwen3Wrapper: - return SLMColQwen3Wrapper(model_name=model_name, revision=revision, device=device, **kwargs) - -def slm_collfm2_loader(model_name: str, revision: str | None = None, device: str | None = None, **kwargs) -> SLMColLFM2Wrapper: - return SLMColLFM2Wrapper(model_name=model_name, revision=revision, device=device, **kwargs) - -def slm_colministral3_loader(model_name: str, revision: str | None = None, device: str | None = None, **kwargs) -> SLMColMinistral3Wrapper: - return SLMColMinistral3Wrapper(model_name=model_name, revision=revision, device=device, **kwargs) - - # ============================================================================= # Citations # ============================================================================= @@ -331,7 +296,7 @@ def slm_colministral3_loader(model_name: str, revision: str | None = None, devic # ColQwen3-1.7B Turbo: ~1.7B params → 3.4 GB VRAM in bfloat16 slm_colqwen3_1_7b_turbo = ModelMeta( - loader=partial(slm_colqwen3_loader), + loader=partial(SLMColQwen3Wrapper), name="VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", @@ -355,7 +320,7 @@ def slm_colministral3_loader(model_name: str, revision: str | None = None, devic # ColQwen3-2B: ~2.2B params → 4.4 GB VRAM in bfloat16 slm_colqwen3_2b = ModelMeta( - loader=partial(slm_colqwen3_loader), + loader=partial(SLMColQwen3Wrapper), name="VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", @@ -379,7 +344,7 @@ def slm_colministral3_loader(model_name: str, revision: str | None = None, devic # ColQwen3-4B: ~4B params → 8 GB VRAM in bfloat16 slm_colqwen3_4b = ModelMeta( - loader=partial(slm_colqwen3_loader), + loader=partial(SLMColQwen3Wrapper), name="VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", @@ -403,7 +368,7 @@ def slm_colministral3_loader(model_name: str, revision: str | None = None, devic # ColQwen3-8B: ~8B params → 16 GB VRAM in bfloat16 slm_colqwen3_8b = ModelMeta( - loader=partial(slm_colqwen3_loader), + loader=partial(SLMColQwen3Wrapper), name="VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", @@ -432,7 +397,7 @@ def slm_colministral3_loader(model_name: str, revision: str | None = None, devic # ColLFM2-450M: ~450M params → 900 MB VRAM in bfloat16 slm_collfm2_450m = ModelMeta( - loader=partial(slm_collfm2_loader), + loader=partial(SLMColLFM2Wrapper), name="VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", @@ -461,7 +426,7 @@ def slm_colministral3_loader(model_name: str, revision: str | None = None, devic # ColMinistral3-3B: ~3B params → 6 GB VRAM in bfloat16 slm_colministral3_3b = ModelMeta( - loader=partial(slm_colministral3_loader), + loader=partial(SLMColMinistral3Wrapper), name="VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", From 95ce3cfda1b271cf648403d6d9dfb487e0a6d60e Mon Sep 17 00:00:00 2001 From: dgolchin Date: Sun, 28 Dec 2025 21:54:25 +0100 Subject: [PATCH 03/20] Update mteb/models/model_implementations/slm_models.py Co-authored-by: Roman Solomatin --- mteb/models/model_implementations/slm_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 658001258f..69d123ee68 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -426,7 +426,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColMinistral3-3B: ~3B params → 6 GB VRAM in bfloat16 slm_colministral3_3b = ModelMeta( - loader=partial(SLMColMinistral3Wrapper), + loader=SLMColMinistral3Wrapper, name="VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", From 50856f6ffb922f6c4983e2b14a487e999890f51e Mon Sep 17 00:00:00 2001 From: dgolchin Date: Sun, 28 Dec 2025 21:54:42 +0100 Subject: [PATCH 04/20] Update mteb/models/model_implementations/slm_models.py Co-authored-by: Roman Solomatin --- mteb/models/model_implementations/slm_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 69d123ee68..b41c5ce9cb 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -397,7 +397,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColLFM2-450M: ~450M params → 900 MB VRAM in bfloat16 slm_collfm2_450m = ModelMeta( - loader=partial(SLMColLFM2Wrapper), + loader=SLMColLFM2Wrapper, name="VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", From 7658b354785b25cc67306d04836f177dfe2b54a2 Mon Sep 17 00:00:00 2001 From: dgolchin Date: Sun, 28 Dec 2025 21:54:56 +0100 Subject: [PATCH 05/20] Update mteb/models/model_implementations/slm_models.py Co-authored-by: Roman Solomatin --- mteb/models/model_implementations/slm_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index b41c5ce9cb..7a66a47a05 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -368,7 +368,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-8B: ~8B params → 16 GB VRAM in bfloat16 slm_colqwen3_8b = ModelMeta( - loader=partial(SLMColQwen3Wrapper), + loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", From 1a393c350c8fce522c86d5c3e80c112ca911b46a Mon Sep 17 00:00:00 2001 From: dgolchin Date: Sun, 28 Dec 2025 21:55:12 +0100 Subject: [PATCH 06/20] Update mteb/models/model_implementations/slm_models.py Co-authored-by: Roman Solomatin --- mteb/models/model_implementations/slm_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 7a66a47a05..1e5bb59eb2 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -344,7 +344,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-4B: ~4B params → 8 GB VRAM in bfloat16 slm_colqwen3_4b = ModelMeta( - loader=partial(SLMColQwen3Wrapper), + loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", From de7445e7ae19451747abe54d0aa4e490834f1b0e Mon Sep 17 00:00:00 2001 From: dgolchin Date: Sun, 28 Dec 2025 21:55:22 +0100 Subject: [PATCH 07/20] Update mteb/models/model_implementations/slm_models.py Co-authored-by: Roman Solomatin --- mteb/models/model_implementations/slm_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 1e5bb59eb2..8ad2a85b6a 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -320,7 +320,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-2B: ~2.2B params → 4.4 GB VRAM in bfloat16 slm_colqwen3_2b = ModelMeta( - loader=partial(SLMColQwen3Wrapper), + loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", From 60f8176087444a44c30f1ae99b8311e82d16a2bd Mon Sep 17 00:00:00 2001 From: dgolchin Date: Sun, 28 Dec 2025 21:55:40 +0100 Subject: [PATCH 08/20] Update mteb/models/model_implementations/slm_models.py Co-authored-by: Roman Solomatin --- mteb/models/model_implementations/slm_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 8ad2a85b6a..dad779a91f 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -296,7 +296,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-1.7B Turbo: ~1.7B params → 3.4 GB VRAM in bfloat16 slm_colqwen3_1_7b_turbo = ModelMeta( - loader=partial(SLMColQwen3Wrapper), + loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1", languages=SUPPORTED_LANGUAGES, revision="main", From dba097e191710b158999a5800f48c9ed1527a701 Mon Sep 17 00:00:00 2001 From: dgolchin Date: Sun, 28 Dec 2025 21:55:59 +0100 Subject: [PATCH 09/20] Update pyproject.toml Co-authored-by: Roman Solomatin --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8d6fb01e99..2a6a1edc18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,7 @@ nomic = ["einops>=0.8.1"] ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] colpali_engine = ["colpali_engine>=0.3.12"] colqwen3 = ["transformers>=4.57", "torchvision>=0.22.1"] -sauerkrautlm-colpali = ["transformers>=4.47.0", "torch>=2.0.0", "sauerkrautlm-colpali @ git+https://github.com/VAGOsolutions/sauerkrautlm-colpali.git"] +sauerkrautlm-colpali = ["sauerkrautlm-colpali @ git+https://github.com/VAGOsolutions/sauerkrautlm-colpali.git"] xet = ["huggingface_hub>=0.32.0"] youtu = ["tencentcloud-sdk-python-common>=3.0.1454", "tencentcloud-sdk-python-lkeap>=3.0.1451"] llama-embed-nemotron = ["transformers==4.51.0"] From 6b738ee4d12c62d42087135419c89826c76e416b Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Sun, 28 Dec 2025 22:00:09 +0100 Subject: [PATCH 10/20] fix: Update release_date to 2025-12-20 --- .../model_implementations/slm_models.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index dad779a91f..1fbaa59147 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -296,11 +296,11 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-1.7B Turbo: ~1.7B params → 3.4 GB VRAM in bfloat16 slm_colqwen3_1_7b_turbo = ModelMeta( - loader=SLMColQwen3Wrapper, + loader=partial(SLMColQwen3Wrapper), name="VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1", languages=SUPPORTED_LANGUAGES, - revision="main", - release_date="2025-01-01", + revision="19c295a18e057d6d82754f627c09408117ffdb66", + release_date="2025-12-20", modalities=["image", "text"], n_parameters=1_700_000_000, memory_usage_mb=3400, @@ -320,11 +320,11 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-2B: ~2.2B params → 4.4 GB VRAM in bfloat16 slm_colqwen3_2b = ModelMeta( - loader=SLMColQwen3Wrapper, + loader=partial(SLMColQwen3Wrapper), name="VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1", languages=SUPPORTED_LANGUAGES, - revision="main", - release_date="2025-01-01", + revision="48f699713c10af754684e12060a2af9266462cc9", + release_date="2025-12-20", modalities=["image", "text"], n_parameters=2_200_000_000, memory_usage_mb=4400, @@ -344,11 +344,11 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-4B: ~4B params → 8 GB VRAM in bfloat16 slm_colqwen3_4b = ModelMeta( - loader=SLMColQwen3Wrapper, + loader=partial(SLMColQwen3Wrapper), name="VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1", languages=SUPPORTED_LANGUAGES, - revision="main", - release_date="2025-01-01", + revision="b635fbb3ab145f07608ed10a85def33544de1723", + release_date="2025-12-20", modalities=["image", "text"], n_parameters=4_000_000_000, memory_usage_mb=8000, @@ -368,11 +368,11 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-8B: ~8B params → 16 GB VRAM in bfloat16 slm_colqwen3_8b = ModelMeta( - loader=SLMColQwen3Wrapper, + loader=partial(SLMColQwen3Wrapper), name="VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1", languages=SUPPORTED_LANGUAGES, - revision="main", - release_date="2025-01-01", + revision="36ac136e451a7b8d8229725d69d4ec23aa4f03c8", + release_date="2025-12-20", modalities=["image", "text"], n_parameters=8_000_000_000, memory_usage_mb=16000, @@ -397,11 +397,11 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColLFM2-450M: ~450M params → 900 MB VRAM in bfloat16 slm_collfm2_450m = ModelMeta( - loader=SLMColLFM2Wrapper, + loader=partial(SLMColLFM2Wrapper), name="VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1", languages=SUPPORTED_LANGUAGES, - revision="main", - release_date="2025-01-01", + revision="a65223fd6633f331ccff4483e47575c3c620dc60", + release_date="2025-12-20", modalities=["image", "text"], n_parameters=450_000_000, memory_usage_mb=900, @@ -426,11 +426,11 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColMinistral3-3B: ~3B params → 6 GB VRAM in bfloat16 slm_colministral3_3b = ModelMeta( - loader=SLMColMinistral3Wrapper, + loader=partial(SLMColMinistral3Wrapper), name="VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1", languages=SUPPORTED_LANGUAGES, - revision="main", - release_date="2025-01-01", + revision="54aa3ffbbce20471fdcc4afc07d13989c65e71b8", + release_date="2025-12-20", modalities=["image", "text"], n_parameters=3_000_000_000, memory_usage_mb=6000, From e8516226ca1a02287b068157acd97329be371727 Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Mon, 29 Dec 2025 01:27:07 +0100 Subject: [PATCH 11/20] fix: address review comments - remove partial, add adapted_from and training_datasets --- .../model_implementations/slm_models.py | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 1fbaa59147..a583f8c26d 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -13,11 +13,9 @@ from __future__ import annotations import logging -from functools import partial from typing import Any import torch -from PIL import Image from torch.utils.data import DataLoader from tqdm.auto import tqdm @@ -141,6 +139,7 @@ def get_image_embeddings( with torch.no_grad(): for batch in tqdm(images, desc="Encoding images"): + from PIL import Image imgs = [ F.to_pil_image(b) if not isinstance(b, Image.Image) @@ -296,7 +295,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColQwen3-1.7B Turbo: ~1.7B params → 3.4 GB VRAM in bfloat16 slm_colqwen3_1_7b_turbo = ModelMeta( - loader=partial(SLMColQwen3Wrapper), + loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1", languages=SUPPORTED_LANGUAGES, revision="19c295a18e057d6d82754f627c09408117ffdb66", @@ -314,13 +313,14 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1", similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, - training_datasets=None, + adapted_from="Qwen/Qwen3-VL-2B-Instruct", + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) # ColQwen3-2B: ~2.2B params → 4.4 GB VRAM in bfloat16 slm_colqwen3_2b = ModelMeta( - loader=partial(SLMColQwen3Wrapper), + loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1", languages=SUPPORTED_LANGUAGES, revision="48f699713c10af754684e12060a2af9266462cc9", @@ -338,13 +338,14 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1", similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, - training_datasets=None, + adapted_from="Qwen/Qwen3-VL-2B-Instruct", + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) # ColQwen3-4B: ~4B params → 8 GB VRAM in bfloat16 slm_colqwen3_4b = ModelMeta( - loader=partial(SLMColQwen3Wrapper), + loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1", languages=SUPPORTED_LANGUAGES, revision="b635fbb3ab145f07608ed10a85def33544de1723", @@ -362,13 +363,14 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1", similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, - training_datasets=None, + adapted_from="Qwen/Qwen3-VL-4B-Instruct", + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) # ColQwen3-8B: ~8B params → 16 GB VRAM in bfloat16 slm_colqwen3_8b = ModelMeta( - loader=partial(SLMColQwen3Wrapper), + loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1", languages=SUPPORTED_LANGUAGES, revision="36ac136e451a7b8d8229725d69d4ec23aa4f03c8", @@ -386,7 +388,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1", similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, - training_datasets=None, + adapted_from="Qwen/Qwen3-VL-8B-Instruct", + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -397,7 +400,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColLFM2-450M: ~450M params → 900 MB VRAM in bfloat16 slm_collfm2_450m = ModelMeta( - loader=partial(SLMColLFM2Wrapper), + loader=SLMColLFM2Wrapper, name="VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1", languages=SUPPORTED_LANGUAGES, revision="a65223fd6633f331ccff4483e47575c3c620dc60", @@ -415,7 +418,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1", similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, - training_datasets=None, + adapted_from="LiquidAI/LFM2-VL-450M", + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -426,7 +430,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColMinistral3-3B: ~3B params → 6 GB VRAM in bfloat16 slm_colministral3_3b = ModelMeta( - loader=partial(SLMColMinistral3Wrapper), + loader=SLMColMinistral3Wrapper, name="VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1", languages=SUPPORTED_LANGUAGES, revision="54aa3ffbbce20471fdcc4afc07d13989c65e71b8", @@ -444,6 +448,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar reference="https://huggingface.co/VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1", similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, - training_datasets=None, + adapted_from="mistralai/Ministral-3B-Instruct-2410", + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) From 783f5512a7c36707f4981bc25485bff29e2e691f Mon Sep 17 00:00:00 2001 From: dgolchin Date: Mon, 29 Dec 2025 20:20:59 +0100 Subject: [PATCH 12/20] Update mteb/models/model_implementations/slm_models.py Co-authored-by: Roman Solomatin --- mteb/models/model_implementations/slm_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index a583f8c26d..96df09cf9c 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -339,7 +339,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-2B-Instruct", - training_datasets={"vidore/colpali_train_set"}, + training_datasets={"MMarcoReranking", "VDRMultilingualRetrieval"} | COLPALI_TRAINING_DATA, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) From 6169c6f36f5d1337a1b511e827ea032e7ca8d598 Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Mon, 29 Dec 2025 20:54:58 +0100 Subject: [PATCH 13/20] fix: import COLPALI_CITATION from colpali_models and add model_type --- mteb/models/model_implementations/slm_models.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 96df09cf9c..78113d6e38 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -25,6 +25,7 @@ ) from mteb.abstasks.task_metadata import TaskMetadata from mteb.models.abs_encoder import AbsEncoder +from mteb.models.model_implementations.colpali_models import COLPALI_CITATION from mteb.models.model_meta import ModelMeta, ScoringFunction from mteb.types import Array, BatchedInput, PromptType @@ -277,16 +278,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar } """ -COLPALI_CITATION = """ -@misc{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C\\'eline and Colombo, Pierre}, - year={2024}, - eprint={2407.01449}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -} -""" # ============================================================================= @@ -301,6 +292,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar revision="19c295a18e057d6d82754f627c09408117ffdb66", release_date="2025-12-20", modalities=["image", "text"], + model_type=["late-interaction"], n_parameters=1_700_000_000, memory_usage_mb=3400, max_tokens=262144, @@ -326,6 +318,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar revision="48f699713c10af754684e12060a2af9266462cc9", release_date="2025-12-20", modalities=["image", "text"], + model_type=["late-interaction"], n_parameters=2_200_000_000, memory_usage_mb=4400, max_tokens=262144, @@ -351,6 +344,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar revision="b635fbb3ab145f07608ed10a85def33544de1723", release_date="2025-12-20", modalities=["image", "text"], + model_type=["late-interaction"], n_parameters=4_000_000_000, memory_usage_mb=8000, max_tokens=262144, @@ -376,6 +370,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar revision="36ac136e451a7b8d8229725d69d4ec23aa4f03c8", release_date="2025-12-20", modalities=["image", "text"], + model_type=["late-interaction"], n_parameters=8_000_000_000, memory_usage_mb=16000, max_tokens=262144, @@ -406,6 +401,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar revision="a65223fd6633f331ccff4483e47575c3c620dc60", release_date="2025-12-20", modalities=["image", "text"], + model_type=["late-interaction"], n_parameters=450_000_000, memory_usage_mb=900, max_tokens=32768, @@ -436,6 +432,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar revision="54aa3ffbbce20471fdcc4afc07d13989c65e71b8", release_date="2025-12-20", modalities=["image", "text"], + model_type=["late-interaction"], n_parameters=3_000_000_000, memory_usage_mb=6000, max_tokens=262144, From d1ea011f1e01ec4e3d64080c6a879991ec38c59d Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 30 Dec 2025 00:58:09 +0500 Subject: [PATCH 14/20] add training datasets --- .../model_implementations/slm_models.py | 70 ++++++++++++------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 78113d6e38..956ead25d2 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -25,7 +25,10 @@ ) from mteb.abstasks.task_metadata import TaskMetadata from mteb.models.abs_encoder import AbsEncoder -from mteb.models.model_implementations.colpali_models import COLPALI_CITATION +from mteb.models.model_implementations.colpali_models import ( + COLPALI_CITATION, + COLPALI_TRAINING_DATA, +) from mteb.models.model_meta import ModelMeta, ScoringFunction from mteb.types import Array, BatchedInput, PromptType @@ -50,13 +53,14 @@ # Base Wrapper Class # ============================================================================= + class SLMBaseWrapper(AbsEncoder): """ Base wrapper for SauerkrautLM multi-vector embedding models. - + All our models use late interaction (MaxSim) for retrieval scoring. """ - + model_class = None processor_class = None model_name_prefix = "SLM" @@ -73,7 +77,7 @@ def __init__( requires_package( self, "sauerkrautlm_colpali", model_name, "pip install sauerkrautlm-colpali" ) - + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") self._load_model_and_processor(model_name, revision, use_flash_attn, **kwargs) self.mdl = self.mdl.to(self.device) @@ -95,7 +99,7 @@ def encode( ) -> Array: text_embeddings = None image_embeddings = None - + if "text" in inputs.dataset.features: text_embeddings = self.get_text_embeddings(inputs, **kwargs) if "image" in inputs.dataset.features: @@ -141,10 +145,9 @@ def get_image_embeddings( with torch.no_grad(): for batch in tqdm(images, desc="Encoding images"): from PIL import Image + imgs = [ - F.to_pil_image(b) - if not isinstance(b, Image.Image) - else b + F.to_pil_image(b) if not isinstance(b, Image.Image) else b for b in batch["image"] ] inputs = self.processor.process_images(imgs) @@ -164,7 +167,7 @@ def get_text_embeddings( **kwargs, ) -> torch.Tensor: all_embeds = [] - + with torch.no_grad(): for batch in tqdm(texts, desc="Encoding texts"): inputs = self.processor.process_queries(batch["text"]) @@ -178,16 +181,16 @@ def get_text_embeddings( return padded def calculate_probs( - self, - text_embeddings: torch.Tensor, + self, + text_embeddings: torch.Tensor, image_embeddings: torch.Tensor, ) -> torch.Tensor: scores = self.similarity(text_embeddings, image_embeddings).T return scores.softmax(dim=-1) def similarity( - self, - a: torch.Tensor | list, + self, + a: torch.Tensor | list, b: torch.Tensor | list, ) -> torch.Tensor: return self.processor.score(a, b, device=self.device) @@ -197,11 +200,15 @@ def similarity( # ColQwen3 Wrapper # ============================================================================= + class SLMColQwen3Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColQwen3 models (Qwen3-VL backbone).""" def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): - from sauerkrautlm_colpali.models.qwen3.colqwen3 import ColQwen3, ColQwen3Processor + from sauerkrautlm_colpali.models.qwen3.colqwen3 import ( + ColQwen3, + ColQwen3Processor, + ) self.mdl = ColQwen3.from_pretrained( model_name, @@ -215,7 +222,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar model_name, revision=revision, ) - + logger.info(f"SLM-ColQwen3 loaded: dim={self.mdl.dim}, device={self.device}") @@ -223,6 +230,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColLFM2 Wrapper # ============================================================================= + class SLMColLFM2Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColLFM2 models (LFM2 backbone).""" @@ -240,7 +248,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar model_name, revision=revision, ) - + logger.info(f"SLM-ColLFM2 loaded: dim={self.mdl.dim}, device={self.device}") @@ -248,11 +256,15 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar # ColMinistral3 Wrapper # ============================================================================= + class SLMColMinistral3Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColMinistral3 models (Ministral3 backbone).""" def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): - from sauerkrautlm_colpali.models.ministral3.colministral3 import ColMinistral3, ColMinistral3Processor + from sauerkrautlm_colpali.models.ministral3.colministral3 import ( + ColMinistral3, + ColMinistral3Processor, + ) self.mdl = ColMinistral3.from_pretrained( model_name, @@ -260,8 +272,10 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar ) self.processor = ColMinistral3Processor.from_pretrained(model_name) - - logger.info(f"SLM-ColMinistral3 loaded: dim={self.mdl.dim}, device={self.device}") + + logger.info( + f"SLM-ColMinistral3 loaded: dim={self.mdl.dim}, device={self.device}" + ) # ============================================================================= @@ -279,11 +293,15 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar """ - # ============================================================================= # ColQwen3 Model Metadata # ============================================================================= +_SLM_TRAINING_DATASETS = { + "MMarcoReranking", + "VDRMultilingualRetrieval", +} | COLPALI_TRAINING_DATA + # ColQwen3-1.7B Turbo: ~1.7B params → 3.4 GB VRAM in bfloat16 slm_colqwen3_1_7b_turbo = ModelMeta( loader=SLMColQwen3Wrapper, @@ -306,7 +324,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-2B-Instruct", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=_SLM_TRAINING_DATASETS, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -332,7 +350,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-2B-Instruct", - training_datasets={"MMarcoReranking", "VDRMultilingualRetrieval"} | COLPALI_TRAINING_DATA, + training_datasets=_SLM_TRAINING_DATASETS, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -358,7 +376,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-4B-Instruct", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=_SLM_TRAINING_DATASETS, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -384,7 +402,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-8B-Instruct", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=_SLM_TRAINING_DATASETS, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -415,7 +433,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="LiquidAI/LFM2-VL-450M", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=_SLM_TRAINING_DATASETS, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -446,6 +464,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="mistralai/Ministral-3B-Instruct-2410", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=_SLM_TRAINING_DATASETS, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) From 420322b57b54105cacc481a8117c39efb0bc32fa Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Mon, 29 Dec 2025 22:14:41 +0100 Subject: [PATCH 15/20] fix: remove section headers and use PyPI package instead of Git URL --- .../model_implementations/slm_models.py | 37 ++++++++++--------- pyproject.toml | 2 +- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 956ead25d2..3c9a00ba2c 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -35,10 +35,6 @@ logger = logging.getLogger(__name__) -# ============================================================================= -# Supported Languages -# ============================================================================= - SUPPORTED_LANGUAGES = [ "eng-Latn", # English "deu-Latn", # German @@ -49,11 +45,14 @@ ] +<<<<<<< HEAD # ============================================================================= # Base Wrapper Class # ============================================================================= +======= +>>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) class SLMBaseWrapper(AbsEncoder): """ Base wrapper for SauerkrautLM multi-vector embedding models. @@ -196,11 +195,14 @@ def similarity( return self.processor.score(a, b, device=self.device) +<<<<<<< HEAD # ============================================================================= # ColQwen3 Wrapper # ============================================================================= +======= +>>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) class SLMColQwen3Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColQwen3 models (Qwen3-VL backbone).""" @@ -226,11 +228,14 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar logger.info(f"SLM-ColQwen3 loaded: dim={self.mdl.dim}, device={self.device}") +<<<<<<< HEAD # ============================================================================= # ColLFM2 Wrapper # ============================================================================= +======= +>>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) class SLMColLFM2Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColLFM2 models (LFM2 backbone).""" @@ -252,11 +257,14 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar logger.info(f"SLM-ColLFM2 loaded: dim={self.mdl.dim}, device={self.device}") +<<<<<<< HEAD # ============================================================================= # ColMinistral3 Wrapper # ============================================================================= +======= +>>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) class SLMColMinistral3Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColMinistral3 models (Ministral3 backbone).""" @@ -278,10 +286,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar ) -# ============================================================================= -# Citations -# ============================================================================= - SAUERKRAUTLM_CITATION = """ @misc{sauerkrautlm-colpali-2025, title={SauerkrautLM-ColPali: Multi-Vector Vision Retrieval Models}, @@ -293,6 +297,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar """ +<<<<<<< HEAD # ============================================================================= # ColQwen3 Model Metadata # ============================================================================= @@ -302,6 +307,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar "VDRMultilingualRetrieval", } | COLPALI_TRAINING_DATA +======= +>>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) # ColQwen3-1.7B Turbo: ~1.7B params → 3.4 GB VRAM in bfloat16 slm_colqwen3_1_7b_turbo = ModelMeta( loader=SLMColQwen3Wrapper, @@ -350,7 +357,11 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-2B-Instruct", +<<<<<<< HEAD training_datasets=_SLM_TRAINING_DATASETS, +======= + training_datasets={"vidore/colpali_train_set"}, +>>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -406,11 +417,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) - -# ============================================================================= -# ColLFM2 Model Metadata -# ============================================================================= - # ColLFM2-450M: ~450M params → 900 MB VRAM in bfloat16 slm_collfm2_450m = ModelMeta( loader=SLMColLFM2Wrapper, @@ -437,11 +443,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) - -# ============================================================================= -# ColMinistral3 Model Metadata -# ============================================================================= - # ColMinistral3-3B: ~3B params → 6 GB VRAM in bfloat16 slm_colministral3_3b = ModelMeta( loader=SLMColMinistral3Wrapper, diff --git a/pyproject.toml b/pyproject.toml index 2a6a1edc18..38145231a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,7 @@ nomic = ["einops>=0.8.1"] ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] colpali_engine = ["colpali_engine>=0.3.12"] colqwen3 = ["transformers>=4.57", "torchvision>=0.22.1"] -sauerkrautlm-colpali = ["sauerkrautlm-colpali @ git+https://github.com/VAGOsolutions/sauerkrautlm-colpali.git"] +sauerkrautlm-colpali = ["sauerkrautlm-colpali>=0.1.0"] xet = ["huggingface_hub>=0.32.0"] youtu = ["tencentcloud-sdk-python-common>=3.0.1454", "tencentcloud-sdk-python-lkeap>=3.0.1451"] llama-embed-nemotron = ["transformers==4.51.0"] From e3390509abeb54e0f7985bce09689a614e289676 Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Tue, 30 Dec 2025 20:26:21 +0100 Subject: [PATCH 16/20] fix: resolve merge conflicts and remove section headers --- .../model_implementations/slm_models.py | 114 ++++-------------- 1 file changed, 25 insertions(+), 89 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 3c9a00ba2c..4a9a989415 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -25,10 +25,7 @@ ) from mteb.abstasks.task_metadata import TaskMetadata from mteb.models.abs_encoder import AbsEncoder -from mteb.models.model_implementations.colpali_models import ( - COLPALI_CITATION, - COLPALI_TRAINING_DATA, -) +from mteb.models.model_implementations.colpali_models import COLPALI_CITATION from mteb.models.model_meta import ModelMeta, ScoringFunction from mteb.types import Array, BatchedInput, PromptType @@ -45,21 +42,13 @@ ] -<<<<<<< HEAD -# ============================================================================= -# Base Wrapper Class -# ============================================================================= - - -======= ->>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) class SLMBaseWrapper(AbsEncoder): """ Base wrapper for SauerkrautLM multi-vector embedding models. - + All our models use late interaction (MaxSim) for retrieval scoring. """ - + model_class = None processor_class = None model_name_prefix = "SLM" @@ -76,7 +65,7 @@ def __init__( requires_package( self, "sauerkrautlm_colpali", model_name, "pip install sauerkrautlm-colpali" ) - + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") self._load_model_and_processor(model_name, revision, use_flash_attn, **kwargs) self.mdl = self.mdl.to(self.device) @@ -98,7 +87,7 @@ def encode( ) -> Array: text_embeddings = None image_embeddings = None - + if "text" in inputs.dataset.features: text_embeddings = self.get_text_embeddings(inputs, **kwargs) if "image" in inputs.dataset.features: @@ -144,9 +133,10 @@ def get_image_embeddings( with torch.no_grad(): for batch in tqdm(images, desc="Encoding images"): from PIL import Image - imgs = [ - F.to_pil_image(b) if not isinstance(b, Image.Image) else b + F.to_pil_image(b) + if not isinstance(b, Image.Image) + else b for b in batch["image"] ] inputs = self.processor.process_images(imgs) @@ -166,7 +156,7 @@ def get_text_embeddings( **kwargs, ) -> torch.Tensor: all_embeds = [] - + with torch.no_grad(): for batch in tqdm(texts, desc="Encoding texts"): inputs = self.processor.process_queries(batch["text"]) @@ -180,37 +170,26 @@ def get_text_embeddings( return padded def calculate_probs( - self, - text_embeddings: torch.Tensor, + self, + text_embeddings: torch.Tensor, image_embeddings: torch.Tensor, ) -> torch.Tensor: scores = self.similarity(text_embeddings, image_embeddings).T return scores.softmax(dim=-1) def similarity( - self, - a: torch.Tensor | list, + self, + a: torch.Tensor | list, b: torch.Tensor | list, ) -> torch.Tensor: return self.processor.score(a, b, device=self.device) -<<<<<<< HEAD -# ============================================================================= -# ColQwen3 Wrapper -# ============================================================================= - - -======= ->>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) class SLMColQwen3Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColQwen3 models (Qwen3-VL backbone).""" def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): - from sauerkrautlm_colpali.models.qwen3.colqwen3 import ( - ColQwen3, - ColQwen3Processor, - ) + from sauerkrautlm_colpali.models.qwen3.colqwen3 import ColQwen3, ColQwen3Processor self.mdl = ColQwen3.from_pretrained( model_name, @@ -224,18 +203,10 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar model_name, revision=revision, ) - + logger.info(f"SLM-ColQwen3 loaded: dim={self.mdl.dim}, device={self.device}") -<<<<<<< HEAD -# ============================================================================= -# ColLFM2 Wrapper -# ============================================================================= - - -======= ->>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) class SLMColLFM2Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColLFM2 models (LFM2 backbone).""" @@ -253,26 +224,15 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar model_name, revision=revision, ) - + logger.info(f"SLM-ColLFM2 loaded: dim={self.mdl.dim}, device={self.device}") -<<<<<<< HEAD -# ============================================================================= -# ColMinistral3 Wrapper -# ============================================================================= - - -======= ->>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) class SLMColMinistral3Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColMinistral3 models (Ministral3 backbone).""" def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): - from sauerkrautlm_colpali.models.ministral3.colministral3 import ( - ColMinistral3, - ColMinistral3Processor, - ) + from sauerkrautlm_colpali.models.ministral3.colministral3 import ColMinistral3, ColMinistral3Processor self.mdl = ColMinistral3.from_pretrained( model_name, @@ -280,10 +240,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar ) self.processor = ColMinistral3Processor.from_pretrained(model_name) - - logger.info( - f"SLM-ColMinistral3 loaded: dim={self.mdl.dim}, device={self.device}" - ) + + logger.info(f"SLM-ColMinistral3 loaded: dim={self.mdl.dim}, device={self.device}") SAUERKRAUTLM_CITATION = """ @@ -297,19 +255,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar """ -<<<<<<< HEAD -# ============================================================================= -# ColQwen3 Model Metadata -# ============================================================================= - -_SLM_TRAINING_DATASETS = { - "MMarcoReranking", - "VDRMultilingualRetrieval", -} | COLPALI_TRAINING_DATA - -======= ->>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) -# ColQwen3-1.7B Turbo: ~1.7B params → 3.4 GB VRAM in bfloat16 slm_colqwen3_1_7b_turbo = ModelMeta( loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1", @@ -331,11 +276,10 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-2B-Instruct", - training_datasets=_SLM_TRAINING_DATASETS, + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) -# ColQwen3-2B: ~2.2B params → 4.4 GB VRAM in bfloat16 slm_colqwen3_2b = ModelMeta( loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1", @@ -357,15 +301,10 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-2B-Instruct", -<<<<<<< HEAD - training_datasets=_SLM_TRAINING_DATASETS, -======= training_datasets={"vidore/colpali_train_set"}, ->>>>>>> 32881a4 (fix: remove section headers and use PyPI package instead of Git URL) citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) -# ColQwen3-4B: ~4B params → 8 GB VRAM in bfloat16 slm_colqwen3_4b = ModelMeta( loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1", @@ -387,11 +326,10 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-4B-Instruct", - training_datasets=_SLM_TRAINING_DATASETS, + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) -# ColQwen3-8B: ~8B params → 16 GB VRAM in bfloat16 slm_colqwen3_8b = ModelMeta( loader=SLMColQwen3Wrapper, name="VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1", @@ -413,11 +351,10 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-8B-Instruct", - training_datasets=_SLM_TRAINING_DATASETS, + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) -# ColLFM2-450M: ~450M params → 900 MB VRAM in bfloat16 slm_collfm2_450m = ModelMeta( loader=SLMColLFM2Wrapper, name="VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1", @@ -430,7 +367,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar memory_usage_mb=900, max_tokens=32768, embed_dim=128, - license="https://huggingface.co/LiquidAI/LFM2-VL-450M/blob/main/LICENSE", # LiquidAI LFM 1.0 License + license="https://huggingface.co/LiquidAI/LFM2-VL-450M/blob/main/LICENSE", open_weights=True, public_training_code=None, public_training_data=None, @@ -439,11 +376,10 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="LiquidAI/LFM2-VL-450M", - training_datasets=_SLM_TRAINING_DATASETS, + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) -# ColMinistral3-3B: ~3B params → 6 GB VRAM in bfloat16 slm_colministral3_3b = ModelMeta( loader=SLMColMinistral3Wrapper, name="VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1", @@ -465,6 +401,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="mistralai/Ministral-3B-Instruct-2410", - training_datasets=_SLM_TRAINING_DATASETS, + training_datasets={"vidore/colpali_train_set"}, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) From b63607ef8f3ad45eca645bf377cc8c0cadd08269 Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Tue, 30 Dec 2025 23:02:32 +0100 Subject: [PATCH 17/20] fix: use COLPALI_TRAINING_DATA for training_datasets --- mteb/models/model_implementations/slm_models.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 4a9a989415..2074dcbae7 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -25,7 +25,7 @@ ) from mteb.abstasks.task_metadata import TaskMetadata from mteb.models.abs_encoder import AbsEncoder -from mteb.models.model_implementations.colpali_models import COLPALI_CITATION +from mteb.models.model_implementations.colpali_models import COLPALI_CITATION, COLPALI_TRAINING_DATA from mteb.models.model_meta import ModelMeta, ScoringFunction from mteb.types import Array, BatchedInput, PromptType @@ -276,7 +276,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-2B-Instruct", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=COLPALI_TRAINING_DATA, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -301,7 +301,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-2B-Instruct", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=COLPALI_TRAINING_DATA, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -326,7 +326,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-4B-Instruct", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=COLPALI_TRAINING_DATA, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -351,7 +351,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="Qwen/Qwen3-VL-8B-Instruct", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=COLPALI_TRAINING_DATA, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -376,7 +376,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="LiquidAI/LFM2-VL-450M", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=COLPALI_TRAINING_DATA, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) @@ -401,6 +401,6 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar similarity_fn_name=ScoringFunction.MAX_SIM, use_instructions=True, adapted_from="mistralai/Ministral-3B-Instruct-2410", - training_datasets={"vidore/colpali_train_set"}, + training_datasets=COLPALI_TRAINING_DATA, citation=SAUERKRAUTLM_CITATION + COLPALI_CITATION, ) From ca33bdceb91655eafde28cbc06a69260a09b09de Mon Sep 17 00:00:00 2001 From: David Golchinfar Date: Sat, 3 Jan 2026 20:43:27 +0100 Subject: [PATCH 18/20] fix: use exact n_parameters and memory_usage_mb values from HuggingFace --- .../model_implementations/slm_models.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 2074dcbae7..3805dc146a 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -263,8 +263,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar release_date="2025-12-20", modalities=["image", "text"], model_type=["late-interaction"], - n_parameters=1_700_000_000, - memory_usage_mb=3400, + n_parameters=1_756_572_288, + memory_usage_mb=3350, max_tokens=262144, embed_dim=128, license="apache-2.0", @@ -288,8 +288,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar release_date="2025-12-20", modalities=["image", "text"], model_type=["late-interaction"], - n_parameters=2_200_000_000, - memory_usage_mb=4400, + n_parameters=2_127_794_304, + memory_usage_mb=4058, max_tokens=262144, embed_dim=128, license="apache-2.0", @@ -313,8 +313,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar release_date="2025-12-20", modalities=["image", "text"], model_type=["late-interaction"], - n_parameters=4_000_000_000, - memory_usage_mb=8000, + n_parameters=4_438_143_616, + memory_usage_mb=8465, max_tokens=262144, embed_dim=128, license="apache-2.0", @@ -338,8 +338,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar release_date="2025-12-20", modalities=["image", "text"], model_type=["late-interaction"], - n_parameters=8_000_000_000, - memory_usage_mb=16000, + n_parameters=8_145_318_256, + memory_usage_mb=15536, max_tokens=262144, embed_dim=128, license="apache-2.0", @@ -363,8 +363,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar release_date="2025-12-20", modalities=["image", "text"], model_type=["late-interaction"], - n_parameters=450_000_000, - memory_usage_mb=900, + n_parameters=450_953_856, + memory_usage_mb=860, max_tokens=32768, embed_dim=128, license="https://huggingface.co/LiquidAI/LFM2-VL-450M/blob/main/LICENSE", @@ -388,8 +388,8 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar release_date="2025-12-20", modalities=["image", "text"], model_type=["late-interaction"], - n_parameters=3_000_000_000, - memory_usage_mb=6000, + n_parameters=4_252_136_448, + memory_usage_mb=8110, max_tokens=262144, embed_dim=128, license="apache-2.0", From 903cf58a6714cef2da4886c922660a46acd1468c Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 4 Jan 2026 13:12:15 +0500 Subject: [PATCH 19/20] don't build 3.14 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7274d29a48..a1820e5e22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ nomic = ["einops>=0.8.1"] ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] colpali_engine = ["colpali_engine>=0.3.12; python_full_version < '3.14'"] colqwen3 = ["transformers>=4.57", "torchvision>=0.22.1"] -sauerkrautlm-colpali = ["sauerkrautlm-colpali>=0.1.0"] +sauerkrautlm-colpali = ["sauerkrautlm-colpali>=0.1.0; python_full_version < '3.14'"] xet = ["huggingface_hub>=0.32.0"] youtu = ["tencentcloud-sdk-python-common>=3.0.1454", "tencentcloud-sdk-python-lkeap>=3.0.1451"] llama-embed-nemotron = ["transformers==4.51.0"] From 40170cd47e8fedd0ce1e27aac234aad91586d3b5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 4 Jan 2026 14:23:40 +0500 Subject: [PATCH 20/20] lint --- .../model_implementations/slm_models.py | 48 +++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/mteb/models/model_implementations/slm_models.py b/mteb/models/model_implementations/slm_models.py index 3805dc146a..a0f152c6b2 100644 --- a/mteb/models/model_implementations/slm_models.py +++ b/mteb/models/model_implementations/slm_models.py @@ -25,7 +25,10 @@ ) from mteb.abstasks.task_metadata import TaskMetadata from mteb.models.abs_encoder import AbsEncoder -from mteb.models.model_implementations.colpali_models import COLPALI_CITATION, COLPALI_TRAINING_DATA +from mteb.models.model_implementations.colpali_models import ( + COLPALI_CITATION, + COLPALI_TRAINING_DATA, +) from mteb.models.model_meta import ModelMeta, ScoringFunction from mteb.types import Array, BatchedInput, PromptType @@ -45,10 +48,10 @@ class SLMBaseWrapper(AbsEncoder): """ Base wrapper for SauerkrautLM multi-vector embedding models. - + All our models use late interaction (MaxSim) for retrieval scoring. """ - + model_class = None processor_class = None model_name_prefix = "SLM" @@ -65,7 +68,7 @@ def __init__( requires_package( self, "sauerkrautlm_colpali", model_name, "pip install sauerkrautlm-colpali" ) - + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") self._load_model_and_processor(model_name, revision, use_flash_attn, **kwargs) self.mdl = self.mdl.to(self.device) @@ -87,7 +90,7 @@ def encode( ) -> Array: text_embeddings = None image_embeddings = None - + if "text" in inputs.dataset.features: text_embeddings = self.get_text_embeddings(inputs, **kwargs) if "image" in inputs.dataset.features: @@ -133,10 +136,9 @@ def get_image_embeddings( with torch.no_grad(): for batch in tqdm(images, desc="Encoding images"): from PIL import Image + imgs = [ - F.to_pil_image(b) - if not isinstance(b, Image.Image) - else b + F.to_pil_image(b) if not isinstance(b, Image.Image) else b for b in batch["image"] ] inputs = self.processor.process_images(imgs) @@ -156,7 +158,7 @@ def get_text_embeddings( **kwargs, ) -> torch.Tensor: all_embeds = [] - + with torch.no_grad(): for batch in tqdm(texts, desc="Encoding texts"): inputs = self.processor.process_queries(batch["text"]) @@ -170,16 +172,16 @@ def get_text_embeddings( return padded def calculate_probs( - self, - text_embeddings: torch.Tensor, + self, + text_embeddings: torch.Tensor, image_embeddings: torch.Tensor, ) -> torch.Tensor: scores = self.similarity(text_embeddings, image_embeddings).T return scores.softmax(dim=-1) def similarity( - self, - a: torch.Tensor | list, + self, + a: torch.Tensor | list, b: torch.Tensor | list, ) -> torch.Tensor: return self.processor.score(a, b, device=self.device) @@ -189,7 +191,10 @@ class SLMColQwen3Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColQwen3 models (Qwen3-VL backbone).""" def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): - from sauerkrautlm_colpali.models.qwen3.colqwen3 import ColQwen3, ColQwen3Processor + from sauerkrautlm_colpali.models.qwen3.colqwen3 import ( + ColQwen3, + ColQwen3Processor, + ) self.mdl = ColQwen3.from_pretrained( model_name, @@ -203,7 +208,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar model_name, revision=revision, ) - + logger.info(f"SLM-ColQwen3 loaded: dim={self.mdl.dim}, device={self.device}") @@ -224,7 +229,7 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar model_name, revision=revision, ) - + logger.info(f"SLM-ColLFM2 loaded: dim={self.mdl.dim}, device={self.device}") @@ -232,7 +237,10 @@ class SLMColMinistral3Wrapper(SLMBaseWrapper): """Wrapper for SLM-ColMinistral3 models (Ministral3 backbone).""" def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwargs): - from sauerkrautlm_colpali.models.ministral3.colministral3 import ColMinistral3, ColMinistral3Processor + from sauerkrautlm_colpali.models.ministral3.colministral3 import ( + ColMinistral3, + ColMinistral3Processor, + ) self.mdl = ColMinistral3.from_pretrained( model_name, @@ -240,8 +248,10 @@ def _load_model_and_processor(self, model_name, revision, use_flash_attn, **kwar ) self.processor = ColMinistral3Processor.from_pretrained(model_name) - - logger.info(f"SLM-ColMinistral3 loaded: dim={self.mdl.dim}, device={self.device}") + + logger.info( + f"SLM-ColMinistral3 loaded: dim={self.mdl.dim}, device={self.device}" + ) SAUERKRAUTLM_CITATION = """