diff --git a/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py b/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
index f3ac386871..290bdc3897 100644
--- a/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
+++ b/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING, Any
 
 import torch
-from packaging.version import Version
+from packaging.specifiers import SpecifierSet
 from torch.utils.data import DataLoader
 from transformers import __version__ as transformers_version
 
@@ -31,18 +31,20 @@ def __init__(
         model_name_or_path: str,
         revision: str,
         trust_remote_code: bool,
+        transformers_version_constraint: str | None = None,
         device_map="cuda",
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
         **kwargs,
     ):
-        required_transformers_version = "4.49.0"
-
-        if Version(transformers_version) != Version(required_transformers_version):
-            raise RuntimeError(
-                f"transformers version {transformers_version} is not match with required "
-                f"install version {required_transformers_version} to run `nvidia/llama-nemoretriever-colembed`"
-            )
+        if transformers_version_constraint is not None:
+            spec = SpecifierSet(transformers_version_constraint)
+            if transformers_version not in spec:
+                raise RuntimeError(
+                    f"Model `{model_name_or_path}` requires transformers{transformers_version_constraint}, "
+                    f"but {transformers_version} is installed. "
+                    f"Run: pip install 'transformers{transformers_version_constraint}'"
+                )
 
         from transformers import AutoModel
 
@@ -150,10 +152,24 @@ def encode(
     "wiki-ss-nq",
 }
 
+
+TRAINING_DATA_v2 = {
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
+    "docmatix-ir",
+    "VDRMultilingualRetrieval",
+    "VisRAG-Ret-Train-Synthetic-data",
+    "VisRAG-Ret-Train-In-domain-data",
+    "wiki-ss-nq",
+}
+
 llama_nemoretriever_colembed_1b_v1 = ModelMeta(
     loader=LlamaNemoretrieverColembed,
     loader_kwargs=dict(
         trust_remote_code=True,
+        transformers_version_constraint="==4.49.0",
     ),
     name="nvidia/llama-nemoretriever-colembed-1b-v1",
     model_type=["late-interaction"],
@@ -168,7 +184,7 @@ def encode(
     embed_dim=2048,
     license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
     open_weights=True,
-    public_training_code="Proprietary Code",
+    public_training_code=None,
     public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
     framework=["PyTorch", "Transformers", "safetensors"],
     reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1",
@@ -182,6 +198,7 @@ def encode(
     loader=LlamaNemoretrieverColembed,
     loader_kwargs=dict(
         trust_remote_code=True,
+        transformers_version_constraint="==4.49.0",
     ),
     name="nvidia/llama-nemoretriever-colembed-3b-v1",
     model_type=["late-interaction"],
@@ -196,7 +213,7 @@ def encode(
     embed_dim=3072,
     license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
     open_weights=True,
-    public_training_code="Proprietary Code",
+    public_training_code=None,
     public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
     framework=["PyTorch", "Transformers", "safetensors"],
     reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1",
@@ -205,3 +222,86 @@ def encode(
     training_datasets=TRAINING_DATA,
     citation=LLAMA_NEMORETRIEVER_CITATION,
 )
+
+llama_nemotron_colembed_vl_3b_v2 = ModelMeta(
+    loader=LlamaNemoretrieverColembed,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        transformers_version_constraint="==4.49.0",
+    ),
+    name="nvidia/llama-nemotron-colembed-vl-3b-v2",
+    model_type=["late-interaction"],
+    languages=["eng-Latn"],
+    revision="75f03c712cb3a252e062295f9a0966e5d95d6156",
+    release_date="2026-01-21",
+    modalities=["image", "text"],
+    n_parameters=4_407_000_000,
+    memory_usage_mb=8403,
+    max_tokens=8192,
+    embed_dim=3072,
+    license="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2/blob/main/LICENSE",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2#training-dataset",
+    framework=["PyTorch", "Transformers", "safetensors"],
+    reference="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2",
+    similarity_fn_name="MaxSim",
+    use_instructions=True,
+    training_datasets=TRAINING_DATA,
+    citation=LLAMA_NEMORETRIEVER_CITATION,
+)
+
+nemotron_colembed_vl_4b_v2 = ModelMeta(
+    loader=LlamaNemoretrieverColembed,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        transformers_version_constraint="==5.0.0rc0",
+    ),
+    name="nvidia/nemotron-colembed-vl-4b-v2",
+    revision="823b1625c15fe3da73fa094205e538a7a2301a2a",
+    languages=["eng-Latn"],
+    release_date="2026-01-07",
+    modalities=["image", "text"],
+    n_parameters=4_800_000_000,
+    memory_usage_mb=9206,
+    max_tokens=262144,
+    embed_dim=2560,
+    license="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2/blob/main/LICENSE",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2#training-dataset",
+    framework=["PyTorch", "Transformers"],
+    reference="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2",
+    similarity_fn_name="MaxSim",
+    use_instructions=True,
+    training_datasets=TRAINING_DATA_v2,
+    citation=LLAMA_NEMORETRIEVER_CITATION,
+)
+
+
+nemotron_colembed_vl_8b_v2 = ModelMeta(
+    loader=LlamaNemoretrieverColembed,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+        transformers_version_constraint="==5.0.0rc0",
+    ),
+    name="nvidia/nemotron-colembed-vl-8b-v2",
+    revision="6cbe43579dda6237768fc373768ad372cc5cdfec",
+    languages=["eng-Latn"],
+    release_date="2026-01-07",
+    modalities=["image", "text"],
+    n_parameters=8_700_000_000,
+    memory_usage_mb=16722,
+    max_tokens=262144,
+    embed_dim=4096,
+    license="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2/blob/main/LICENSE",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2#training-dataset",
+    framework=["PyTorch", "Transformers"],
+    reference="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2",
+    similarity_fn_name="MaxSim",
+    use_instructions=True,
+    training_datasets=TRAINING_DATA_v2,
+    citation=LLAMA_NEMORETRIEVER_CITATION,
+)
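
The core change above replaces an exact-equality `Version` comparison with a per-model PEP 440 constraint checked via `packaging.specifiers.SpecifierSet`. A minimal standalone sketch of that containment check (constraint and version strings chosen for illustration, not taken from the patch):

```python
from packaging.specifiers import SpecifierSet

# Same containment test the loader performs: a PEP 440 specifier
# set decides whether the installed version satisfies the pin.
spec = SpecifierSet("==4.49.0")
print("4.49.0" in spec)   # True  -> loader proceeds
print("4.50.0" in spec)   # False -> loader raises RuntimeError with a pip hint

# A pre-release pin such as "==5.0.0rc0" matches, because a specifier
# that itself names a pre-release admits pre-release versions.
rc_spec = SpecifierSet("==5.0.0rc0")
print("5.0.0rc0" in rc_spec)  # True
```

This keeps the check opt-in (`transformers_version_constraint=None` skips it entirely) while letting the v1 models pin `==4.49.0` and the v2 models pin `==5.0.0rc0` through `loader_kwargs`, rather than hard-coding a single version in `__init__`.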