Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 110 additions & 10 deletions mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import TYPE_CHECKING, Any

import torch
from packaging.version import Version
from packaging.specifiers import SpecifierSet
from torch.utils.data import DataLoader
from transformers import __version__ as transformers_version

Expand Down Expand Up @@ -31,18 +31,20 @@ def __init__(
model_name_or_path: str,
revision: str,
trust_remote_code: bool,
transformers_version_constraint: str | None = None,
device_map="cuda",
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
**kwargs,
):
required_transformers_version = "4.49.0"

if Version(transformers_version) != Version(required_transformers_version):
raise RuntimeError(
f"transformers version {transformers_version} is not match with required "
f"install version {required_transformers_version} to run `nvidia/llama-nemoretriever-colembed`"
)
if transformers_version_constraint is not None:
spec = SpecifierSet(transformers_version_constraint)
if transformers_version not in spec:
raise RuntimeError(
f"Model `{model_name_or_path}` requires transformers{transformers_version_constraint}, "
f"but {transformers_version} is installed. "
f"Run: pip install 'transformers{transformers_version_constraint}'"
)

from transformers import AutoModel

Expand Down Expand Up @@ -150,10 +152,24 @@ def encode(
"wiki-ss-nq",
}


# Dataset identifiers the v2 Nemotron ColEmbed checkpoints report as their
# training corpora (kept as a set: membership matters, order does not).
# Sorted here purely for readability.
TRAINING_DATA_v2 = {
    "VDRMultilingualRetrieval",
    "VidoreArxivQARetrieval",
    "VidoreDocVQARetrieval",
    "VidoreInfoVQARetrieval",
    "VidoreTatdqaRetrieval",
    "VisRAG-Ret-Train-In-domain-data",
    "VisRAG-Ret-Train-Synthetic-data",
    "docmatix-ir",
    "wiki-ss-nq",
}

llama_nemoretriever_colembed_1b_v1 = ModelMeta(
loader=LlamaNemoretrieverColembed,
loader_kwargs=dict(
trust_remote_code=True,
transformers_version_constraint="==4.49.0",
),
name="nvidia/llama-nemoretriever-colembed-1b-v1",
model_type=["late-interaction"],
Expand All @@ -168,7 +184,7 @@ def encode(
embed_dim=2048,
license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
open_weights=True,
public_training_code="Proprietary Code",
public_training_code=None,
public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
framework=["PyTorch", "Transformers", "safetensors"],
reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1",
Expand All @@ -182,6 +198,7 @@ def encode(
loader=LlamaNemoretrieverColembed,
loader_kwargs=dict(
trust_remote_code=True,
transformers_version_constraint="==4.49.0",
),
name="nvidia/llama-nemoretriever-colembed-3b-v1",
model_type=["late-interaction"],
Expand All @@ -196,7 +213,7 @@ def encode(
embed_dim=3072,
license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
open_weights=True,
public_training_code="Proprietary Code",
public_training_code=None,
public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
framework=["PyTorch", "Transformers", "safetensors"],
reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1",
Expand All @@ -205,3 +222,86 @@ def encode(
training_datasets=TRAINING_DATA,
citation=LLAMA_NEMORETRIEVER_CITATION,
)

# Metadata for nvidia/llama-nemotron-colembed-vl-3b-v2, a late-interaction
# (MaxSim) vision-language retriever. The loader pins transformers to 4.49.0
# via transformers_version_constraint and requires trusting remote code.
llama_nemotron_colembed_vl_3b_v2 = ModelMeta(
    # --- loading ---
    loader=LlamaNemoretrieverColembed,
    loader_kwargs={
        "trust_remote_code": True,
        "transformers_version_constraint": "==4.49.0",
    },
    # --- identity & provenance ---
    name="nvidia/llama-nemotron-colembed-vl-3b-v2",
    revision="75f03c712cb3a252e062295f9a0966e5d95d6156",
    release_date="2026-01-21",
    reference="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2",
    license="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2/blob/main/LICENSE",
    citation=LLAMA_NEMORETRIEVER_CITATION,
    # --- model characteristics ---
    model_type=["late-interaction"],
    modalities=["image", "text"],
    languages=["eng-Latn"],
    n_parameters=4_407_000_000,
    memory_usage_mb=8403,
    max_tokens=8192,
    embed_dim=3072,
    similarity_fn_name="MaxSim",
    use_instructions=True,
    # --- openness & training ---
    open_weights=True,
    public_training_code=None,
    public_training_data="https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2#training-dataset",
    framework=["PyTorch", "Transformers", "safetensors"],
    training_datasets=TRAINING_DATA,
)

# Metadata for nvidia/nemotron-colembed-vl-4b-v2. This v2 checkpoint pins
# transformers to the 5.0.0rc0 pre-release via the loader's
# transformers_version_constraint and uses the v2 training-data set.
nemotron_colembed_vl_4b_v2 = ModelMeta(
    # --- loading ---
    loader=LlamaNemoretrieverColembed,
    loader_kwargs={
        "trust_remote_code": True,
        "transformers_version_constraint": "==5.0.0rc0",
    },
    # --- identity & provenance ---
    name="nvidia/nemotron-colembed-vl-4b-v2",
    revision="823b1625c15fe3da73fa094205e538a7a2301a2a",
    release_date="2026-01-07",
    reference="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2",
    license="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2/blob/main/LICENSE",
    citation=LLAMA_NEMORETRIEVER_CITATION,
    # --- model characteristics ---
    modalities=["image", "text"],
    languages=["eng-Latn"],
    n_parameters=4_800_000_000,
    memory_usage_mb=9206,
    max_tokens=262144,
    embed_dim=2560,
    similarity_fn_name="MaxSim",
    use_instructions=True,
    # --- openness & training ---
    open_weights=True,
    public_training_code=None,
    public_training_data="https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2#training-dataset",
    framework=["PyTorch", "Transformers"],
    training_datasets=TRAINING_DATA_v2,
)


# Metadata for nvidia/nemotron-colembed-vl-8b-v2 — the larger sibling of the
# 4b-v2 entry above; same loader pin (transformers ==5.0.0rc0) and the same
# v2 training-data set, differing only in size-related fields.
nemotron_colembed_vl_8b_v2 = ModelMeta(
    # --- loading ---
    loader=LlamaNemoretrieverColembed,
    loader_kwargs={
        "trust_remote_code": True,
        "transformers_version_constraint": "==5.0.0rc0",
    },
    # --- identity & provenance ---
    name="nvidia/nemotron-colembed-vl-8b-v2",
    revision="6cbe43579dda6237768fc373768ad372cc5cdfec",
    release_date="2026-01-07",
    reference="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2",
    license="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2/blob/main/LICENSE",
    citation=LLAMA_NEMORETRIEVER_CITATION,
    # --- model characteristics ---
    modalities=["image", "text"],
    languages=["eng-Latn"],
    n_parameters=8_700_000_000,
    memory_usage_mb=16722,
    max_tokens=262144,
    embed_dim=4096,
    similarity_fn_name="MaxSim",
    use_instructions=True,
    # --- openness & training ---
    open_weights=True,
    public_training_code=None,
    public_training_data="https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2#training-dataset",
    framework=["PyTorch", "Transformers"],
    training_datasets=TRAINING_DATA_v2,
)