Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 20 additions & 19 deletions docs/tasks.md

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions docs/usage/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -456,12 +456,15 @@ There are times you may want to cache the embeddings so you can re-use them. Thi

```python
# define your task(s) and model above as normal
...
task = mteb.get_task("LccSentimentClassification")
model = mteb.get_model("minishlab/M2V_base_glove_subword")
evaluation = mteb.MTEB(tasks=[task])

# wrap the model with the cache wrapper
from mteb.models.cache_wrapper import CachedEmbeddingWrapper
model_with_cached_emb = CachedEmbeddingWrapper(model, cache_path='<path_to_cache_dir>')
model_with_cached_emb = CachedEmbeddingWrapper(model, cache_path='path_to_cache_dir')
# run as normal
evaluation.run(model, ...)
evaluation.run(model_with_cached_emb)
```

If you want to directly access the cached embeddings (e.g. for subsequent analyses) follow this example:
Expand All @@ -471,8 +474,8 @@ import numpy as np
from mteb.models.cache_wrapper import TextVectorMap

# Access the memory-mapped file and convert to array
vector_map = TextVectorMap("<path_to_cache_dir>/AppsRetrieval")
vector_map.load(name="AppsRetrieval")
vector_map = TextVectorMap("path_to_cache_dir/LccSentimentClassification")
vector_map.load(name="LccSentimentClassification")
vectors = np.asarray(vector_map.vectors)

# Remove all "placeholders" in the embedding cache
Expand Down
2 changes: 1 addition & 1 deletion mteb/leaderboard/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def apply_styling(
joint_table[score_columns] = joint_table[score_columns].map(format_scores)
joint_table_style = joint_table.style.format(
{
**{column: "{:.2f}" for column in score_columns},
**dict.fromkeys(score_columns, "{:.2f}"),
"Rank (Borda)": "{:.0f}",
},
na_rep="",
Expand Down
33 changes: 22 additions & 11 deletions mteb/models/cache_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,8 @@ def load(self, name: str | None = None) -> None:
self.vectors = np.memmap(
self.vectors_file, dtype="float32", mode="r+"
)
self.vectors = self.vectors.reshape(-1, self.vector_dim)
logger.info(f"Loaded vectors file with shape: {self.vectors.shape}")
self.vectors = self.vectors.reshape(-1, self.vector_dim) # type: ignore
logger.info(f"Loaded vectors file with shape: {self.vectors.shape}") # type: ignore
else:
logger.warning(
"Vector dimension not set. Unable to load vectors file."
Expand Down Expand Up @@ -214,22 +214,30 @@ def __init__(self, model: Encoder, cache_path: str | Path):
logger.info("Initialized CachedEmbeddingWrapper")

def encode(
self, texts: list[str], batch_size: int = 32, task_name: str = None, **kwargs
self,
texts: list[str],
batch_size: int = 32,
task_name: str | None = None,
**kwargs,
) -> np.ndarray:
"""Encode texts using the wrapped model, with caching"""
_task_name = task_name or "no_task_name"

try:
results = []
uncached_texts = []
uncached_indices = []

# Initialize cache
if task_name not in self.cache_dict:
self.cache_dict[task_name] = TextVectorMap(self.cache_path / task_name)
self.cache_dict[task_name].load(name=task_name)
if _task_name not in self.cache_dict:
self.cache_dict[_task_name] = TextVectorMap(
self.cache_path / _task_name
)
self.cache_dict[_task_name].load(name=_task_name)

# Check cache for each text
for i, text in enumerate(texts):
vector = self.cache_dict[task_name].get_vector(text)
vector = self.cache_dict[_task_name].get_vector(text)
if vector is not None:
results.append(vector)
else:
Expand All @@ -240,16 +248,19 @@ def encode(
if uncached_texts:
logger.info(f"Encoding {len(uncached_texts)} new texts")
new_vectors = self._model.encode(
uncached_texts, batch_size=batch_size, **kwargs
uncached_texts,
batch_size=batch_size,
task_name=task_name, # type: ignore
**kwargs,
)
if isinstance(new_vectors, torch.Tensor):
new_vectors = new_vectors.cpu().numpy()

# Add new vectors to cache
for text, vector in zip(uncached_texts, new_vectors):
self.cache_dict[task_name].add(text, vector)
self.cache_dict[_task_name].add(text, vector)
results.extend(new_vectors)
self.cache_dict[task_name].save()
self.cache_dict[_task_name].save()
else:
logger.info("All texts found in cache")

Expand Down Expand Up @@ -287,7 +298,7 @@ def __getattr__(self, name: str) -> Any:

def __dir__(self) -> list[str]:
"""Return all attributes from both this class and the wrapped model"""
return list(set(super().__dir__() + dir(self._model)))
return list(set(super().__dir__() + dir(self._model))) # type: ignore

def __del__(self):
self.close()
Expand Down
3 changes: 2 additions & 1 deletion mteb/models/cadet_models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from __future__ import annotations

from mteb.model_meta import ModelMeta, sentence_transformers_loader
from mteb.model_meta import ModelMeta
from mteb.models.bge_models import bge_m3_training_data
from mteb.models.sentence_transformers_models import sentence_transformers_loader

cadet_training_data = {
# we train with the corpora of FEVER, MSMARCO, and DBPEDIA. We only train with synthetic generated queries.
Expand Down
2 changes: 2 additions & 0 deletions mteb/models/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
promptriever_models,
qodo_models,
qtack_models,
qwen3_models,
repllama_models,
rerankers_custom,
rerankers_monot5_based,
Expand Down Expand Up @@ -144,6 +145,7 @@
promptriever_models,
qodo_models,
qtack_models,
qwen3_models,
repllama_models,
rerankers_custom,
rerankers_monot5_based,
Expand Down
186 changes: 186 additions & 0 deletions mteb/models/qwen3_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
from __future__ import annotations

from mteb.encoder_interface import Encoder, PromptType
from mteb.model_meta import ModelMeta
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel


def instruction_template(
    instruction: str | dict[PromptType, str],
    prompt_type: PromptType | None = None,
) -> str:
    """Format a task instruction for the Qwen3-Embedding models.

    Passages are embedded without any instruction; only queries receive the
    "Instruct: ...\\nQuery:" prefix used by Qwen3-Embedding.

    Args:
        instruction: The task instruction, either as a plain string or as a
            mapping from prompt type to instruction. (The body already handled
            dicts; the annotation now reflects that.)
        prompt_type: Whether the text being encoded is a query or a passage.

    Returns:
        The formatted instruction prefix, or an empty string for passages and
        for empty/missing instructions.
    """
    if not instruction or prompt_type == PromptType.passage:
        return ""
    if isinstance(instruction, dict):
        if prompt_type is None:
            # No prompt type given: fall back to the first instruction in the
            # mapping (insertion order). TODO: choose per prompt type instead.
            instruction = next(iter(instruction.values()))
        else:
            instruction = instruction[prompt_type]
    return f"Instruct: {instruction}\nQuery:"


# Language/script codes (ISO 639-3 + ISO 15924) covered by the multilingual
# Qwen3-Embedding checkpoints.
multilingual_langs = """
    afr-Latn ara-Arab aze-Latn bel-Cyrl bul-Cyrl ben-Beng cat-Latn ceb-Latn
    ces-Latn cym-Latn dan-Latn deu-Latn ell-Grek eng-Latn spa-Latn est-Latn
    eus-Latn fas-Arab fin-Latn fra-Latn glg-Latn guj-Gujr heb-Hebr hin-Deva
    hrv-Latn hat-Latn hun-Latn hye-Armn ind-Latn isl-Latn ita-Latn jpn-Jpan
    jav-Latn kat-Geor kaz-Cyrl khm-Khmr kan-Knda kor-Hang kir-Cyrl lao-Laoo
    lit-Latn lav-Latn mkd-Cyrl mal-Mlym mon-Cyrl mar-Deva msa-Latn mya-Mymr
    nep-Deva nld-Latn nor-Latn pan-Guru pol-Latn por-Latn que-Latn ron-Latn
    rus-Cyrl sin-Sinh slk-Latn slv-Latn swa-Latn tam-Taml tel-Telu tha-Thai
    tgl-Latn tur-Latn ukr-Cyrl urd-Arab vie-Latn yor-Latn zho-Hans
""".split()

# MTEB tasks whose train splits were used to train the Qwen3-Embedding models.
_QWEN3_TRAIN_TASKS = (
    "T2Retrieval",
    "DuRetrieval",
    "MMarcoReranking",
    "CMedQAv2-reranking",
    "NQ",
    "MSMARCO",
    "HotpotQA",
    "FEVER",
    "MrTidyRetrieval",
    "MIRACLRetrieval",
    "CodeSearchNet",
)
# Each task maps to its own ["train"] list (the comprehension builds a fresh
# list per key, so entries are not aliased to one shared object).
training_data = {task: ["train"] for task in _QWEN3_TRAIN_TASKS}


def q3e_instruct_loader(model_name_or_path: str, revision: str, **kwargs) -> Encoder:
    """Load a Qwen3-Embedding checkpoint wrapped for instruction-style encoding.

    Queries receive the ``instruction_template`` prefix; passages are encoded
    without instructions. When the underlying model runs with
    flash_attention_2, the tokenizer is switched to left padding, since the
    Qwen3 code only uses left padding in flash_attention_2 mode.
    """
    wrapper = InstructSentenceTransformerModel(
        model_name_or_path,
        revision=revision,
        instruction_template=instruction_template,
        apply_instruction_to_passages=False,
        **kwargs,
    )
    transformer = wrapper.model._first_module()
    uses_fa2 = (
        transformer.auto_model.config._attn_implementation == "flash_attention_2"
    )
    if uses_fa2:
        transformer.tokenizer.padding_side = "left"
    return wrapper


# Metadata for the three Qwen3-Embedding checkpoints (0.6B / 4B / 8B).
# All three share the same loader, language coverage, training data, license,
# and 32k-token context; they differ in parameter count, memory footprint,
# and embedding dimension.
Qwen3_Embedding_0B6 = ModelMeta(
    loader=q3e_instruct_loader,
    name="Qwen/Qwen3-Embedding-0.6B",
    languages=multilingual_langs,
    open_weights=True,
    revision="b22da495047858cce924d27d76261e96be6febc0",  # Commit of @tomaarsen
    release_date="2025-06-05",
    n_parameters=595776512,
    memory_usage_mb=2272,
    embed_dim=1024,
    max_tokens=32768,
    license="apache-2.0",
    reference="https://huggingface.co/Qwen/Qwen3-Embedding-0.6B",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    public_training_code=None,
    public_training_data=None,
    training_datasets=training_data,
)

Qwen3_Embedding_4B = ModelMeta(
    loader=q3e_instruct_loader,
    name="Qwen/Qwen3-Embedding-4B",
    languages=multilingual_langs,
    open_weights=True,
    revision="636cd9bf47d976946cdbb2b0c3ca0cb2f8eea5ff",  # Commit of @tomaarsen
    release_date="2025-06-05",
    n_parameters=4021774336,
    memory_usage_mb=15341,
    embed_dim=2560,
    max_tokens=32768,
    license="apache-2.0",
    reference="https://huggingface.co/Qwen/Qwen3-Embedding-4B",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    public_training_code=None,
    public_training_data=None,
    training_datasets=training_data,
)

Qwen3_Embedding_8B = ModelMeta(
    loader=q3e_instruct_loader,
    name="Qwen/Qwen3-Embedding-8B",
    languages=multilingual_langs,
    open_weights=True,
    revision="4e423935c619ae4df87b646a3ce949610c66241c",  # Commit of @tomaarsen
    release_date="2025-06-05",
    n_parameters=7567295488,
    memory_usage_mb=28866,
    embed_dim=4096,
    max_tokens=32768,
    license="apache-2.0",
    reference="https://huggingface.co/Qwen/Qwen3-Embedding-8B",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    public_training_code=None,
    public_training_data=None,
    training_datasets=training_data,
)
Loading
Loading