diff --git a/mteb/models/no_instruct_sentence_models.py b/mteb/models/no_instruct_sentence_models.py
new file mode 100644
index 0000000000..019cfe7e04
--- /dev/null
+++ b/mteb/models/no_instruct_sentence_models.py
@@ -0,0 +1,103 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+from mteb.models.wrapper import Wrapper
+
+from .utils import batched
+
+
+class NoInstructWrapper(Wrapper):  # mteb wrapper for the NoInstruct embedding model (no instruction prefixes)
+    def __init__(
+        self,
+        model_name: str,  # HF hub id used for both model and tokenizer
+        revision: str,  # pinned HF revision for reproducibility
+        model_prompts: dict[str, str] | None = None,  # NOTE(review): accepted but never used in this wrapper — confirm intended
+        **kwargs: Any,
+    ):
+        self.model_name = model_name
+        device = kwargs.pop("device", None)  # pop so transformers does not receive an unknown kwarg
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = AutoModel.from_pretrained(
+            model_name, revision=revision, **kwargs
+        ).to(self.device)
+        self.model.eval()  # inference mode: disables dropout etc.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name, revision=revision, **kwargs
+        )
+
+    def encode(  # type: ignore
+        self,
+        sentences: list[str],  # texts to embed
+        *,
+        task_name: str,  # required by the mteb Encoder interface; unused here
+        prompt_type: PromptType | None = None,  # query vs. document — selects the pooling strategy below
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        embeddings = []  # one np.ndarray of pooled vectors per batch
+        for batch in batched(sentences, batch_size):
+            # Tokenize the batch
+            encoding = self.tokenizer(
+                batch,
+                padding=True,
+                truncation=True,
+                return_tensors="pt",
+                return_attention_mask=True,  # mask is needed for mean pooling below
+            ).to(self.device)
+
+            input_ids = encoding["input_ids"]
+            attention_mask = encoding["attention_mask"]
+
+            # Forward pass
+            with torch.no_grad():
+                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+
+            # The model is optimized to use the mean pooling for queries,
+            # while the sentence / document embedding uses the [CLS] representation. 
+            if prompt_type == PromptType.query:
+                # Mean pooling
+                vectors = outputs.last_hidden_state * attention_mask.unsqueeze(2)  # zero out padding positions
+                pooled_vectors = vectors.sum(dim=1) / attention_mask.sum(
+                    dim=-1, keepdim=True
+                )
+            else:
+                # [CLS] token representation
+                pooled_vectors = outputs.last_hidden_state[:, 0, :]
+
+            # Append pooled vectors to result
+            embeddings.append(pooled_vectors.cpu().detach().numpy())
+
+        return np.concatenate(embeddings, axis=0)  # NOTE(review): raises ValueError when sentences is empty — confirm callers never pass []
+
+
+no_instruct_small_v0 = ModelMeta(
+    loader=partial(
+        NoInstructWrapper,
+        model_name="avsolatorio/NoInstruct-small-Embedding-v0",
+        revision="b38747000553d8268915c95a55fc87e707c9aadd",
+    ),
+    name="avsolatorio/NoInstruct-small-Embedding-v0",
+    languages=["eng-Latn"],
+    open_weights=True,
+    revision="b38747000553d8268915c95a55fc87e707c9aadd",
+    release_date="2024-05-01",  # first commit
+    n_parameters=33_400_000,
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=384,
+    license="mit",
+    reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    adapted_from=None,
+    superseded_by=None,
+)
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index 6f2cc8ed76..5a3cfe757f 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -29,6 +29,7 @@
     misc_models,
     model2vec_models,
     mxbai_models,
+    no_instruct_sentence_models,
     nomic_models,
     nvidia_models,
     openai_models,
@@ -66,6 +67,7 @@
     model2vec_models,
     misc_models,
     nomic_models,
+    no_instruct_sentence_models,
     nvidia_models,
     openai_models,
     promptriever_models,
diff --git a/mteb/models/utils.py b/mteb/models/utils.py
new file mode 100644
index 0000000000..78d778eb7f
--- /dev/null
+++ b/mteb/models/utils.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from itertools import islice
+
+
+# https://docs.python.org/3/library/itertools.html#itertools.batched
+# Added in version 3.12. 
from collections.abc import Iterator

def batched(iterable, n: int, *, strict: bool = False) -> Iterator[tuple]:
    """Yield successive tuples of at most *n* items from *iterable*.

    Backport of ``itertools.batched`` (added in Python 3.12).

    batched('ABCDEFG', 3) → ('A','B','C') ('D','E','F') ('G',)

    Args:
        iterable: Source of items to batch.
        n: Maximum batch size; must be at least one.
        strict: When True, raise if the final batch is shorter than *n*.

    Yields:
        Tuples of consecutive items; the last tuple may be shorter than *n*
        unless ``strict`` is set.

    Raises:
        ValueError: If ``n < 1``, or (with ``strict=True``) when the final
            batch is incomplete.
    """
    # Fix over original: the return annotation was `-> tuple`, but this is a
    # generator function, so its return value is an iterator of tuples.
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    # The walrus loop stops naturally when islice yields an empty tuple.
    while batch := tuple(islice(iterator, n)):
        if strict and len(batch) != n:
            raise ValueError("batched(): incomplete batch")
        yield batch