diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py
index 62fb40bdc9..248449e5e4 100644
--- a/mteb/models/nomic_models.py
+++ b/mteb/models/nomic_models.py
@@ -7,17 +7,17 @@
 import numpy as np
 import torch
 import torch.nn.functional as F
-from sentence_transformers import SentenceTransformer
+import mteb
 
 from mteb.encoder_interface import PromptType
 from mteb.model_meta import ModelMeta
 
-from .wrapper import Wrapper
+from .sentence_transformer_wrapper import SentenceTransformerWrapper
 
 logger = logging.getLogger(__name__)
 
 
-class NomicWrapper(Wrapper):
+class NomicWrapper(SentenceTransformerWrapper):
     """following the hf model card documentation."""
 
     def __init__(
@@ -28,10 +28,7 @@ def __init__(
         **kwargs: Any,
     ):
         self.model_name = model_name
-        self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
-        self.model_prompts = (
-            self.validate_task_to_prompt_name(model_prompts) if model_prompts else None
-        )
+        super().__init__(model_name, revision, model_prompts, **kwargs)
 
     def to(self, device: torch.device) -> None:
         self.model.to(device)
@@ -45,33 +42,51 @@ def encode(  # type: ignore
         batch_size: int = 32,
         **kwargs: Any,
     ) -> np.ndarray:
-        input_type = self.get_prompt_name(self.model_prompts, task_name, prompt_type)
-        # default to search_document if input_type and prompt_name are not provided
-        if input_type is None:
-            input_type = "search_document"
-
-        sentences = [f"{input_type}: {sentence}" for sentence in sentences]
-
-        emb = self.model.encode(sentences, batch_size=batch_size, **kwargs)
+        prompt_name = (
+            self.get_prompt_name(self.model_prompts, task_name, prompt_type)
+            or PromptType.passage.value
+        )
+        task = mteb.get_task(task_name)
+        # normalization not applied to classification
+        # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L172
+        normalize = task.metadata.type not in (
+            "Classification",
+            "MultilabelClassification",
+            "PairClassification",
+            "Reranking",
+            "STS",
+            "Summarization",
+        )
+        emb = self.model.encode(
+            sentences,
+            prompt_name=prompt_name,
+            batch_size=batch_size,
+            **kwargs,
+        )
 
         # v1.5 has a non-trainable layer norm to unit normalize the embeddings for binary quantization
         # the outputs are similar to if we just normalized but keeping the same for consistency
         if self.model_name == "nomic-ai/nomic-embed-text-v1.5":
             if not isinstance(emb, torch.Tensor):
                 emb = torch.tensor(emb)
             emb = F.layer_norm(emb, normalized_shape=(emb.shape[1],))
-            emb = F.normalize(emb, p=2, dim=1)
+            if normalize:
+                emb = F.normalize(emb, p=2, dim=1)
 
         if isinstance(emb, torch.Tensor):
             emb = emb.cpu().detach().float().numpy()
-
         return emb
 
 
+# https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L142-L159
 model_prompts = {
     "Classification": "classification: ",
     "MultilabelClassification": "classification: ",
     "Clustering": "clustering: ",
+    "PairClassification": "classification: ",
+    "Reranking": "classification: ",
+    "STS": "classification: ",
+    "Summarization": "classification: ",
     PromptType.query.value: "search_query: ",
     PromptType.passage.value: "search_document: ",
 }
@@ -155,7 +170,7 @@ def encode(  # type: ignore
 )
 
 
-nomic_embed_v1_ablated = ModelMeta(
+nomic_embed_v1_unsupervised = ModelMeta(
    loader=partial(  # type: ignore
        NomicWrapper,
        trust_remote_code=True,
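
A minimal sketch (illustrative, not part of the patch) of the task-type gating the diff adds to NomicWrapper.encode: embeddings are L2-normalized only for task types outside the listed classification/similarity families, mirroring the upstream contrastors eval script. It assumes only mteb.get_task and task.metadata.type, both used exactly this way in the patched code; should_normalize is a hypothetical helper name.

import mteb

# Task types for which the patched NomicWrapper.encode skips L2 normalization
# (tuple copied verbatim from the diff above).
NO_NORMALIZE_TYPES = (
    "Classification",
    "MultilabelClassification",
    "PairClassification",
    "Reranking",
    "STS",
    "Summarization",
)

def should_normalize(task_name: str) -> bool:
    # Mirrors `normalize = task.metadata.type not in (...)` from the diff;
    # this helper exists only for illustration, not in the patched module.
    task = mteb.get_task(task_name)
    return task.metadata.type not in NO_NORMALIZE_TYPES

# Usage: a Classification task yields False and a Retrieval task yields True,
# so only retrieval-style embeddings receive the final F.normalize.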