From 1fcd87254489780eaec09b53422c8c7add8582db Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 7 Jan 2025 14:31:28 +0300
Subject: [PATCH 1/7] add gigaembeddings

---
 mteb/models/nvidia_models.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py
index 72274b41de..110343e574 100644
--- a/mteb/models/nvidia_models.py
+++ b/mteb/models/nvidia_models.py
@@ -115,3 +115,29 @@ def encode(
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
 )
+
+giga_embeddings = ModelMeta(
+    loader=partial(  # type: ignore
+        NvEmbedWrapper,
+        model="ai-sage/Giga-Embeddings-instruct",
+        revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
+        trust_remote_code=True,
+        model_kwargs={
+            "torch_dtype": torch.float16,
+        },
+    ),
+    name="ai-sage/Giga-Embeddings-instruct",
+    languages=["eng_Latn", "rus_Cyrl"],
+    open_weights=True,
+    revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
+    release_date="2024-12-13",
+    n_parameters=2_530_000_000,
+    memory_usage=None,
+    embed_dim=2048,
+    license="mit",
+    max_tokens=32768,
+    reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+)

From 47bb697c6b1d6eca89d737da2d238b100c706694 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 7 Jan 2025 14:33:52 +0300
Subject: [PATCH 2/7] use jasper

---
 mteb/models/nvidia_models.py      | 26 --------------------------
 mteb/models/ru_sentence_models.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py
index 110343e574..72274b41de 100644
--- a/mteb/models/nvidia_models.py
+++ b/mteb/models/nvidia_models.py
@@ -115,29 +115,3 @@ def encode(
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
 )
-
-giga_embeddings = ModelMeta(
-    loader=partial(  # type: ignore
-        NvEmbedWrapper,
-        model="ai-sage/Giga-Embeddings-instruct",
-        revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
-        trust_remote_code=True,
-        model_kwargs={
-            "torch_dtype": torch.float16,
-        },
-    ),
-    name="ai-sage/Giga-Embeddings-instruct",
-    languages=["eng_Latn", "rus_Cyrl"],
-    open_weights=True,
-    revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
-    release_date="2024-12-13",
-    n_parameters=2_530_000_000,
-    memory_usage=None,
-    embed_dim=2048,
-    license="mit",
-    max_tokens=32768,
-    reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct",
-    similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
-    use_instructions=True,
-)
diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py
index f90111b954..9d967734b1 100644
--- a/mteb/models/ru_sentence_models.py
+++ b/mteb/models/ru_sentence_models.py
@@ -4,7 +4,10 @@
 
 from functools import partial
 
+import torch
+
 from mteb.model_meta import ModelMeta, sentence_transformers_loader
+from mteb.models.jasper_models import JasperWrapper
 
 rubert_tiny2 = ModelMeta(
     name="cointegrated/rubert-tiny2",
@@ -236,3 +239,30 @@
     release_date="2024-07-29",
     use_instructions=True,
 )
+
+giga_embeddings = ModelMeta(
+    loader=partial(  # type: ignore
+        JasperWrapper,
+        model="ai-sage/Giga-Embeddings-instruct",
+        revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
+        trust_remote_code=True,
+        model_kwargs={
+            "torch_dtype": torch.float16,
+        },
+    ),
+    name="ai-sage/Giga-Embeddings-instruct",
+    languages=["eng_Latn", "rus_Cyrl"],
+    open_weights=True,
+    revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
+    release_date="2024-12-13",
+    n_parameters=2_530_000_000,
+    memory_usage=None,
+    embed_dim=2048,
+    license="mit",
+    max_tokens=32768,
+    reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+)
+
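PATCH 1 registers the model in nvidia_models.py and PATCH 2 moves the entry into ru_sentence_models.py; both use the same ModelMeta-with-loader pattern. For context, a minimal sketch of how such an entry is typically resolved at runtime, assuming the standard mteb lookup path (exact plumbing varies between mteb versions):

import mteb

# get_model() looks up the registered ModelMeta by name and invokes its
# loader partial, instantiating the wrapper pinned to the stored revision.
model = mteb.get_model("ai-sage/Giga-Embeddings-instruct")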
name="ai-sage/Giga-Embeddings-instruct", + languages=["eng_Latn", "rus_Cyrl"], + open_weights=True, + revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92", + release_date="2024-12-13", + n_parameters=2_530_000_000, + memory_usage=None, + embed_dim=2048, + license="mit", + max_tokens=32768, + reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, +) + From 80a90c61397e38ee1360f6dd1cb52544ba2207e8 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 7 Jan 2025 14:37:07 +0300 Subject: [PATCH 3/7] fix name --- mteb/models/ru_sentence_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 9d967734b1..f6dc58f091 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -243,7 +243,7 @@ giga_embeddings = ModelMeta( loader=partial( # type: ignore JasperWrapper, - model="ai-sage/Giga-Embeddings-instruct", + model_name="ai-sage/Giga-Embeddings-instruct", revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92", trust_remote_code=True, model_kwargs={ From 27df66a87bc998675bdc2c540276b3c4d9ee79a9 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 7 Jan 2025 15:07:22 +0300 Subject: [PATCH 4/7] create sentence_transformer instruct wrapper --- mteb/models/instruct_wrapper.py | 63 +++++++++++++++++++++++++++++++ mteb/models/ru_sentence_models.py | 9 +++-- 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 303a386836..22f1a85933 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -6,7 +6,9 @@ import numpy as np import torch +from sentence_transformers import SentenceTransformer +import mteb from mteb.encoder_interface import PromptType from .wrapper import Wrapper @@ -78,3 +80,64 @@ def encode( return embeddings return InstructWrapper(model_name_or_path, mode, instruction_template, **kwargs) + + +class InstructSentenceTransformerWrapper(Wrapper): + def __init__( + self, + model_name: str, + revision: str, + instruction_template: str | Callable[[str], str] | None = None, + max_seq_length: int | None = None, + apply_instruction_to_passages: bool = True, + **kwargs: Any, + ): + if ( + isinstance(instruction_template, str) + and "{instruction}" not in instruction_template + ): + raise ValueError( + "Instruction template must contain the string '{instruction}'." + ) + if instruction_template is None: + logger.warning( + "No instruction template provided. Instructions will be used as-is." 
From ddf96f4d259fb0a01d4b4fc8a15f3a0b225bc685 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 7 Jan 2025 15:21:38 +0300
Subject: [PATCH 5/7] apply instruction template

---
 mteb/models/instruct_wrapper.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py
index 22f1a85933..ce5152896b 100644
--- a/mteb/models/instruct_wrapper.py
+++ b/mteb/models/instruct_wrapper.py
@@ -121,6 +121,8 @@ def encode(
     ) -> np.ndarray:
         task = mteb.get_task(task_name=task_name)
         instruction = self.get_instruction(task_name, prompt_type)
+        if self.instruction_template:
+            instruction = self.format_instruction(instruction, prompt_type)
 
         # to passage prompts won't be applied to passages
         if (

From b70232e842ee2db75df6c6df33910c060474b3dd Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 7 Jan 2025 15:38:03 +0300
Subject: [PATCH 6/7] fix jasper

---
 mteb/models/jasper_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py
index 60fa4f6975..b5103e57b7 100644
--- a/mteb/models/jasper_models.py
+++ b/mteb/models/jasper_models.py
@@ -44,7 +44,7 @@ def encode(
         instruction = self.get_task_instruction(task_name, prompt_type)
 
         # to passage prompts won't be applied to passages
-        if prompt_type == PromptType.passage and task.metadata.type == "s2p":
+        if prompt_type == PromptType.passage and task.metadata.category == "s2p":
             instruction = None
 
         embeddings = self.model.encode(
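PATCH 5 routes the resolved instruction through the wrapper's instruction_template before encoding, and PATCH 6 applies a matching metadata fix to the existing Jasper wrapper. Assuming format_instruction() reduces to str.format on the template configured in PATCH 4, the expansion behaves like this:

template = "Instruct: {instruction}\nQuery: "
instruction = "Given a web search query, retrieve relevant passages"
prompt = template.format(instruction=instruction)
# prompt == "Instruct: Given a web search query, retrieve relevant passages\nQuery: "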
From 14b457b12f5f7a1542e077c59ae6fede950947c9 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Fri, 14 Feb 2025 00:21:34 +0300
Subject: [PATCH 7/7] update meta

---
 mteb/models/ru_sentence_models.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py
index f329d661c2..d9a8bd1041 100644
--- a/mteb/models/ru_sentence_models.py
+++ b/mteb/models/ru_sentence_models.py
@@ -4,9 +4,12 @@
 
 from functools import partial
 
+import torch
+
 from mteb.encoder_interface import PromptType
 from mteb.model_meta import ModelMeta, sentence_transformers_loader
 from mteb.models.bge_models import bge_m3_training_data
+from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper
 
 rubert_tiny = ModelMeta(
     name="cointegrated/rubert-tiny",
@@ -569,7 +572,7 @@
         instruction_template="Instruct: {instruction}\nQuery: ",
         apply_instruction_to_passages=False,
         model_kwargs={
-            "torch_dtype": torch.float16,
+            "torch_dtype": torch.bfloat16,
         },
     ),
     name="ai-sage/Giga-Embeddings-instruct",
@@ -578,7 +581,7 @@
     revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
     release_date="2024-12-13",
     n_parameters=2_530_000_000,
-    memory_usage=None,
+    memory_usage_mb=9649,
     embed_dim=2048,
     license="mit",
     max_tokens=32768,
@@ -586,4 +589,7 @@
     reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct",
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
 )
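With the metadata finalized in PATCH 7, the model can be evaluated like any other registered mteb model. An end-to-end sketch; the task selection is illustrative and not part of this series:

import mteb

model = mteb.get_model("ai-sage/Giga-Embeddings-instruct")
tasks = mteb.get_tasks(tasks=["STS12"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results")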