Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions mteb/abstasks/AbsTaskInstructionRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ def _evaluate_subset_lang(
)

top_ranked = top_ranked[split]
kwargs["prediction_name"] = "og" # for naming predictions, as needed
scores_og, results_og = self._evaluate_subset(
retriever,
corpus,
Expand All @@ -382,6 +383,7 @@ def _evaluate_subset_lang(
lang,
**kwargs,
)
kwargs["prediction_name"] = "changed" # for naming predictions, as needed
scores_changed, results_changed = self._evaluate_subset(
retriever,
corpus,
Expand Down Expand Up @@ -411,6 +413,7 @@ def _evaluate_subset_lang(
keywords[split],
short_instructions[split],
)
kwargs["prediction_name"] = "base" # for naming predictions, as needed
scores_base, results_base = self._evaluate_subset(
retriever,
corpus,
Expand All @@ -421,6 +424,7 @@ def _evaluate_subset_lang(
lang,
**kwargs,
)
kwargs["prediction_name"] = "keywords" # for naming predictions, as needed
scores_w_keywords_scores, scores_w_keywords_results = self._evaluate_subset(
retriever,
corpus,
Expand All @@ -431,6 +435,9 @@ def _evaluate_subset_lang(
lang,
**kwargs,
)
kwargs["prediction_name"] = (
"short_instr" # for naming predictions, as needed
)
(
scores_w_short_instr_scores,
scores_w_short_instr_result,
Expand Down Expand Up @@ -572,6 +579,11 @@ def _evaluate_subset(
else:
qrels_save_path = f"{output_folder}/{self.metadata_dict['name']}_{lang}_predictions.json"

if kwargs.get("prediction_name", None):
qrels_save_path = qrels_save_path.replace(
".json", f"_{kwargs['prediction_name']}.json"
)

with open(qrels_save_path, "w") as f:
json.dump(results, f)

Expand Down
1 change: 1 addition & 0 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def load_results(
base_results = load_results()
return base_results.select_tasks(self.tasks)


MTEB_MAIN_EN = Benchmark(
name="MTEB(eng)",
tasks=get_tasks(
Expand Down
13 changes: 13 additions & 0 deletions mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,19 @@ def __call__(
return self.retriever.search_cross_encoder(
corpus, queries, self.top_k, instructions=instructions, **kwargs
)
elif (
hasattr(self.retriever.model, "mteb_model_meta")
and self.retriever.model.mteb_model_meta.name == "bm25s"
):
return self.retriever.model.search(
corpus,
queries,
self.top_k,
self.score_function,
task_name=self.task_name, # type: ignore
instructions=instructions,
**kwargs,
)
else:
return self.retriever.search(
corpus,
Expand Down
11 changes: 9 additions & 2 deletions mteb/evaluation/evaluators/RetrievalEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,18 @@ def search_cross_encoder(
) -> dict[str, dict[str, float]]:
"""This function provides support for reranker (or cross-encoder) models that encode query and document at the same time (typically with attention).
Some notable examples include MonoBERT, MonoT5, RankLlama, etc.
Note: you must provide the path to the results to rerank to the __init__ function as `previous_results`
Note: you must provide the path to the results to rerank to the __init__ function as `previous_results`; otherwise all documents in the corpus will be reranked
"""
pairs = [] # create the pairs for reranking
for qid in queries.keys():
q_results = self.previous_results[qid]
if self.previous_results is None:
# try to use all of them
logging.logging(
f"previous_results is None. Using all the documents to rerank: {len(corpus)}"
)
q_results = {doc_id: 0.0 for doc_id in corpus.keys()}
else:
q_results = self.previous_results[qid]
# take the top-k only
q_results_sorted = dict(
sorted(q_results.items(), key=lambda item: item[1], reverse=True)
Expand Down
1 change: 1 addition & 0 deletions mteb/tasks/InstructionRetrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .eng.Core17InstructionRetrieval import *
from .eng.News21InstructionRetrieval import *
from .eng.Robust04InstructionRetrieval import *
from .multilingual.mFollowIR import *
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class Core17InstructionRetrieval(AbsTaskInstructionRetrieval):
metadata = TaskMetadata(
name="Core17InstructionRetrieval",
description="Measuring retrieval instruction following ability on Core17 narratives.",
description="Measuring retrieval instruction following ability on Core17 narratives for the FollowIR benchmark.",
reference="https://arxiv.org/abs/2403.15246",
dataset={
"path": "jhu-clsp/core17-instructions",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class News21InstructionRetrieval(AbsTaskInstructionRetrieval):
metadata = TaskMetadata(
name="News21InstructionRetrieval",
description="Measuring retrieval instruction following ability on News21 narratives.",
description="Measuring retrieval instruction following ability on News21 narratives for the FollowIR benchmark.",
reference="https://arxiv.org/abs/2403.15246",
dataset={
"path": "jhu-clsp/news21-instructions",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class Robust04InstructionRetrieval(AbsTaskInstructionRetrieval):
metadata = TaskMetadata(
name="Robust04InstructionRetrieval",
description="Measuring retrieval instruction following ability on Robust04 narratives.",
description="Measuring retrieval instruction following ability on Robust04 narratives for the FollowIR benchmark.",
reference="https://arxiv.org/abs/2403.15246",
dataset={
"path": "jhu-clsp/robust04-instructions",
Expand Down
Empty file.
Loading