Merged
32 commits
53756ad
feat: add new arctic v2.0 models (#1574)
dbuades Dec 10, 2024
27f7d8c
1.24.0
invalid-email-address Dec 10, 2024
7b9b3c9
fix: Add namaa MrTydi reranking dataset (#1573)
omarelshehy Dec 11, 2024
1101db7
Update tasks table
github-actions[bot] Dec 11, 2024
9c0b208
1.24.1
invalid-email-address Dec 11, 2024
373db74
fix: Eval langs not correctly passed to monolingual tasks (#1587)
Samoed Dec 13, 2024
eecc9f1
1.24.2
invalid-email-address Dec 13, 2024
fdfdaef
feat: Add ColBert (#1563)
sam-hey Dec 14, 2024
b466051
1.25.0
invalid-email-address Dec 14, 2024
992b20b
doc: colbert add score_function & doc section (#1592)
sam-hey Dec 15, 2024
8e6ee46
Feat: add support for scoring function (#1594)
Samoed Dec 15, 2024
95d5ae5
Add new models nvidia, gte, linq (#1436)
AlexeyVatolin Dec 16, 2024
0c9e046
Leaderboard: Refined plots (#1601)
x-tabdeveloping Dec 16, 2024
6ecc86f
fix: Leaderboard refinements (#1603)
x-tabdeveloping Dec 16, 2024
5e9c468
1.25.1
invalid-email-address Dec 16, 2024
b81b584
Feat: Use similarity scores if available (#1602)
Samoed Dec 16, 2024
6731b94
Add NanoBEIR Datasets (#1588)
KGupta10 Dec 18, 2024
9de7f20
Update tasks table
github-actions[bot] Dec 18, 2024
48cb97d
Feat: Evaluate missing languages (#1584)
Samoed Dec 18, 2024
ad05983
Add IBM Granite Embedding Models (#1613)
aashka-trivedi Dec 19, 2024
7c8e094
fix: disable co2_tracker for API models (#1614)
dbuades Dec 20, 2024
d8c015f
1.25.2
invalid-email-address Dec 20, 2024
0c44482
fix: set `use_instructions` to True in models using prompts (#1616)
dbuades Dec 20, 2024
2024338
1.25.3
invalid-email-address Dec 20, 2024
3d703e4
Merge branch 'refs/heads/main' into update_v2
Samoed Dec 21, 2024
eb29eb3
update RetrievalEvaluator.py
Samoed Dec 21, 2024
107dd4a
update imports
Samoed Dec 21, 2024
92dba39
update imports and metadata
Samoed Dec 21, 2024
7b4ae88
fix tests
Samoed Dec 21, 2024
788f54e
fix tests
Samoed Dec 21, 2024
06017ef
fix output path for retrieval
Samoed Dec 21, 2024
7144fca
fix similarity function
Samoed Dec 21, 2024
43 changes: 43 additions & 0 deletions README.md
@@ -210,6 +210,21 @@ Note that the public leaderboard uses the test splits for all datasets except MS

</details>


<details>
<summary> Selecting evaluation subset </summary>

### Selecting evaluation subset
You can restrict evaluation to selected subsets. For example, to evaluate only the `subset_name_to_run` subset of each task, do the following:

```python
evaluation.run(model, eval_subsets=["subset_name_to_run"])
```

Monolingual tasks have a single `default` subset; multilingual and cross-lingual tasks have dataset-specific subsets, typically named after the languages or language pairs they cover.
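
If you are unsure which subset names a task defines, you can inspect its metadata before running. The sketch below is illustrative only, assuming the usual convention that `TaskMetadata.eval_langs` is a dict keyed by subset name for multilingual/cross-lingual tasks and a plain list for monolingual ones; `MIRACLRetrieval` is just an example task, and `model` is any model loaded as in the examples above:

```python
import mteb

task = mteb.get_tasks(tasks=["MIRACLRetrieval"])[0]

# Multilingual/cross-lingual tasks: eval_langs is a dict keyed by subset name.
# Monolingual tasks: eval_langs is a list and the only subset is "default".
eval_langs = task.metadata.eval_langs
subsets = list(eval_langs) if isinstance(eval_langs, dict) else ["default"]
print(subsets)

# Evaluate only the subsets you care about.
evaluation = mteb.MTEB(tasks=[task])
evaluation.run(model, eval_subsets=subsets[:2])
```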

</details>

<details>
<summary> Using a custom model </summary>

@@ -315,6 +330,34 @@ evaluation.run(
)
```

</details>

<details>
<summary> Late Interaction (ColBERT) </summary>

### Using Late Interaction models for retrieval

```python
import mteb
from mteb import MTEB

colbert = mteb.get_model("colbert-ir/colbertv2.0")
tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"])

eval_splits = ["test"]
evaluation = MTEB(tasks=tasks)

evaluation.run(
    colbert,
    eval_splits=eval_splits,
    corpus_chunk_size=500,
)
```
This implementation uses the MaxSim operation to compute the similarity between sentences. While MaxSim provides high-quality results, it keeps one embedding per token rather than one per sentence, so it compares far more embeddings and can use considerably more memory. To manage resource consumption, consider lowering the `corpus_chunk_size` parameter.
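
For reference, the MaxSim operation itself is simple to state: each query token embedding is matched against every document token embedding, the best match per query token is kept, and those maxima are summed. Below is a minimal NumPy sketch of that idea, not mteb's or ColBERT's actual implementation; the function name and toy shapes are invented, and embeddings are assumed already L2-normalized if cosine similarity is intended:

```python
import numpy as np

def maxsim_score(query_emb: np.ndarray, doc_emb: np.ndarray) -> float:
    """Late-interaction (MaxSim) score between one query and one document.

    query_emb: (num_query_tokens, dim) token embeddings of the query.
    doc_emb:   (num_doc_tokens, dim) token embeddings of the document.
    """
    # Token-level similarity matrix: one row per query token, one column per document token.
    sim = query_emb @ doc_emb.T
    # Keep each query token's best-matching document token, then sum over query tokens.
    return float(sim.max(axis=1).sum())

# Toy example with random "token embeddings".
rng = np.random.default_rng(0)
query_tokens = rng.normal(size=(8, 128))
doc_tokens = rng.normal(size=(200, 128))
print(maxsim_score(query_tokens, doc_tokens))
```

Because every document is represented by many token embeddings rather than a single vector, scoring one chunk of the corpus holds on the order of `corpus_chunk_size × tokens_per_document` embeddings at once, which is why lowering `corpus_chunk_size` can reduce peak memory usage.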


</details>

<details>
2,144 changes: 1,079 additions & 1,065 deletions docs/tasks.md

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion mteb/abstasks/AbsTask.py
@@ -109,17 +109,18 @@ def evaluate(
self,
model: Encoder,
split: str = "test",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs: Any,
) -> dict[HFSubset, ScoresDict]:
"""Evaluates a Sentence Embedding Model on the task.
Returns a dict (that can be serialized to json).

Args:
model: Sentence embedding method. Implements an encode(sentences) method that encodes sentences and returns a numpy matrix with the
sentence embeddings
split: Which data split to use.
subsets_to_run: List of HFSubsets to evaluate. If None, all subsets are evaluated.
encode_kwargs: Additional keyword arguments that are passed to the model's `encode` method.
kwargs: Additional keyword arguments that are passed to the _evaluate_subset method.
"""
@@ -131,6 +132,9 @@ def evaluate(
scores = {}
hf_subsets = list(self.dataset.keys()) if self.is_multilingual else ["default"]

if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(
f"\nTask: {self.metadata_dict['name']}, split: {split}, subset: {hf_subset}. Running..."
7 changes: 6 additions & 1 deletion mteb/abstasks/AbsTaskBitextMining.py
@@ -67,7 +67,8 @@ def __init__(self, **kwargs):
def evaluate(
self,
model: Encoder,
split: str,
split: str = "test",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs,
@@ -77,6 +78,10 @@ def evaluate(

hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]

# If subsets_to_run is specified, filter the hf_subsets accordingly
if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

scores = {}
if self.parallel_subsets:
scores = self._evaluate_subset(
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskClassification.py
@@ -95,6 +95,7 @@ def evaluate(
model,
eval_split: str = "test",
train_split: str = "train",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs,
@@ -104,6 +105,8 @@ def evaluate(

scores = {}
hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]
if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(
4 changes: 4 additions & 0 deletions mteb/abstasks/AbsTaskMultilabelClassification.py
@@ -121,6 +121,7 @@ def evaluate(
model: Encoder,
eval_split: str = "test",
train_split: str = "train",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs: Any,
@@ -130,6 +131,9 @@ def evaluate(

scores = {}
hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]
# If subsets_to_run is specified, filter the hf_subsets accordingly
if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskReranking.py
@@ -26,6 +26,7 @@
"MMarcoReranking",
"CMedQAv1-reranking",
"CMedQAv2-reranking",
"NamaaMrTydiReranking",
]


3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskRetrieval.py
@@ -237,6 +237,7 @@ def evaluate(
self,
model,
split: str = "test",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs,
@@ -250,6 +251,8 @@

scores = {}
hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"]
if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(f"Subset: {hf_subset}")
24 changes: 24 additions & 0 deletions mteb/benchmarks/benchmarks.py
@@ -979,3 +979,27 @@ def load_results(
year={2024}
}""",
)

NANOBEIR = Benchmark(
name="NanoBEIR",
tasks=get_tasks(
tasks=[
"NanoArguAnaRetrieval",
"NanoClimateFeverRetrieval",
"NanoDBPediaRetrieval",
"NanoFEVERRetrieval",
"NanoFiQA2018Retrieval",
"NanoHotpotQARetrieval",
"NanoMSMARCORetrieval",
"NanoNFCorpusRetrieval",
"NanoNQRetrieval",
"NanoQuoraRetrieval",
"NanoSCIDOCSRetrieval",
"NanoSciFactRetrieval",
"NanoTouche2020Retrieval",
],
),
description="A benchmark to evaluate with subsets of BEIR datasets to use less computational power",
reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6",
citation=None,
)
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Reranking/NamaaMrTydiReranking.json
@@ -0,0 +1,31 @@
{
"test": {
"num_samples": 5504,
"number_of_characters": 1293166,
"num_documents": 4586,
"min_document_length": 0,
"average_document_length": 275.8353685128652,
"max_document_length": 4158,
"unique_documents": 4586,
"num_queries": 918,
"min_query_length": 13,
"average_query_length": 30.702614379084967,
"max_query_length": 93,
"unique_queries": 918,
"none_queries": 0,
"num_relevant_docs": 4586,
"min_relevant_docs_per_query": 2,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 6,
"unique_relevant_docs": 4586,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": 918,
"min_top_ranked_per_query": 2,
"average_top_ranked_per_query": 4.995642701525054,
"max_top_ranked_per_query": 6
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoArguAnaRetrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 3685,
"number_of_characters": 3737951,
"num_documents": 3635,
"min_document_length": 70,
"average_document_length": 1011.7914718019257,
"max_document_length": 6673,
"unique_documents": 3635,
"num_queries": 50,
"min_query_length": 504,
"average_query_length": 1201.78,
"max_query_length": 2164,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 50,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoClimateFeverRetrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 3458,
"number_of_characters": 5525784,
"num_documents": 3408,
"min_document_length": 33,
"average_document_length": 1619.531690140845,
"max_document_length": 6619,
"unique_documents": 3408,
"num_queries": 50,
"min_query_length": 38,
"average_query_length": 128.4,
"max_query_length": 265,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 38,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoDBPediaRetrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 6095,
"number_of_characters": 2034629,
"num_documents": 6045,
"min_document_length": 1,
"average_document_length": 336.30669975186106,
"max_document_length": 1390,
"unique_documents": 6045,
"num_queries": 50,
"min_query_length": 8,
"average_query_length": 33.1,
"max_query_length": 63,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 50,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoFEVERRetrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 5046,
"number_of_characters": 6140916,
"num_documents": 4996,
"min_document_length": 25,
"average_document_length": 1228.7119695756605,
"max_document_length": 8491,
"unique_documents": 4996,
"num_queries": 50,
"min_query_length": 17,
"average_query_length": 45.42,
"max_query_length": 83,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 50,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoFiQA2018Retrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 4648,
"number_of_characters": 4139437,
"num_documents": 4598,
"min_document_length": 0,
"average_document_length": 899.6326663766855,
"max_document_length": 10506,
"unique_documents": 4598,
"num_queries": 50,
"min_query_length": 18,
"average_query_length": 58.52,
"max_query_length": 97,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 50,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}