Merged
32 commits
53756ad
feat: add new arctic v2.0 models (#1574)
dbuades Dec 10, 2024
27f7d8c
1.24.0
invalid-email-address Dec 10, 2024
7b9b3c9
fix: Add namaa MrTydi reranking dataset (#1573)
omarelshehy Dec 11, 2024
1101db7
Update tasks table
github-actions[bot] Dec 11, 2024
9c0b208
1.24.1
invalid-email-address Dec 11, 2024
373db74
fix: Eval langs not correctly passed to monolingual tasks (#1587)
Samoed Dec 13, 2024
eecc9f1
1.24.2
invalid-email-address Dec 13, 2024
fdfdaef
feat: Add ColBert (#1563)
sam-hey Dec 14, 2024
b466051
1.25.0
invalid-email-address Dec 14, 2024
992b20b
doc: colbert add score_function & doc section (#1592)
sam-hey Dec 15, 2024
8e6ee46
Feat: add support for scoring function (#1594)
Samoed Dec 15, 2024
95d5ae5
Add new models nvidia, gte, linq (#1436)
AlexeyVatolin Dec 16, 2024
0c9e046
Leaderboard: Refined plots (#1601)
x-tabdeveloping Dec 16, 2024
6ecc86f
fix: Leaderboard refinements (#1603)
x-tabdeveloping Dec 16, 2024
5e9c468
1.25.1
invalid-email-address Dec 16, 2024
b81b584
Feat: Use similarity scores if available (#1602)
Samoed Dec 16, 2024
6731b94
Add NanoBEIR Datasets (#1588)
KGupta10 Dec 18, 2024
9de7f20
Update tasks table
github-actions[bot] Dec 18, 2024
48cb97d
Feat: Evaluate missing languages (#1584)
Samoed Dec 18, 2024
ad05983
Add IBM Granite Embedding Models (#1613)
aashka-trivedi Dec 19, 2024
7c8e094
fix: disable co2_tracker for API models (#1614)
dbuades Dec 20, 2024
d8c015f
1.25.2
invalid-email-address Dec 20, 2024
0c44482
fix: set `use_instructions` to True in models using prompts (#1616)
dbuades Dec 20, 2024
2024338
1.25.3
invalid-email-address Dec 20, 2024
3d703e4
Merge branch 'refs/heads/main' into update_v2
Samoed Dec 21, 2024
eb29eb3
update RetrievalEvaluator.py
Samoed Dec 21, 2024
107dd4a
update imports
Samoed Dec 21, 2024
92dba39
update imports and metadata
Samoed Dec 21, 2024
7b4ae88
fix tests
Samoed Dec 21, 2024
788f54e
fix tests
Samoed Dec 21, 2024
06017ef
fix output path for retrieval
Samoed Dec 21, 2024
7144fca
fix similarity function
Samoed Dec 21, 2024
43 changes: 43 additions & 0 deletions README.md
@@ -210,6 +210,21 @@ Note that the public leaderboard uses the test splits for all datasets except MS

</details>


<details>
<summary> Selecting evaluation subset </summary>

### Selecting evaluation subset
You can restrict evaluation to selected subsets. For example, to evaluate only the `subset_name_to_run` subset of each task, do the following:

```python
evaluation.run(model, eval_subsets=["subset_name_to_run"])
```

Monolingual tasks have a single `default` subset; multilingual and cross-lingual tasks have dataset-specific subsets, typically named after the languages or language pairs they cover.
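
If you are unsure which subset names a task defines, you can inspect its metadata before running. The sketch below is illustrative only, assuming the usual convention that `TaskMetadata.eval_langs` is a dict keyed by subset name for multilingual/cross-lingual tasks and a plain list for monolingual ones; `MIRACLRetrieval` is just an example task, and `model` is any model loaded as in the examples above:

```python
import mteb

task = mteb.get_tasks(tasks=["MIRACLRetrieval"])[0]

# Multilingual/cross-lingual tasks: eval_langs is a dict keyed by subset name.
# Monolingual tasks: eval_langs is a list and the only subset is "default".
eval_langs = task.metadata.eval_langs
subsets = list(eval_langs) if isinstance(eval_langs, dict) else ["default"]
print(subsets)

# Evaluate only the subsets you care about.
evaluation = mteb.MTEB(tasks=[task])
evaluation.run(model, eval_subsets=subsets[:2])
```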

</details>

<details>
<summary> Using a custom model </summary>

@@ -315,6 +330,34 @@ evaluation.run(
)
```

</details>

<details>
<summary> Late Interaction (ColBERT) </summary>

### Using Late Interaction models for retrieval

```python
import mteb
from mteb import MTEB

colbert = mteb.get_model("colbert-ir/colbertv2.0")
tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"])

eval_splits = ["test"]
evaluation = MTEB(tasks=tasks)

evaluation.run(
    colbert,
    eval_splits=eval_splits,
    corpus_chunk_size=500,
)
```
This implementation uses the MaxSim operation to compute the similarity between sentences. While MaxSim provides high-quality results, it keeps one embedding per token rather than one per sentence, so it compares far more embeddings and can use considerably more memory. To manage resource consumption, consider lowering the `corpus_chunk_size` parameter.
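
For reference, the MaxSim operation itself is simple to state: each query token embedding is matched against every document token embedding, the best match per query token is kept, and those maxima are summed. Below is a minimal NumPy sketch of that idea, not mteb's or ColBERT's actual implementation; the function name and toy shapes are invented, and embeddings are assumed already L2-normalized if cosine similarity is intended:

```python
import numpy as np

def maxsim_score(query_emb: np.ndarray, doc_emb: np.ndarray) -> float:
    """Late-interaction (MaxSim) score between one query and one document.

    query_emb: (num_query_tokens, dim) token embeddings of the query.
    doc_emb:   (num_doc_tokens, dim) token embeddings of the document.
    """
    # Token-level similarity matrix: one row per query token, one column per document token.
    sim = query_emb @ doc_emb.T
    # Keep each query token's best-matching document token, then sum over query tokens.
    return float(sim.max(axis=1).sum())

# Toy example with random "token embeddings".
rng = np.random.default_rng(0)
query_tokens = rng.normal(size=(8, 128))
doc_tokens = rng.normal(size=(200, 128))
print(maxsim_score(query_tokens, doc_tokens))
```

Because every document is represented by many token embeddings rather than a single vector, scoring one chunk of the corpus holds on the order of `corpus_chunk_size × tokens_per_document` embeddings at once, which is why lowering `corpus_chunk_size` can reduce peak memory usage.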


</details>

<details>
2,144 changes: 1,079 additions & 1,065 deletions docs/tasks.md

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion mteb/abstasks/AbsTask.py
@@ -109,17 +109,18 @@ def evaluate(
self,
model: Encoder,
split: str = "test",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs: Any,
) -> dict[HFSubset, ScoresDict]:
"""Evaluates a Sentence Embedding Model on the task.
Returns a dict (that can be serialized to json).

Args:
model: Sentence embedding method. Implements an encode(sentences) method that encodes sentences and returns a numpy matrix with the
sentence embeddings
split: Which data split to use.
subsets_to_run: List of HFSubsets to evaluate. If None, all subsets are evaluated.
encode_kwargs: Additional keyword arguments that are passed to the model's `encode` method.
kwargs: Additional keyword arguments that are passed to the _evaluate_subset method.
"""
@@ -131,6 +132,9 @@ def evaluate(
scores = {}
hf_subsets = list(self.dataset.keys()) if self.is_multilingual else ["default"]

if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(
f"\nTask: {self.metadata_dict['name']}, split: {split}, subset: {hf_subset}. Running..."
7 changes: 6 additions & 1 deletion mteb/abstasks/AbsTaskBitextMining.py
@@ -67,7 +67,8 @@ def __init__(self, **kwargs):
def evaluate(
self,
model: Encoder,
split: str,
split: str = "test",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs,
@@ -77,6 +78,10 @@ def evaluate(

hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]

# If subsets_to_run is specified, filter the hf_subsets accordingly
if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

scores = {}
if self.parallel_subsets:
scores = self._evaluate_subset(
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskClassification.py
@@ -95,6 +95,7 @@ def evaluate(
model,
eval_split: str = "test",
train_split: str = "train",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs,
@@ -104,6 +105,8 @@ def evaluate(

scores = {}
hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]
if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(
4 changes: 4 additions & 0 deletions mteb/abstasks/AbsTaskMultilabelClassification.py
@@ -121,6 +121,7 @@ def evaluate(
model: Encoder,
eval_split: str = "test",
train_split: str = "train",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs: Any,
@@ -130,6 +131,9 @@ def evaluate(

scores = {}
hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]
# If subsets_to_run is specified, filter the hf_subsets accordingly
if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskReranking.py
@@ -26,6 +26,7 @@
"MMarcoReranking",
"CMedQAv1-reranking",
"CMedQAv2-reranking",
"NamaaMrTydiReranking",
]


3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskRetrieval.py
@@ -237,6 +237,7 @@ def evaluate(
self,
model,
split: str = "test",
subsets_to_run: list[HFSubset] | None = None,
*,
encode_kwargs: dict[str, Any] = {},
**kwargs,
@@ -250,6 +251,8 @@

scores = {}
hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"]
if subsets_to_run is not None:
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(f"Subset: {hf_subset}")
24 changes: 24 additions & 0 deletions mteb/benchmarks/benchmarks.py
@@ -979,3 +979,27 @@ def load_results(
year={2024}
}""",
)

NANOBEIR = Benchmark(
name="NanoBEIR",
tasks=get_tasks(
tasks=[
"NanoArguAnaRetrieval",
"NanoClimateFeverRetrieval",
"NanoDBPediaRetrieval",
"NanoFEVERRetrieval",
"NanoFiQA2018Retrieval",
"NanoHotpotQARetrieval",
"NanoMSMARCORetrieval",
"NanoNFCorpusRetrieval",
"NanoNQRetrieval",
"NanoQuoraRetrieval",
"NanoSCIDOCSRetrieval",
"NanoSciFactRetrieval",
"NanoTouche2020Retrieval",
],
),
description="A benchmark to evaluate with subsets of BEIR datasets to use less computational power",
reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6",
citation=None,
)
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Reranking/NamaaMrTydiReranking.json
@@ -0,0 +1,31 @@
{
"test": {
"num_samples": 5504,
"number_of_characters": 1293166,
"num_documents": 4586,
"min_document_length": 0,
"average_document_length": 275.8353685128652,
"max_document_length": 4158,
"unique_documents": 4586,
"num_queries": 918,
"min_query_length": 13,
"average_query_length": 30.702614379084967,
"max_query_length": 93,
"unique_queries": 918,
"none_queries": 0,
"num_relevant_docs": 4586,
"min_relevant_docs_per_query": 2,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 6,
"unique_relevant_docs": 4586,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": 918,
"min_top_ranked_per_query": 2,
"average_top_ranked_per_query": 4.995642701525054,
"max_top_ranked_per_query": 6
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoArguAnaRetrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 3685,
"number_of_characters": 3737951,
"num_documents": 3635,
"min_document_length": 70,
"average_document_length": 1011.7914718019257,
"max_document_length": 6673,
"unique_documents": 3635,
"num_queries": 50,
"min_query_length": 504,
"average_query_length": 1201.78,
"max_query_length": 2164,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 50,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoClimateFeverRetrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 3458,
"number_of_characters": 5525784,
"num_documents": 3408,
"min_document_length": 33,
"average_document_length": 1619.531690140845,
"max_document_length": 6619,
"unique_documents": 3408,
"num_queries": 50,
"min_query_length": 38,
"average_query_length": 128.4,
"max_query_length": 265,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 38,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoDBPediaRetrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 6095,
"number_of_characters": 2034629,
"num_documents": 6045,
"min_document_length": 1,
"average_document_length": 336.30669975186106,
"max_document_length": 1390,
"unique_documents": 6045,
"num_queries": 50,
"min_query_length": 8,
"average_query_length": 33.1,
"max_query_length": 63,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 50,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoFEVERRetrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 5046,
"number_of_characters": 6140916,
"num_documents": 4996,
"min_document_length": 25,
"average_document_length": 1228.7119695756605,
"max_document_length": 8491,
"unique_documents": 4996,
"num_queries": 50,
"min_query_length": 17,
"average_query_length": 45.42,
"max_query_length": 83,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 50,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}
31 changes: 31 additions & 0 deletions mteb/descriptive_stats/Retrieval/NanoFiQA2018Retrieval.json
@@ -0,0 +1,31 @@
{
"train": {
"num_samples": 4648,
"number_of_characters": 4139437,
"num_documents": 4598,
"min_document_length": 0,
"average_document_length": 899.6326663766855,
"max_document_length": 10506,
"unique_documents": 4598,
"num_queries": 50,
"min_query_length": 18,
"average_query_length": 58.52,
"max_query_length": 97,
"unique_queries": 50,
"none_queries": 0,
"num_relevant_docs": 50,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 50,
"num_instructions": null,
"min_instruction_length": null,
"average_instruction_length": null,
"max_instruction_length": null,
"unique_instructions": null,
"num_top_ranked": null,
"min_top_ranked_per_query": null,
"average_top_ranked_per_query": null,
"max_top_ranked_per_query": null
}
}