Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
57c757f
refactor: split BRIGHT benchmark into individual subset tasks
whybe-choi Oct 8, 2025
b04b46e
Merge branch 'main' into bright-subset-tasks
Samoed Oct 20, 2025
7299e59
readd bright
Samoed Oct 20, 2025
bf31a79
Merge branch 'embeddings-benchmark:main' into bright-subset-tasks
whybe-choi Oct 20, 2025
3f875a2
readd bright subset tasks
whybe-choi Oct 22, 2025
6aeea07
feat: add descriptive stats for BRIGHT subsets retrieval tasks
whybe-choi Oct 23, 2025
f95a246
feat: add top_ranked for excluded_ids handling
whybe-choi Oct 24, 2025
9df0bba
change main score to recall@1 for long version
whybe-choi Oct 31, 2025
c9a30bd
improve BRIGHT task descriptions
whybe-choi Nov 5, 2025
3b1e90b
add prompts to BRIGHT retrieval tasks
whybe-choi Nov 5, 2025
f0e498b
refactor: BRIGHT(v1.1)
whybe-choi Dec 9, 2025
78d73b5
calculate descriptive stats for BRIGHTLongRetrieval
whybe-choi Dec 9, 2025
825d51c
update prompts
Samoed Dec 27, 2025
3aafbc1
Merge branch 'main' into bright-subset-tasks
Samoed Dec 27, 2025
035f3bd
normalize names in prompts
Samoed Jan 6, 2026
41c8ee4
don't filter tasks
Samoed Jan 14, 2026
67e88d7
Merge branch 'main' into bright-subset-tasks
Samoed Jan 14, 2026
1fb1d57
remove filter_queries_without_positives and update revision
Samoed Jan 14, 2026
74faf1c
don't create top ranked if not necessary
Samoed Jan 14, 2026
30158eb
get back naucs
Samoed Jan 14, 2026
5dc9469
fix instructions
Samoed Jan 18, 2026
bf0e37a
add warning
Samoed Jan 18, 2026
26b189b
fix import
Samoed Jan 18, 2026
75c9017
Merge branch 'main' into bright-subset-tasks
Samoed Jan 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mteb/abstasks/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def _process_split(
instructions,
)
)
if hasattr(self, "top_ranked"):
if hasattr(self, "top_ranked") and self.top_ranked:
self.dataset[subset][split]["top_ranked"] = self.top_ranked[
split
].copy()
Expand Down
2 changes: 2 additions & 0 deletions mteb/benchmarks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
BEIR_NL,
BRIGHT,
BRIGHT_LONG,
BRIGHT_V1_1,
BUILT_MTEB,
C_MTEB,
CHEMTEB,
Expand Down Expand Up @@ -69,6 +70,7 @@
"BEIR_NL",
"BRIGHT",
"BRIGHT_LONG",
"BRIGHT_V1_1",
"BUILT_MTEB",
"CHEMTEB",
"CHEMTEB_V1_1",
Expand Down
43 changes: 41 additions & 2 deletions mteb/benchmarks/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1330,6 +1330,46 @@
""",
)

BRIGHT_V1_1 = Benchmark(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we just combine this into one table, with both long and short as two different columns? (We can also have different columns for the different domains.)
Screenshot 2026-01-14 at 21 05 07

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to delete the benchmark here and add that in a separate PR.

name="BRIGHT(v1.1)",
display_name="Reasoning Retrieval",
tasks=get_tasks(
tasks=[
"BrightBiologyRetrieval",
"BrightEarthScienceRetrieval",
"BrightEconomicsRetrieval",
"BrightPsychologyRetrieval",
"BrightRoboticsRetrieval",
"BrightStackoverflowRetrieval",
"BrightSustainableLivingRetrieval",
"BrightPonyRetrieval",
"BrightLeetcodeRetrieval",
"BrightAopsRetrieval",
"BrightTheoremQATheoremsRetrieval",
"BrightTheoremQAQuestionsRetrieval",
"BrightBiologyLongRetrieval",
"BrightEarthScienceLongRetrieval",
"BrightEconomicsLongRetrieval",
"BrightPsychologyLongRetrieval",
"BrightRoboticsLongRetrieval",
"BrightStackoverflowLongRetrieval",
"BrightSustainableLivingLongRetrieval",
"BrightPonyLongRetrieval",
],
),
description="v1.1 refactors the BRIGHT into a different tasks and added prompt to individual tasks.",
reference="https://brightbenchmark.github.io/",
citation=r"""
@article{su2024bright,
author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
journal = {arXiv preprint arXiv:2407.12883},
title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
year = {2024},
}
""",
)


CODE_RAG = Benchmark(
name="CodeRAG",
tasks=get_tasks(
Expand Down Expand Up @@ -1781,8 +1821,7 @@
"TRECCOVID-NL",
],
),
description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
"translation.",
description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.",
reference="https://arxiv.org/abs/2412.08329",
contacts=["nikolay-banar"],
citation=r"""
Expand Down
35 changes: 35 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"standard": {
"num_samples": 188113,
"number_of_characters": 141769714,
"documents_text_statistics": {
"total_text_length": 141734227,
"min_text_length": 58,
"average_text_length": 753.8974425803981,
"max_text_length": 7334,
"unique_texts": 176508
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 35487,
"min_text_length": 85,
"average_text_length": 319.7027027027027,
"max_text_length": 1167,
"unique_texts": 111
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 524,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 4.7207207207207205,
"max_relevant_docs_per_query": 8,
"unique_relevant_docs": 111
},
"top_ranked_statistics": {
"num_top_ranked": 20264921,
"min_top_ranked_per_query": 176954,
"average_top_ranked_per_query": 182566.85585585586,
"max_top_ranked_per_query": 186176
}
}
}
35 changes: 35 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"long": {
"num_samples": 627,
"number_of_characters": 19398082,
"documents_text_statistics": {
"total_text_length": 19344209,
"min_text_length": 142,
"average_text_length": 36916.42938931298,
"max_text_length": 1324201,
"unique_texts": 498
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 53873,
"min_text_length": 89,
"average_text_length": 523.0388349514564,
"max_text_length": 2195,
"unique_texts": 103
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 134,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.3009708737864079,
"max_relevant_docs_per_query": 4,
"unique_relevant_docs": 134
},
"top_ranked_statistics": {
"num_top_ranked": 53972,
"min_top_ranked_per_query": 524,
"average_top_ranked_per_query": 524.0,
"max_top_ranked_per_query": 524
}
}
}
35 changes: 35 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"standard": {
"num_samples": 57462,
"number_of_characters": 18936054,
"documents_text_statistics": {
"total_text_length": 18882181,
"min_text_length": 1,
"average_text_length": 329.192994996426,
"max_text_length": 31130,
"unique_texts": 49434
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 53873,
"min_text_length": 89,
"average_text_length": 523.0388349514564,
"max_text_length": 2195,
"unique_texts": 103
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 374,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 3.6310679611650487,
"max_relevant_docs_per_query": 19,
"unique_relevant_docs": 374
},
"top_ranked_statistics": {
"num_top_ranked": 5907977,
"min_top_ranked_per_query": 57359,
"average_top_ranked_per_query": 57359.0,
"max_top_ranked_per_query": 57359
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"long": {
"num_samples": 717,
"number_of_characters": 41696684,
"documents_text_statistics": {
"total_text_length": 41641374,
"min_text_length": 28,
"average_text_length": 69286.81198003328,
"max_text_length": 2627262,
"unique_texts": 587
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 55310,
"min_text_length": 83,
"average_text_length": 476.8103448275862,
"max_text_length": 1565,
"unique_texts": 116
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 187,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.6120689655172413,
"max_relevant_docs_per_query": 4,
"unique_relevant_docs": 187
},
"top_ranked_statistics": {
"num_top_ranked": 69716,
"min_top_ranked_per_query": 601,
"average_top_ranked_per_query": 601.0,
"max_top_ranked_per_query": 601
}
}
}
35 changes: 35 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"standard": {
"num_samples": 121365,
"number_of_characters": 40478259,
"documents_text_statistics": {
"total_text_length": 40422949,
"min_text_length": 1,
"average_text_length": 333.3878959826473,
"max_text_length": 233622,
"unique_texts": 117633
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 55310,
"min_text_length": 83,
"average_text_length": 476.8103448275862,
"max_text_length": 1565,
"unique_texts": 116
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 609,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 5.25,
"max_relevant_docs_per_query": 23,
"unique_relevant_docs": 609
},
"top_ranked_statistics": {
"num_top_ranked": 14064884,
"min_top_ranked_per_query": 121249,
"average_top_ranked_per_query": 121249.0,
"max_top_ranked_per_query": 121249
}
}
}
35 changes: 35 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"long": {
"num_samples": 619,
"number_of_characters": 19993261,
"documents_text_statistics": {
"total_text_length": 19917079,
"min_text_length": 43,
"average_text_length": 38598.99031007752,
"max_text_length": 429507,
"unique_texts": 515
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 76182,
"min_text_length": 164,
"average_text_length": 739.6310679611651,
"max_text_length": 2223,
"unique_texts": 103
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 109,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.058252427184466,
"max_relevant_docs_per_query": 3,
"unique_relevant_docs": 109
},
"top_ranked_statistics": {
"num_top_ranked": 53148,
"min_top_ranked_per_query": 516,
"average_top_ranked_per_query": 516.0,
"max_top_ranked_per_query": 516
}
}
}
35 changes: 35 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"standard": {
"num_samples": 50323,
"number_of_characters": 19882579,
"documents_text_statistics": {
"total_text_length": 19806397,
"min_text_length": 1,
"average_text_length": 394.3926125049781,
"max_text_length": 39672,
"unique_texts": 40594
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 76182,
"min_text_length": 164,
"average_text_length": 739.6310679611651,
"max_text_length": 2223,
"unique_texts": 103
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 823,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 7.990291262135922,
"max_relevant_docs_per_query": 85,
"unique_relevant_docs": 823
},
"top_ranked_statistics": {
"num_top_ranked": 5172660,
"min_top_ranked_per_query": 50220,
"average_top_ranked_per_query": 50220.0,
"max_top_ranked_per_query": 50220
}
}
}
35 changes: 35 additions & 0 deletions mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"standard": {
"num_samples": 414074,
"number_of_characters": 438348000,
"documents_text_statistics": {
"total_text_length": 438140779,
"min_text_length": 75,
"average_text_length": 1058.4849178125876,
"max_text_length": 103665,
"unique_texts": 413932
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 207221,
"min_text_length": 422,
"average_text_length": 1459.3028169014085,
"max_text_length": 3964,
"unique_texts": 142
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 262,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.8450704225352113,
"max_relevant_docs_per_query": 5,
"unique_relevant_docs": 216
},
"top_ranked_statistics": {
"num_top_ranked": 58744859,
"min_top_ranked_per_query": 412813,
"average_top_ranked_per_query": 413696.1901408451,
"max_top_ranked_per_query": 413923
}
}
}
Loading