26 changes: 23 additions & 3 deletions mteb/benchmarks/benchmarks.py
@@ -1060,9 +1060,7 @@

BRIGHT = Benchmark(
name="BRIGHT",
tasks=get_tasks(
tasks=["BrightRetrieval"],
),
tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]),
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
BRIGHT is the first text retrieval
benchmark that requires intensive reasoning to retrieve relevant documents with
@@ -1079,6 +1077,28 @@
}""",
)


BRIGHT_LONG = Benchmark(
name="BRIGHT (long)",
tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"]),
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
BRIGHT is the first text retrieval
benchmark that requires intensive reasoning to retrieve relevant documents with
a dataset consisting of 1,384 real-world queries spanning diverse domains, such as
economics, psychology, mathematics, and coding. These queries are drawn from
naturally occurring and carefully curated human data.

This is the long version of the benchmark, which only includes the longer documents.
""",
reference="https://brightbenchmark.github.io/",
citation="""@article{su2024bright,
title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
journal={arXiv preprint arXiv:2407.12883},
year={2024}
}""",
)

CODE_RAG = Benchmark(
name="CodeRAG",
tasks=get_tasks(
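For context, a hedged sketch of how the new eval_splits argument could be used to run the two BRIGHT variants. The get_tasks signature is taken from the diff above; the model name and output folders are placeholder choices, and the snippet assumes the usual mteb entry point (MTEB.run) together with a sentence-transformers model.

# Sketch only: "intfloat/e5-base-v2" and the output folders are placeholders.
import mteb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/e5-base-v2")

# Select the BrightRetrieval task restricted to one of its two split configurations.
standard_tasks = mteb.get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"])
long_tasks = mteb.get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"])

for label, tasks in [("standard", standard_tasks), ("long", long_tasks)]:
    evaluation = mteb.MTEB(tasks=tasks)
    evaluation.run(model, output_folder=f"results/bright_{label}")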
2 changes: 1 addition & 1 deletion mteb/load_results/task_results.py
@@ -525,7 +525,7 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult:
if task is None:
task = get_task(self.task_name)

splits = task.metadata.eval_splits
splits = task.eval_splits
hf_subsets = task.hf_subsets
hf_subsets = set(hf_subsets)

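The one-line change above assumes AbsTask exposes an eval_splits property that can be overridden per task instance (the tests later in this diff set task._eval_splits directly); a minimal sketch of that assumed pattern, not the actual AbsTask implementation:

# Sketch only: eval_splits prefers an instance-level override, otherwise falls back to metadata.
class AbsTaskSketch:
    _eval_splits: list[str] | None = None

    def __init__(self, metadata_eval_splits: list[str]):
        # Stands in for task.metadata.eval_splits in the real class.
        self.metadata_eval_splits = metadata_eval_splits

    @property
    def eval_splits(self) -> list[str]:
        if self._eval_splits is not None:
            return self._eval_splits
        return self.metadata_eval_splits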
1 change: 0 additions & 1 deletion mteb/tasks/Retrieval/eng/BrightRetrieval.py
@@ -50,7 +50,6 @@ class BrightRetrieval(MultilingualTask, AbsTaskRetrieval):
domains=["Non-fiction", "Written"],
task_subtypes=["Article retrieval"],
license="cc-by-4.0",
socioeconomic_status="low",
annotations_creators="derived",
dialect=[],
sample_creation="found",
@@ -34,7 +34,6 @@ class DummyTask(AbsTask):
annotations_creators="derived",
dialect=[],
bibtex_citation="",
descriptive_stats={},
modalities=["text"],
sample_creation="created",
)
@@ -48,11 +47,11 @@ def _evaluate_subset(self, **kwargs):
def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall=False
) -> dict[str, float]:
pass
return {}


def test_mteb_results():
"""Test TaskResult class (this is the same as the example in the docstring)"""
@pytest.fixture()
def task_result():
scores = {
"train": {
"en-de": {
@@ -66,13 +65,19 @@

evaluation_time = 100

mteb_results = TaskResult.from_task_results(
return TaskResult.from_task_results(
task=DummyTask(), scores=scores, evaluation_time=evaluation_time
)

assert mteb_results.get_score() == 0.55
assert mteb_results.get_score(languages=["eng"]) == 0.55
assert mteb_results.get_score(languages=["fra"]) == 0.6

def test_task_results_get_score(task_result: TaskResult):
"""Test TaskResult class (this is the same as the example in the docstring)"""
assert task_result.get_score() == 0.55
assert task_result.get_score(languages=["eng"]) == 0.55
assert task_result.get_score(languages=["fra"]) == 0.6


def test_task_results_to_dict(task_result: TaskResult):
dict_repr = {
"dataset_revision": "1.0",
"task_name": "dummy_task",
@@ -94,7 +99,52 @@ def test_mteb_results():
]
},
}
assert mteb_results.to_dict() == dict_repr
assert task_result.to_dict() == dict_repr


def test_task_results_validate_and_filter():
scores = {
"train": {
"en-de": {
"main_score": 0.5,
},
"en-fr": {
"main_score": 0.6,
},
},
"test": {
"en-de": {
"main_score": 0.3,
},
"en-fr": {
"main_score": 0.4,
},
},
}

evaluation_time = 100

res = TaskResult.from_task_results(
task=DummyTask(), scores=scores, evaluation_time=evaluation_time
)

task = DummyTask()
task._eval_splits = ["train", "test"]
res1 = res.validate_and_filter_scores(task=task)

assert res1.scores.keys() == {"train", "test"}
assert res1.get_score() == (0.5 + 0.6 + 0.3 + 0.4) / 4

task._eval_splits = ["test"]
res2 = res.validate_and_filter_scores(task=task)
assert res2.scores.keys() == {"test"}
assert res2.get_score() == (0.3 + 0.4) / 2 # only test scores

task.hf_subsets = ["en-de"]
task._eval_splits = ["train", "test"]
res3 = res.validate_and_filter_scores(task=task)
assert res3.scores.keys() == {"train", "test"}
assert res3.get_score() == (0.5 + 0.3) / 2 # only en-de scores


@pytest.mark.parametrize(