Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions mteb/abstasks/AbsTaskInstructionRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ def _evaluate_subset_lang(
)

top_ranked = top_ranked[split]
kwargs["prediction_name"] = "og" # for naming predictions, as needed
scores_og, results_og = self._evaluate_subset(
retriever,
corpus,
Expand All @@ -382,6 +383,7 @@ def _evaluate_subset_lang(
lang,
**kwargs,
)
kwargs["prediction_name"] = "changed" # for naming predictions, as needed
scores_changed, results_changed = self._evaluate_subset(
retriever,
corpus,
Expand Down Expand Up @@ -411,6 +413,7 @@ def _evaluate_subset_lang(
keywords[split],
short_instructions[split],
)
kwargs["prediction_name"] = "base" # for naming predictions, as needed
scores_base, results_base = self._evaluate_subset(
retriever,
corpus,
Expand All @@ -421,6 +424,7 @@ def _evaluate_subset_lang(
lang,
**kwargs,
)
kwargs["prediction_name"] = "keywords" # for naming predictions, as needed
scores_w_keywords_scores, scores_w_keywords_results = self._evaluate_subset(
retriever,
corpus,
Expand All @@ -431,6 +435,9 @@ def _evaluate_subset_lang(
lang,
**kwargs,
)
kwargs["prediction_name"] = (
"short_instr" # for naming predictions, as needed
)
(
scores_w_short_instr_scores,
scores_w_short_instr_result,
Expand Down Expand Up @@ -572,6 +579,11 @@ def _evaluate_subset(
else:
qrels_save_path = f"{output_folder}/{self.metadata_dict['name']}_{lang}_predictions.json"

if kwargs.get("prediction_name", None):
qrels_save_path = qrels_save_path.replace(
".json", f"_{kwargs['prediction_name']}.json"
)

with open(qrels_save_path, "w") as f:
json.dump(results, f)

Expand Down
1 change: 1 addition & 0 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def load_results(
base_results = load_results()
return base_results.select_tasks(self.tasks)


MTEB_MAIN_EN = Benchmark(
name="MTEB(eng)",
tasks=get_tasks(
Expand Down
13 changes: 13 additions & 0 deletions mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,19 @@ def __call__(
return self.retriever.search_cross_encoder(
corpus, queries, self.top_k, instructions=instructions, **kwargs
)
elif (
hasattr(self.retriever.model, "mteb_model_meta")
and self.retriever.model.mteb_model_meta.name == "bm25s"
):
return self.retriever.model.search(
corpus,
queries,
self.top_k,
self.score_function,
task_name=self.task_name, # type: ignore
instructions=instructions,
**kwargs,
)
else:
return self.retriever.search(
corpus,
Expand Down
11 changes: 9 additions & 2 deletions mteb/evaluation/evaluators/RetrievalEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,18 @@ def search_cross_encoder(
) -> dict[str, dict[str, float]]:
"""This function provides support for reranker (or cross-encoder) models that encode query and document at the same time (typically with attention).
Some notable examples include MonoBERT, MonoT5, RankLlama, etc.
Note: you must provide the path to the results to rerank to the __init__ function as `previous_results`
Note: you must provide the path to the results to rerank to the __init__ function as `previous_results`; otherwise all documents in the corpus will be reranked
"""
pairs = [] # create the pairs for reranking
for qid in queries.keys():
q_results = self.previous_results[qid]
if self.previous_results is None:
# try to use all of them
logging.logging(
f"previous_results is None. Using all the documents to rerank: {len(corpus)}"
)
q_results = {doc_id: 0.0 for doc_id in corpus.keys()}
else:
q_results = self.previous_results[qid]
# take the top-k only
q_results_sorted = dict(
sorted(q_results.items(), key=lambda item: item[1], reverse=True)
Expand Down
1 change: 1 addition & 0 deletions mteb/tasks/InstructionRetrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .eng.Core17InstructionRetrieval import *
from .eng.News21InstructionRetrieval import *
from .eng.Robust04InstructionRetrieval import *
from .multilingual.mFollowIR import *
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class Core17InstructionRetrieval(AbsTaskInstructionRetrieval):
metadata = TaskMetadata(
name="Core17InstructionRetrieval",
description="Measuring retrieval instruction following ability on Core17 narratives.",
description="Measuring retrieval instruction following ability on Core17 narratives for the FollowIR benchmark.",
reference="https://arxiv.org/abs/2403.15246",
dataset={
"path": "jhu-clsp/core17-instructions",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class News21InstructionRetrieval(AbsTaskInstructionRetrieval):
metadata = TaskMetadata(
name="News21InstructionRetrieval",
description="Measuring retrieval instruction following ability on News21 narratives.",
description="Measuring retrieval instruction following ability on News21 narratives for the FollowIR benchmark.",
reference="https://arxiv.org/abs/2403.15246",
dataset={
"path": "jhu-clsp/news21-instructions",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class Robust04InstructionRetrieval(AbsTaskInstructionRetrieval):
metadata = TaskMetadata(
name="Robust04InstructionRetrieval",
description="Measuring retrieval instruction following ability on Robust04 narratives.",
description="Measuring retrieval instruction following ability on Robust04 narratives for the FollowIR benchmark.",
reference="https://arxiv.org/abs/2403.15246",
dataset={
"path": "jhu-clsp/robust04-instructions",
Expand Down
Empty file.
Loading