Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion mteb/benchmarks/_create_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ def _create_per_language_table_from_benchmark_results(

def _create_summary_table_mean_public_private(
benchmark_results: BenchmarkResults,
exclude_private_from_borda: bool = False,
) -> pd.DataFrame:
"""Create summary table from BenchmarkResults.

Expand All @@ -311,6 +312,7 @@ def _create_summary_table_mean_public_private(

Args:
benchmark_results: BenchmarkResults object containing model results
exclude_private_from_borda: If True, calculate Borda rank using only public tasks

Returns:
DataFrame with model summaries, ready for styling in the leaderboard
Expand Down Expand Up @@ -356,7 +358,11 @@ def _create_summary_table_mean_public_private(
joint_table = joint_table.drop(models_to_remove, axis=0)
joint_table.insert(0, "mean(public)", public_mean)
joint_table.insert(1, "mean(private)", private_mean)
joint_table["borda_rank"] = _get_borda_rank(per_task)
if exclude_private_from_borda:
borda_per_task = per_task[public_task_name]
else:
borda_per_task = per_task
joint_table["borda_rank"] = _get_borda_rank(borda_per_task)
joint_table = joint_table.sort_values("borda_rank", ascending=True)
joint_table = joint_table.reset_index()

Expand Down
12 changes: 11 additions & 1 deletion mteb/benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,19 @@ def _create_summary_table(
_create_summary_table_mean_public_private,
)

joint_table = _create_summary_table_mean_public_private(benchmark_results)
joint_table = _create_summary_table_mean_public_private(
benchmark_results, exclude_private_from_borda=True
)
# issue 3902: temporarily remove the private column from the RTEB summary table
if "Mean (Private)" in joint_table.columns:
joint_table = joint_table.drop(columns=["Mean (Private)"])
# For RTEB: all tasks are of type Retrieval, so the Retrieval column is equal to Mean (Task).
# However, due to issue 3902: if the Private column existed, Mean (Task) was the mean over both Public and Private, so instead we drop Mean (Task) and rename Mean (Public) to Mean (Task).
joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
if "Mean (Task)" in joint_table.columns:
joint_table = joint_table.drop(columns=["Mean (Task)"])
joint_table = joint_table.rename(columns={"Mean (Public)": "Mean (Task)"})

return joint_table


Expand Down
29 changes: 20 additions & 9 deletions mteb/benchmarks/benchmarks/rteb_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
year = {2025},
}"""

# Markdown suffix appended to every RTEB benchmark description; it points
# readers to the announcement explaining why the 'Private' column is hidden.
removal_note = "\n\nNote: We have temporarily removed the 'Private' column. To read more about this decision, check out the [announcement](https://github.com/embeddings-benchmark/mteb/issues/3934)."

RTEB_MAIN = RtebBenchmark(
name="RTEB(beta)",
display_name="RTEB Multilingual",
Expand Down Expand Up @@ -48,7 +50,8 @@
"JapaneseLegal1Retrieval",
],
),
description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
Expand Down Expand Up @@ -83,7 +86,8 @@
],
languages=["eng"],
),
description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
Expand All @@ -101,7 +105,8 @@
],
languages=["fra"],
),
description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
Expand All @@ -119,7 +124,8 @@
"GermanLegal1Retrieval",
],
),
description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
Expand All @@ -135,7 +141,8 @@
"JapaneseLegal1Retrieval",
],
),
description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
Expand All @@ -156,7 +163,8 @@
"EnglishFinance4Retrieval",
],
),
description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
Expand All @@ -177,7 +185,8 @@
"JapaneseLegal1Retrieval",
],
),
description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
Expand All @@ -199,7 +208,8 @@
"JapaneseCode1Retrieval",
],
),
description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
Expand All @@ -217,7 +227,8 @@
"GermanHealthcare1Retrieval",
],
),
description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ removal_note,
citation=RTEB_CITATION,
contacts=["fzowl"],
)
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.