embeddings-benchmark · KennethEnevoldsen · Nov 27, 2024 · Nov 14, 2024 · Nov 14, 2024 · Nov 14, 2024
diff --git a/docs/create_tasks_table.py b/docs/create_tasks_table.py
@@ -68,7 +68,7 @@ def create_tasks_table(tasks: list[mteb.AbsTask]) -> str:
     return table
 
 
-def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
+def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str:
     table_dict = {}
     ## Group by language. If it is a multilingual dataset, 1 is added to all languages present.
     for task in tasks:
@@ -82,22 +82,27 @@ def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
     ## Wrangle for polars
     pl_table_dict = []
     for lang, d in table_dict.items():
-        d.update({"lang": lang})
+        d.update({"0-lang": lang})  # "0-" prefix makes this column sort first in sorted(df.columns)
         pl_table_dict.append(d)
 
-    df = pl.DataFrame(pl_table_dict).sort(by="lang")
+    df = pl.DataFrame(pl_table_dict).sort(by="0-lang")
+    df = df.with_columns(sum=pl.sum_horizontal(get_args(TASK_TYPE)))
+    df = df.select(sorted(df.columns))
+    if sort_by_sum:
+        df = df.sort(by="sum", descending=True)
+
     total = df.sum()
 
     task_names_md = " | ".join(sorted(get_args(TASK_TYPE)))
-    horizontal_line_md = "---|---" * len(sorted(get_args(TASK_TYPE)))
+    horizontal_line_md = "---|---" * (len(sorted(get_args(TASK_TYPE))) + 1)
     table = f"""
-| Language | {task_names_md} |
+| Language | {task_names_md} | Sum |
 |{horizontal_line_md}|
 """
 
     for row in df.iter_rows():
-        table += f"| {row[-1]} "
-        for num in row[:-1]:
+        table += f"| {row[0]} "
+        for num in row[1:]:
             table += f"| {num} "
         table += "|\n"
 

diff --git a/mteb/__init__.py b/mteb/__init__.py
@@ -6,6 +6,7 @@
     MTEB_ENG_CLASSIC,
     MTEB_MAIN_RU,
     MTEB_RETRIEVAL_LAW,
+    MTEB_RETRIEVAL_MEDICAL,
     MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
     CoIR,
 )
@@ -24,6 +25,7 @@
     "MTEB_ENG_CLASSIC",
     "MTEB_MAIN_RU",
     "MTEB_RETRIEVAL_LAW",
+    "MTEB_RETRIEVAL_MEDICAL",
     "MTEB_RETRIEVAL_WITH_INSTRUCTIONS",
     "CoIR",
     "TASKS_REGISTRY",

diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
@@ -106,7 +106,7 @@ def load_results(
             "StackExchangeClustering.v2",
             "StackExchangeClusteringP2P.v2",
             "TRECCOVID",
-            "Touche2020",
+            "Touche2020Retrieval.v3",
             "ToxicConversationsClassification",
             "TweetSentimentExtractionClassification",
             "TwentyNewsgroupsClustering.v2",
@@ -186,7 +186,7 @@ def load_results(
             "StackOverflowDupQuestions",
             "SummEval",
             "TRECCOVID",
-            "Touche2020Retrieval.v3",
+            "Touche2020",
             "ToxicConversationsClassification",
             "TweetSentimentExtractionClassification",
             "TwentyNewsgroupsClustering",
@@ -308,6 +308,29 @@ def load_results(
     citation=None,
 )
 
+MTEB_RETRIEVAL_MEDICAL = Benchmark(  # medical-domain task collection; imported and listed in mteb/__init__.py
+    name="MTEB(Medical)",
+    tasks=get_tasks(
+        tasks=[
+            "CUREv1",
+            "NFCorpus",
+            "TRECCOVID",
+            "TRECCOVID-PL",
+            "SciFact",
+            "SciFact-PL",
+            "MedicalQARetrieval",
+            "PublicHealthQA",
+            "MedrxivClusteringP2P.v2",  # NOTE(review): clustering task by name, description says retrieval — confirm
+            "MedrxivClusteringS2S.v2",
+            "CmedqaRetrieval",
+            "CMedQAv2-reranking",  # NOTE(review): reranking task by name — confirm it belongs here
+        ],
+    ),
+    description="A curated set of MTEB tasks designed to evaluate systems in the context of medical information retrieval.",
+    reference="",  # NOTE(review): empty string while citation is None — confirm the intended "unset" value
+    citation=None,
+)
+
 MTEB_MINERS_BITEXT_MINING = Benchmark(
     name="MINERSBitextMining",
     tasks=get_tasks(
@@ -702,6 +725,7 @@ def load_results(
             "SpartQA",
             "TempReasonL1",
             "TRECCOVID",
+            "CUREv1",
             "WinoGrande",
             "BelebeleRetrieval",
             "MLQARetrieval",