Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
dd5d226
fix: Count unique texts, data leaks in calculate metrics (#1438)
Samoed Nov 14, 2024
04ac3f2
fix: update task metadata to allow for null (#1448)
KennethEnevoldsen Nov 14, 2024
f6a49fe
Update tasks table
github-actions[bot] Nov 14, 2024
78c0e4e
1.19.5
invalid-email-address Nov 14, 2024
4e86cea
Fix: Made data parsing in the leaderboard figure more robust (#1450)
x-tabdeveloping Nov 14, 2024
039d010
Fixed task loading (#1451)
x-tabdeveloping Nov 14, 2024
feb1ab7
fix: publish (#1452)
x-tabdeveloping Nov 14, 2024
3397633
1.19.6
invalid-email-address Nov 14, 2024
14d7523
fix: Fix load external results with `None` mteb_version (#1453)
Samoed Nov 14, 2024
68eb498
1.19.7
invalid-email-address Nov 14, 2024
58c459b
WIP: Polishing up leaderboard UI (#1461)
x-tabdeveloping Nov 15, 2024
1b920ac
fix: loading pre 1.11.0 (#1460)
Samoed Nov 15, 2024
a988fef
1.19.8
invalid-email-address Nov 15, 2024
9b2aece
fix: swap touche2020 to maintain compatibility (#1469)
isaac-chung Nov 17, 2024
8bb4a29
1.19.9
invalid-email-address Nov 17, 2024
2fb6fe7
docs: Add sum per language for task counts (#1468)
isaac-chung Nov 18, 2024
fde124a
fix: pinned datasets to <3.0.0 (#1470)
Napuh Nov 19, 2024
7186e04
1.19.10
invalid-email-address Nov 19, 2024
1cc6c9e
feat: add CUREv1 retrieval dataset (#1459)
dbuades Nov 21, 2024
4408717
Update tasks table
github-actions[bot] Nov 21, 2024
3ff38ec
1.20.0
invalid-email-address Nov 21, 2024
917ad7f
fix: check if `model` attr of model exists (#1499)
Samoed Nov 26, 2024
cde720e
1.20.1
invalid-email-address Nov 26, 2024
2f500e1
Merge branch 'refs/heads/main' into merge_main_v2_
Samoed Nov 26, 2024
e48e75c
add cure statistics
Samoed Nov 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions docs/create_tasks_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def create_tasks_table(tasks: list[mteb.AbsTask]) -> str:
return table


def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str:
table_dict = {}
## Group by language. If it is a multilingual dataset, 1 is added to all languages present.
for task in tasks:
Expand All @@ -82,22 +82,27 @@ def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
## Wrangle for polars
pl_table_dict = []
for lang, d in table_dict.items():
d.update({"lang": lang})
d.update({"0-lang": lang}) # for sorting columns
pl_table_dict.append(d)

df = pl.DataFrame(pl_table_dict).sort(by="lang")
df = pl.DataFrame(pl_table_dict).sort(by="0-lang")
df = df.with_columns(sum=pl.sum_horizontal(get_args(TASK_TYPE)))
df = df.select(sorted(df.columns))
if sort_by_sum:
df = df.sort(by="sum", descending=True)

total = df.sum()

task_names_md = " | ".join(sorted(get_args(TASK_TYPE)))
horizontal_line_md = "---|---" * len(sorted(get_args(TASK_TYPE)))
horizontal_line_md = "---|---" * (len(sorted(get_args(TASK_TYPE))) + 1)
table = f"""
| Language | {task_names_md} |
| Language | {task_names_md} | Sum |
|{horizontal_line_md}|
"""

for row in df.iter_rows():
table += f"| {row[-1]} "
for num in row[:-1]:
table += f"| {row[0]} "
for num in row[1:]:
table += f"| {num} "
table += "|\n"

Expand Down
2 changes: 2 additions & 0 deletions mteb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
MTEB_ENG_CLASSIC,
MTEB_MAIN_RU,
MTEB_RETRIEVAL_LAW,
MTEB_RETRIEVAL_MEDICAL,
MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
CoIR,
)
Expand All @@ -24,6 +25,7 @@
"MTEB_ENG_CLASSIC",
"MTEB_MAIN_RU",
"MTEB_RETRIEVAL_LAW",
"MTEB_RETRIEVAL_MEDICAL",
"MTEB_RETRIEVAL_WITH_INSTRUCTIONS",
"CoIR",
"TASKS_REGISTRY",
Expand Down
28 changes: 26 additions & 2 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def load_results(
"StackExchangeClustering.v2",
"StackExchangeClusteringP2P.v2",
"TRECCOVID",
"Touche2020",
"Touche2020Retrieval.v3",
"ToxicConversationsClassification",
"TweetSentimentExtractionClassification",
"TwentyNewsgroupsClustering.v2",
Expand Down Expand Up @@ -186,7 +186,7 @@ def load_results(
"StackOverflowDupQuestions",
"SummEval",
"TRECCOVID",
"Touche2020Retrieval.v3",
"Touche2020",
"ToxicConversationsClassification",
"TweetSentimentExtractionClassification",
"TwentyNewsgroupsClustering",
Expand Down Expand Up @@ -308,6 +308,29 @@ def load_results(
citation=None,
)

MTEB_RETRIEVAL_MEDICAL = Benchmark(
name="MTEB(Medical)",
tasks=get_tasks(
tasks=[
"CUREv1",
"NFCorpus",
"TRECCOVID",
"TRECCOVID-PL",
"SciFact",
"SciFact-PL",
"MedicalQARetrieval",
"PublicHealthQA",
"MedrxivClusteringP2P.v2",
"MedrxivClusteringS2S.v2",
"CmedqaRetrieval",
"CMedQAv2-reranking",
],
),
description="A curated set of MTEB tasks designed to evaluate systems in the context of medical information retrieval.",
reference="",
citation=None,
)

MTEB_MINERS_BITEXT_MINING = Benchmark(
name="MINERSBitextMining",
tasks=get_tasks(
Expand Down Expand Up @@ -702,6 +725,7 @@ def load_results(
"SpartQA",
"TempReasonL1",
"TRECCOVID",
"CUREv1",
"WinoGrande",
"BelebeleRetrieval",
"MLQARetrieval",
Expand Down
Loading