Merged
4 changes: 2 additions & 2 deletions docs/benchmarks.md

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions docs/tasks.md
Original file line number Diff line number Diff line change
@@ -561,7 +561,7 @@ The following tables give you an overview of the tasks in MTEB.
| [METI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 348597} | {'test': {'number_of_characters': 0, 'num_samples': 348597, 'num_queries': 87942, 'num_documents': 260655, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 260655, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 87942, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.96, 'max_relevant_docs_per_query': 9, 'unique_relevant_docs': 172713}} |
| [MIRACLReranking](https://project-miracl.github.io/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Reranking | s2s | [Encyclopaedic, Written] | None | None |
| [MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [MIRACLRetrievalHardNegatives](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [MIRACLRetrievalHardNegatives.v2](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
| [MIRACLVisionRetrieval](https://arxiv.org/pdf/2407.01449) (Radek Osmulski, 2025) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | DocumentUnderstanding | t2i | [Encyclopaedic] | None | None |
| [MKQARetrieval](https://github.com/apple/ml-mkqa) (Shayne Longpre, 2020) | ['ara', 'dan', 'deu', 'eng', 'fin', 'fra', 'heb', 'hun', 'ita', 'jpn', 'khm', 'kor', 'msa', 'nld', 'nno', 'nob', 'nor', 'pol', 'por', 'rus', 'spa', 'swe', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Written] | None | None |
| [MLQARetrieval](https://huggingface.co/datasets/mlqa) (Lewis et al., 2019) | ['ara', 'deu', 'eng', 'hin', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None |
@@ -882,7 +882,7 @@ The following tables give you an overview of the tasks in MTEB.
| [SiswatiNewsClassification.v2](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | None | None |
| [SketchyI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 477886} | {'test': {'number_of_characters': 0, 'num_samples': 477886, 'num_queries': 452886, 'num_documents': 25000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 25000, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 452886, 'min_relevant_docs_per_query': 100, 'average_relevant_docs_per_query': 100.0, 'max_relevant_docs_per_query': 100, 'unique_relevant_docs': 12500}} |
| [SlovakHateSpeechClassification.v2](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | None | None |
| [SlovakMovieReviewSentimentClassification.v2](https://arxiv.org/pdf/2304.01922) ({\v{S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | None | None |
| [SlovakMovieReviewSentimentClassification.v2](https://arxiv.org/pdf/2304.01922) ({\v{S, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None |
| [SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | None | None |
| [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Non-fiction, Web, Written] | None | None |
| [SpanishNewsClassification.v2](https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news) | ['spa'] | Classification | s2s | [News, Written] | None | None |
@@ -1920,7 +1920,7 @@ The following tables give you an overview of the tasks in MTEB.
| sim | Mende (Papua New Guinea) | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| sin | Sinhala | Indo-European | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
| sja | Epena | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| slk | Slovak | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 |
| slk | Slovak | Indo-European | 0 | 0 | 0 | 5 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 |
| sll | Salt-Yui | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| slv | Slovenian | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 |
| smk | Bolinao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
@@ -1959,7 +1959,6 @@ The following tables give you an overview of the tasks in MTEB.
| sun | Sundanese | Austronesian | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
| sus | Susu | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 |
| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 30 |
| swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
6 changes: 3 additions & 3 deletions mteb/_evaluators/regression_evaluator.py
@@ -112,12 +112,12 @@ def create_dataloaders(
batch_size=batch_size,
)
elif self.task_metadata.modalities == ["text"]:
if self.label_column_name != "text":
if self.values_column != "text":
self.train_dataset = self.train_dataset.rename_column(
self.label_column_name, "text"
self.values_column, "text"
)
self.eval_dataset = self.eval_dataset.rename_column(
self.label_column_name, "text"
self.values_column, "text"
)
dataloader_train = DataLoader(self.train_dataset)
dataloader_test = DataLoader(self.eval_dataset)
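The fix above renames `values_column` to `"text"` only when the names differ, since renaming a column to its own name is either a no-op or an error depending on the backend. A minimal sketch of the same guard, using a pandas `DataFrame` as a stand-in for the Hugging Face `Dataset` (the `values_column` name comes from the diff; the sample data is invented):

```python
import pandas as pd

def ensure_text_column(df: pd.DataFrame, values_column: str) -> pd.DataFrame:
    # Only rename when the values column is not already called "text";
    # mirrors the `if self.values_column != "text"` guard in the diff.
    if values_column != "text":
        df = df.rename(columns={values_column: "text"})
    return df

train = pd.DataFrame({"summary": ["short doc", "another doc"], "score": [0.1, 0.9]})
train = ensure_text_column(train, "summary")
print(list(train.columns))  # ['text', 'score']
```

The original bug was reading the column name from `label_column_name`, which regression tasks do not use for their input text; the diff switches both rename calls to `values_column`.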
136 changes: 129 additions & 7 deletions mteb/benchmarks/_create_table.py
@@ -334,13 +334,6 @@ def _create_summary_table_mean_public_private(
),
)

# Add zero-shot percentage
tasks = get_tasks(tasks=list(data["task_name"].unique()))
joint_table.insert(
1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks))
)
joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1)

# Clean up model names (remove HF organization)
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
@@ -497,3 +490,132 @@ def _create_summary_table_mean_subset(
joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))

return joint_table


def _create_summary_table_mean_task_type(
benchmark_results: BenchmarkResults,
) -> pd.DataFrame:
"""Create summary table from BenchmarkResults.

Returns a DataFrame with one row per model containing summary statistics
and task type averages.

Args:
benchmark_results: BenchmarkResults object containing model results

Returns:
DataFrame with model summaries, ready for styling in the leaderboard
"""
data = benchmark_results.to_dataframe(format="long")

if data.empty:
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
)
return no_results_frame

# Convert to DataFrame and pivot
per_task = data.pivot(index="model_name", columns="task_name", values="score")

# Remove models with no scores
to_remove = per_task.isna().all(axis="columns")
if to_remove.all():
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
)
return no_results_frame

models_to_remove = list(per_task[to_remove].index)
per_task = per_task.drop(models_to_remove, axis=0)

# Calculate means by task type
mean_per_type = _get_means_per_types(per_task)
mean_per_type = mean_per_type.pivot(
index="model_name", columns="task_type", values="score"
)
mean_per_type.columns = [
_split_on_capital(column) for column in mean_per_type.columns
]

# Calculate overall means
typed_mean = mean_per_type.mean(skipna=False, axis=1)

# Build joint table
joint_table = mean_per_type.copy()
joint_table = joint_table.drop(models_to_remove, axis=0)
joint_table.insert(0, "mean_by_task_type", typed_mean)
joint_table = joint_table.sort_values("mean_by_task_type", ascending=False)
joint_table["borda_rank"] = _get_borda_rank(per_task)
joint_table["rank"] = [i + 1 for i in range(len(joint_table))]
joint_table = joint_table.reset_index()

# Add model metadata
model_metas = joint_table["model_name"].map(mteb.get_model_meta)
joint_table = joint_table[model_metas.notna()]
joint_table["model_link"] = model_metas.map(lambda m: m.reference)

# Insert model metadata columns
joint_table.insert(
1,
"Max Tokens",
model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
)
joint_table.insert(
1,
"Embedding Dimensions",
model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
)
joint_table.insert(
1,
"Number of Parameters",
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
)
joint_table.insert(
1,
"Memory Usage (MB)",
model_metas.map(
lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
),
)

# Add zero-shot percentage
tasks = get_tasks(tasks=list(data["task_name"].unique()))
joint_table.insert(
1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks))
)
joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1)

# Clean up model names (remove HF organization)
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
)

# Add markdown links to model names
name_w_link = (
"[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")"
)
joint_table["model_name"] = joint_table["model_name"].mask(
joint_table["model_link"].notna(), name_w_link
)
joint_table = joint_table.drop(columns=["model_link"])

# Rename columns
joint_table = joint_table.rename(
columns={
"model_name": "Model",
"mean_by_task_type": "Mean (TaskType)",
"borda_rank": "Rank (Borda)",
}
)

if "Any Any Multilingual Retrieval" in joint_table.columns:
joint_table = joint_table.rename(
columns={"Any Any Multilingual Retrieval": "Multilingual Retrieval"}
)
if "Any Any Retrieval" in joint_table.columns:
joint_table = joint_table.rename(columns={"Any Any Retrieval": "Retrieval"})

# Move borda rank to front
joint_table.insert(0, "Rank", joint_table.pop("rank"))

return joint_table
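The core of `_create_summary_table_mean_task_type` is a two-level average: mean score per task type first, then the mean of those type means, with `skipna=False` so a model that skipped an entire task type gets no overall score instead of a flattering partial mean. A toy sketch of that aggregation (model names, task names, and scores are invented; the real function starts from `benchmark_results.to_dataframe(format="long")` and uses the `_get_means_per_types` helper):

```python
import pandas as pd

# Hypothetical long-format results: one row per (model, task) score.
long = pd.DataFrame({
    "model_name": ["m1", "m1", "m1", "m2", "m2", "m2"],
    "task_name": ["t1", "t2", "t3", "t1", "t2", "t3"],
    "task_type": ["Retrieval", "Retrieval", "Classification"] * 2,
    "score": [0.8, 0.6, 0.9, 0.5, 0.7, 0.4],
})

# Mean score per (model, task type), pivoted to one column per type.
per_type = (
    long.groupby(["model_name", "task_type"])["score"]
    .mean()
    .reset_index()
    .pivot(index="model_name", columns="task_type", values="score")
)

# skipna=False: a model missing a whole task type gets NaN overall,
# rather than a mean computed only over the types it did run.
overall = per_type.mean(skipna=False, axis=1)
print(overall.round(3).to_dict())  # {'m1': 0.8, 'm2': 0.5}
```

Weighting each task type equally (rather than each task) keeps a type with many tasks, such as Retrieval, from dominating the headline number.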
9 changes: 9 additions & 0 deletions mteb/benchmarks/benchmark.py
Expand Up @@ -9,6 +9,7 @@
_create_summary_table_from_benchmark_results,
_create_summary_table_mean_public_private,
_create_summary_table_mean_subset,
_create_summary_table_mean_task_type,
)
from mteb.load_results import load_results
from mteb.results import BenchmarkResults
@@ -100,3 +101,11 @@ def _create_summary_table(
) -> pd.DataFrame:
"""Create summary table. Called by the leaderboard app."""
return _create_summary_table_mean_subset(benchmark_results)


class MIEBBenchmark(Benchmark):
def _create_summary_table(
self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
"""Create summary table. Called by the leaderboard app."""
return _create_summary_table_mean_task_type(benchmark_results)
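`MIEBBenchmark` relies on plain method overriding: the leaderboard app calls `_create_summary_table` on whatever `Benchmark` object it holds, so MIEB benchmarks can swap in the task-type summary without the caller changing. A stripped-down sketch of that dispatch (the classes and return values below are simplified stand-ins, not the real mteb API):

```python
class Benchmark:
    def _create_summary_table(self, results: str) -> str:
        # Default behaviour: mean-over-subsets summary.
        return f"mean-subset table for {results}"

class MIEBBenchmark(Benchmark):
    def _create_summary_table(self, results: str) -> str:
        # MIEB override: mean-over-task-types summary.
        return f"mean-task-type table for {results}"

def render_leaderboard(benchmark: Benchmark, results: str) -> str:
    # The app only sees the base type; dispatch picks the right table.
    return benchmark._create_summary_table(results)

print(render_leaderboard(MIEBBenchmark(), "MIEB(eng)"))
# prints "mean-task-type table for MIEB(eng)"
```

This is why the diff below can switch the four MIEB benchmark definitions from `Benchmark` to `MIEBBenchmark` with no other call-site changes.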
10 changes: 5 additions & 5 deletions mteb/benchmarks/benchmarks/benchmarks.py
@@ -1,4 +1,4 @@
from mteb.benchmarks.benchmark import Benchmark, HUMEBenchmark
from mteb.benchmarks.benchmark import Benchmark, HUMEBenchmark, MIEBBenchmark
from mteb.overview import MTEBTasks, get_task, get_tasks

MMTEB_CITATION = r"""@article{enevoldsen2025mmtebmassivemultilingualtext,
@@ -1770,7 +1770,7 @@
"WebQAT2TRetrieval",
]

MIEB_ENG = Benchmark(
MIEB_ENG = MIEBBenchmark(
name="MIEB(eng)",
display_name="Image-Text, English",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg",
@@ -1799,7 +1799,7 @@
""",
)

MIEB_MULTILINGUAL = Benchmark(
MIEB_MULTILINGUAL = MIEBBenchmark(
name="MIEB(Multilingual)",
display_name="Image-Text, Multilingual",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg",
@@ -1834,7 +1834,7 @@
""",
)

MIEB_LITE = Benchmark(
MIEB_LITE = MIEBBenchmark(
name="MIEB(lite)",
display_name="Image-Text, Lite",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-landscape.svg",
@@ -1918,7 +1918,7 @@
""",
)

MIEB_IMG = Benchmark(
MIEB_IMG = MIEBBenchmark(
name="MIEB(Img)",
display_name="Image only",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg",