diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 15ac01dae4..777a05c238 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -50,7 +50,7 @@ The following table gives you an overview of the benchmarks in MTEB. | RTEB(Code, beta) | RTEB Code | 8 | Retrieval: 8 | [Programming, Written] | eng,go,javascript,jpn,python,sql | | RTEB(Health, beta) | RTEB Healthcare | 4 | Retrieval: 4 | [Academic, Medical, Written] | deu,eng,fra,spa | | RTEB(Law, beta) | RTEB Legal | 7 | Retrieval: 7 | [Legal, Written] | deu,eng,fra,jpn | -| RTEB(beta) | RTEB Retrieval Embedding Benchmark | 28 | Retrieval: 28 | [Academic, Encyclopaedic, Financial, Legal, Medical, Non-fiction, Programming, Written] | deu,eng,fra,go,javascript,jpn,python,spa,sql | +| RTEB(beta) | RTEB Retrieval Embedding Benchmark | 29 | Retrieval: 29 | [Academic, Encyclopaedic, Financial, Legal, Medical, Non-fiction, Programming, Written] | ara,ben,deu,eng,fas,fin,fra,go,hin,ind,javascript,jpn,kor,python,rus,spa,sql,swa,tel,tha,yor,zho | | RTEB(deu, beta) | RTEB German | 4 | Retrieval: 4 | [Legal, Medical, Non-fiction, Written] | deu | | RTEB(eng, beta) | RTEB English | 20 | Retrieval: 20 | [Academic, Financial, Legal, Medical, Non-fiction, Programming, Written] | eng,fra,go,javascript,python,spa,sql | | RTEB(fin, beta) | RTEB Finance | 7 | Retrieval: 7 | [Financial, Non-fiction, Written] | eng | diff --git a/docs/tasks.md b/docs/tasks.md index 74f7a9a6d8..3419e0d9b2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -889,6 +889,8 @@ The following tables give you an overview of the tasks in MTEB. | [SwahiliNewsClassification.v2](https://huggingface.co/datasets/Mollel/SwahiliNewsClassification) (Davis et al., 2020) | ['swa'] | Classification | s2s | [News, Written] | None | None | | [SweFaqRetrieval](https://spraakbanken.gu.se/en/resources/superlim) (Berdi{\v{c, 2023) | ['swe'] | Retrieval | s2s | [Government, Non-fiction, Written] | None | None | | [SweRecClassification.v2](https://aclanthology.org/2023.nodalida-1.20/) (Nielsen et al., 2023) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | +| [SwedishPatentCPCGroupClassification](https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254) (Salim et al., 2025) | ['swe'] | MultilabelClassification | s2s | [Government, Legal] | None | None | +| [SwedishPatentCPCSubclassClassification](https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254) (Salim et al., 2025) | ['swe'] | MultilabelClassification | s2s | [Government, Legal] | None | None | | [SwedishSentimentClassification.v2](https://huggingface.co/datasets/swedish_reviews) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | | [SwednClusteringP2P](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [SwednClusteringS2S](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | s2s | [News, Non-fiction, Written] | None | None | @@ -1937,7 +1939,7 @@ The following tables give you an overview of the tasks in MTEB. 
| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | -| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 30 | | swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | | swp | Suau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -2138,7 +2140,7 @@ The following tables give you an overview of the tasks in MTEB. | zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 0 | 55 | 49 | 1496 | 872 | 321 | 7 | 137 | 22 | 5 | 0 | 3 | 29 | 96 | 4 | 68 | 702 | 91 | 2 | 2 | 6 | 7 | 37 | 24 | +| Total | None | None | None | 0 | 55 | 49 | 1496 | 872 | 321 | 7 | 137 | 22 | 5 | 0 | 3 | 31 | 96 | 4 | 68 | 702 | 91 | 2 | 2 | 6 | 7 | 37 | 24 | diff --git a/mteb/benchmarks/_create_table.py b/mteb/benchmarks/_create_table.py new file mode 100644 index 0000000000..e977b6e2fc --- /dev/null +++ b/mteb/benchmarks/_create_table.py @@ -0,0 +1,382 @@ +from __future__ import annotations + +import math +import re +from collections import defaultdict + +import numpy as np +import pandas as pd + +from mteb.load_results.benchmark_results import BenchmarkResults +from mteb.overview import get_task, get_tasks + + +def _borda_count(scores: pd.Series) -> pd.Series: + n = len(scores) + ranks = scores.rank(method="average", ascending=False) + counts = n - ranks + return counts + + +def _get_borda_rank(score_table: pd.DataFrame) -> pd.Series: + borda_counts = score_table.apply(_borda_count, axis="index") + mean_borda = borda_counts.sum(axis=1) + return mean_borda.rank(method="min", ascending=False).astype(int) + + +def _split_on_capital(s: str) -> str: + """Splits on capital letters and joins with spaces""" + return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s)) + + +def _format_n_parameters(n_parameters) -> str: + if (n_parameters is None) or (not int(n_parameters)): + return "Unknown" + n_thousand = int(n_parameters // 1e3) + if n_thousand < 1: + return str(int(n_parameters)) + n_zeros = math.log10(n_thousand) + if n_zeros >= 6: + return str(n_thousand // (10**6)) + "B" + if n_zeros >= 3: + return str(n_thousand // (10**3)) + "M" + return str(n_thousand) + "K" + + +def _format_max_tokens(max_tokens: float | None) -> str: + if max_tokens is None: + return "Unknown" + if max_tokens == np.inf: + return "Infinite" + return str(int(max_tokens)) + + +def 
_failsafe_get_model_meta(model_name): + try: + from mteb.models.overview import get_model_meta + + return get_model_meta(model_name) + except Exception: + return None + + +def _get_means_per_types(per_task: pd.DataFrame): + task_names_per_type = defaultdict(list) + for task_name in per_task.columns: + task_type = get_task(task_name).metadata.type + task_names_per_type[task_type].append(task_name) + records = [] + for task_type, tasks in task_names_per_type.items(): + for model_name, scores in per_task.iterrows(): + records.append( + dict( + model_name=model_name, + task_type=task_type, + score=scores[tasks].mean(skipna=False), + ) + ) + return pd.DataFrame.from_records(records) + + +def _create_summary_table_from_benchmark_results( + benchmark_results: BenchmarkResults, +) -> pd.DataFrame: + """Create summary table from BenchmarkResults. + + Returns a DataFrame with one row per model containing summary statistics + and task type averages. + + Args: + benchmark_results: BenchmarkResults object containing model results + + Returns: + DataFrame with model summaries, ready for styling in the leaderboard + """ + data = benchmark_results.to_dataframe(format="long") + + if data.empty: + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + # Convert to DataFrame and pivot + per_task = data.pivot(index="model_name", columns="task_name", values="score") + + # Remove models with no scores + to_remove = per_task.isna().all(axis="columns") + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + models_to_remove = list(per_task[to_remove].index) + per_task = per_task.drop(models_to_remove, axis=0) + + # Calculate means by task type + mean_per_type = _get_means_per_types(per_task) + mean_per_type = mean_per_type.pivot( + index="model_name", columns="task_type", values="score" + ) + mean_per_type.columns = [ + _split_on_capital(column) for column in mean_per_type.columns + ] + + # Calculate overall means + typed_mean = mean_per_type.mean(skipna=False, axis=1) + overall_mean = per_task.mean(skipna=False, axis=1) + + # Build joint table + joint_table = mean_per_type.copy() + joint_table = joint_table.drop(models_to_remove, axis=0) + joint_table.insert(0, "mean", overall_mean) + joint_table.insert(1, "mean_by_task_type", typed_mean) + joint_table["borda_rank"] = _get_borda_rank(per_task) + joint_table = joint_table.sort_values("borda_rank", ascending=True) + joint_table = joint_table.reset_index() + + # Add model metadata + model_metas = joint_table["model_name"].map(_failsafe_get_model_meta) + joint_table = joint_table[model_metas.notna()] + joint_table["model_link"] = model_metas.map(lambda m: m.reference) + + # Insert model metadata columns + joint_table.insert( + 1, + "Max Tokens", + model_metas.map(lambda m: _format_max_tokens(m.max_tokens)), + ) + joint_table.insert( + 1, + "Embedding Dimensions", + model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"), + ) + joint_table.insert( + 1, + "Number of Parameters", + model_metas.map(lambda m: _format_n_parameters(m.n_parameters)), + ) + joint_table.insert( + 1, + "Memory Usage (MB)", + model_metas.map( + lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown" + ), + ) + + # Add zero-shot percentage + tasks = get_tasks(tasks=list(data["task_name"].unique())) + joint_table.insert( + 1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks)) + ) 
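+    # A missing zero-shot percentage is encoded as -1, which the styling step renders as "⚠️ NA"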
+ joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) + + # Clean up model names (remove HF organization) + joint_table["model_name"] = joint_table["model_name"].map( + lambda name: name.split("/")[-1] + ) + + # Add markdown links to model names + name_w_link = ( + "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")" + ) + joint_table["model_name"] = joint_table["model_name"].mask( + joint_table["model_link"].notna(), name_w_link + ) + joint_table = joint_table.drop(columns=["model_link"]) + + # Rename columns + joint_table = joint_table.rename( + columns={ + "model_name": "Model", + "mean_by_task_type": "Mean (TaskType)", + "mean": "Mean (Task)", + } + ) + + # Move borda rank to front + joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) + + return joint_table + + +def _create_per_task_table_from_benchmark_results( + benchmark_results: BenchmarkResults, +) -> pd.DataFrame: + """Create per-task table from BenchmarkResults. + + Returns a DataFrame with one row per model and one column per task. + + Args: + benchmark_results: BenchmarkResults object containing model results + + Returns: + DataFrame with per-task scores, ready for styling in the leaderboard + """ + # Get scores in long format + data = benchmark_results.to_dataframe(format="long") + + if data.empty: + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + # Convert to DataFrame and pivot + per_task = data.pivot(index="model_name", columns="task_name", values="score") + + # Remove models with no scores + to_remove = per_task.isna().all(axis="columns") + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + models_to_remove = list(per_task[to_remove].index) + per_task = per_task.drop(models_to_remove, axis=0) + + # Add borda rank and sort + per_task["borda_rank"] = _get_borda_rank(per_task) + per_task = per_task.sort_values("borda_rank", ascending=True) + per_task = per_task.drop(columns=["borda_rank"]) + per_task = per_task.reset_index() + + # Clean up model names (remove HF organization) + per_task["model_name"] = per_task["model_name"].map( + lambda name: name.split("/")[-1] + ) + per_task = per_task.rename( + columns={ + "model_name": "Model", + } + ) + + return per_task + + +def _create_summary_table_mean_public_private( + benchmark_results: BenchmarkResults, +) -> pd.DataFrame: + """Create summary table from BenchmarkResults. + + Returns a DataFrame with one row per model containing summary statistics + and task type averages. 
+ + Args: + benchmark_results: BenchmarkResults object containing model results + + Returns: + DataFrame with model summaries, ready for styling in the leaderboard + """ + data = benchmark_results.to_dataframe(format="long") + + if data.empty: + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + public_task_name = benchmark_results.filter_tasks(privacy="public").task_names + private_task_name = benchmark_results.filter_tasks(privacy="private").task_names + # Convert to DataFrame and pivot + per_task = data.pivot(index="model_name", columns="task_name", values="score") + + # Remove models with no scores + to_remove = per_task.isna().all(axis="columns") + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + models_to_remove = list(per_task[to_remove].index) + per_task = per_task.drop(models_to_remove, axis=0) + + # Calculate means by task type + mean_per_type = _get_means_per_types(per_task) + mean_per_type = mean_per_type.pivot( + index="model_name", columns="task_type", values="score" + ) + mean_per_type.columns = [ + _split_on_capital(column) for column in mean_per_type.columns + ] + + # Calculate overall means + public_mean = per_task[public_task_name].mean(skipna=False, axis=1) + private_mean = per_task[private_task_name].mean(skipna=False, axis=1) + + # Build joint table + joint_table = mean_per_type.copy() + joint_table = joint_table.drop(models_to_remove, axis=0) + joint_table.insert(0, "mean(public)", public_mean) + joint_table.insert(1, "mean(private)", private_mean) + joint_table["borda_rank"] = _get_borda_rank(per_task) + joint_table = joint_table.sort_values("borda_rank", ascending=True) + joint_table = joint_table.reset_index() + + # Add model metadata + model_metas = joint_table["model_name"].map(_failsafe_get_model_meta) + joint_table = joint_table[model_metas.notna()] + joint_table["model_link"] = model_metas.map(lambda m: m.reference) + + # Insert model metadata columns + joint_table.insert( + 1, + "Max Tokens", + model_metas.map(lambda m: _format_max_tokens(m.max_tokens)), + ) + joint_table.insert( + 1, + "Embedding Dimensions", + model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"), + ) + joint_table.insert( + 1, + "Number of Parameters", + model_metas.map(lambda m: _format_n_parameters(m.n_parameters)), + ) + joint_table.insert( + 1, + "Memory Usage (MB)", + model_metas.map( + lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown" + ), + ) + + # Add zero-shot percentage + tasks = get_tasks(tasks=list(data["task_name"].unique())) + joint_table.insert( + 1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks)) + ) + joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) + + # Clean up model names (remove HF organization) + joint_table["model_name"] = joint_table["model_name"].map( + lambda name: name.split("/")[-1] + ) + + # Add markdown links to model names + name_w_link = ( + "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")" + ) + joint_table["model_name"] = joint_table["model_name"].mask( + joint_table["model_link"].notna(), name_w_link + ) + joint_table = joint_table.drop(columns=["model_link"]) + + # Rename columns + rename_dict = { + "model_name": "Model", + "mean(public)": "Mean (Public)", + "mean(private)": "Mean (Private)", + } + # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task) + if 
"Retrieval" in joint_table.columns: + rename_dict["Retrieval"] = "Mean (Task)" + joint_table = joint_table.rename(columns=rename_dict) + + # Move borda rank to front + joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) + + return joint_table diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index 37b654ac92..c6a570be8b 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -4,8 +4,14 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Annotated +import pandas as pd from pydantic import AnyUrl, BeforeValidator, TypeAdapter +from mteb.benchmarks._create_table import ( + _create_per_task_table_from_benchmark_results, + _create_summary_table_from_benchmark_results, + _create_summary_table_mean_public_private, +) from mteb.load_results.load_results import load_results if TYPE_CHECKING: @@ -72,3 +78,23 @@ def load_results( results = base_results.select_tasks(self.tasks) self.results_cache[base_results] = results return results + + def _create_summary_table( + self, benchmark_results: BenchmarkResults + ) -> pd.DataFrame: + """Create summary table. Called by the leaderboard app.""" + return _create_summary_table_from_benchmark_results(benchmark_results) + + def _create_per_task_table( + self, benchmark_results: BenchmarkResults + ) -> pd.DataFrame: + """Create per-task table. Called by the leaderboard app.""" + return _create_per_task_table_from_benchmark_results(benchmark_results) + + +class RtebBenchmark(Benchmark): + def _create_summary_table( + self, benchmark_results: BenchmarkResults + ) -> pd.DataFrame: + """Create summary table. Called by the leaderboard app.""" + return _create_summary_table_mean_public_private(benchmark_results) diff --git a/mteb/benchmarks/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py index 1bdc14a814..5136b9dbe8 100644 --- a/mteb/benchmarks/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/benchmarks/rteb_benchmarks.py @@ -1,7 +1,7 @@ # RTEB Benchmarks - Retrieval Embedding Benchmark from __future__ import annotations -from mteb.benchmarks.benchmark import Benchmark +from mteb.benchmarks.benchmark import RtebBenchmark from mteb.overview import get_tasks RTEB_CITATION = r"""@article{rteb2024, @@ -10,7 +10,7 @@ year = {2024}, }""" -RTEB_MAIN = Benchmark( +RTEB_MAIN = RtebBenchmark( name="RTEB(beta)", display_name="RTEB Retrieval Embedding Benchmark", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg", @@ -31,6 +31,7 @@ "FreshStackRetrieval", "ChatDoctorRetrieval", "CUREv1", + "MIRACLRetrievalHardNegatives", # Closed datasets "Code1Retrieval", "JapaneseCode1Retrieval", @@ -47,12 +48,12 @@ "JapaneseLegal1Retrieval", ], ), - description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 28 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across English, French, German, and Japanese languages.", + description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. 
It contains 29 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across English, French, German, and Japanese languages.", citation=RTEB_CITATION, contacts=["fzowl"], ) -RTEB_ENGLISH = Benchmark( +RTEB_ENGLISH = RtebBenchmark( name="RTEB(eng, beta)", display_name="RTEB English", icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", @@ -87,7 +88,7 @@ contacts=["fzowl"], ) -RTEB_FRENCH = Benchmark( +RTEB_FRENCH = RtebBenchmark( name="RTEB(fr, beta)", display_name="RTEB French", icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", @@ -105,7 +106,7 @@ contacts=["fzowl"], ) -RTEB_GERMAN = Benchmark( +RTEB_GERMAN = RtebBenchmark( name="RTEB(deu, beta)", display_name="RTEB German", icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", @@ -123,7 +124,7 @@ contacts=["fzowl"], ) -RTEB_JAPANESE = Benchmark( +RTEB_JAPANESE = RtebBenchmark( name="RTEB(jpn, beta)", display_name="RTEB Japanese", icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", @@ -139,7 +140,7 @@ contacts=["fzowl"], ) -RTEB_FINANCE = Benchmark( +RTEB_FINANCE = RtebBenchmark( name="RTEB(fin, beta)", display_name="RTEB Finance", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-price-tag.svg", @@ -160,7 +161,7 @@ contacts=["fzowl"], ) -RTEB_LEGAL = Benchmark( +RTEB_LEGAL = RtebBenchmark( name="RTEB(Law, beta)", display_name="RTEB Legal", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", @@ -181,7 +182,7 @@ contacts=["fzowl"], ) -RTEB_CODE = Benchmark( +RTEB_CODE = RtebBenchmark( name="RTEB(Code, beta)", display_name="RTEB Code", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", @@ -203,7 +204,7 @@ contacts=["fzowl"], ) -RTEB_HEALTHCARE = Benchmark( +RTEB_HEALTHCARE = RtebBenchmark( name="RTEB(Health, beta)", display_name="RTEB Healthcare", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 0ec8b91fde..c37a8edc79 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -25,7 +25,10 @@ make_selector, ) from mteb.leaderboard.figures import performance_size_plot, radar_chart -from mteb.leaderboard.table import create_tables +from mteb.leaderboard.table import ( + apply_per_task_styling_from_benchmark, + apply_summary_styling_from_benchmark, +) from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ logger = logging.getLogger(__name__) @@ -120,6 +123,7 @@ def update_task_info(task_names: str) -> gr.DataFrame: "reference", "main_score", "modalities", + "is_public", ] ) df["languages"] = df["languages"].map(format_list) @@ -135,6 +139,7 @@ def update_task_info(task_names: str) -> gr.DataFrame: "domains": "Domains", "main_score": "Metric", "modalities": "Modality", + "is_public": "Public", } ) df = df.drop(columns="reference") @@ -236,10 +241,21 @@ def get_leaderboard_app() -> gr.Blocks: max_model_size=MAX_MODEL_SIZE, zero_shot_setting="allow_all", ) + default_filtered_scores = [ + entry for entry in default_scores if entry["model_name"] in filtered_models + ] + + # 
Filter BenchmarkResults based on default filtered models (as required by Kenneth) + filtered_model_names = [entry["model_name"] for entry in default_filtered_scores] + filtered_benchmark_results = default_results.select_models(filtered_model_names) - summary_table, per_task_table = create_tables( - [entry for entry in default_scores if entry["model_name"] in filtered_models] + summary_table = apply_summary_styling_from_benchmark( + default_benchmark, filtered_benchmark_results + ) + per_task_table = apply_per_task_styling_from_benchmark( + default_benchmark, filtered_benchmark_results ) + lang_select = gr.Dropdown( LANGUAGE, value=sorted(default_results.languages), @@ -774,19 +790,43 @@ def update_tables( tasks = set(tasks) benchmark = mteb.get_benchmark(benchmark_name) benchmark_tasks = {task.metadata.name for task in benchmark.tasks} - if (benchmark_tasks != tasks) or (models_to_keep is not None): - filtered_scores = [] - for entry in scores: - if entry["task_name"] not in tasks: - continue - if (models_to_keep is not None) and ( - entry["model_name"] not in models_to_keep - ): - continue - filtered_scores.append(entry) - else: - filtered_scores = scores - summary, per_task = create_tables(filtered_scores) + + # Extract filtered model and task names from scores (respects UI filters) + filtered_model_names = set() + filtered_task_names = set() + + for entry in scores: + if entry["task_name"] not in tasks: + continue + if (models_to_keep is not None) and ( + entry["model_name"] not in models_to_keep + ): + continue + filtered_model_names.add(entry["model_name"]) + filtered_task_names.add(entry["task_name"]) + + # Create filtered BenchmarkResults as required by Kenneth + benchmark_results = all_benchmark_results[benchmark_name] + filtered_benchmark_results = benchmark_results + + # Apply task filtering if needed + if filtered_task_names != benchmark_tasks: + filtered_benchmark_results = filtered_benchmark_results.filter_tasks( + task_names=list(filtered_task_names) + ) + + # Apply model filtering if needed + if filtered_model_names: + filtered_benchmark_results = filtered_benchmark_results.select_models( + list(filtered_model_names) + ) + + summary = apply_summary_styling_from_benchmark( + benchmark, filtered_benchmark_results + ) + per_task = apply_per_task_styling_from_benchmark( + benchmark, filtered_benchmark_results + ) elapsed = time.time() - start_time logger.debug(f"update_tables callback: {elapsed}s") return summary, per_task diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 5286680dc9..623e508caa 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -1,9 +1,5 @@ from __future__ import annotations -import math -import re -from collections import defaultdict - import gradio as gr import matplotlib.pyplot as plt import numpy as np @@ -11,46 +7,11 @@ from matplotlib.colors import LinearSegmentedColormap from pandas.api.types import is_numeric_dtype -from mteb.models.overview import get_model_meta -from mteb.overview import get_task, get_tasks - - -def borda_count(scores: pd.Series) -> pd.Series: - n = len(scores) - ranks = scores.rank(method="average", ascending=False) - counts = n - ranks - return counts - - -def get_borda_rank(score_table: pd.DataFrame) -> pd.Series: - borda_counts = score_table.apply(borda_count, axis="index") - mean_borda = borda_counts.sum(axis=1) - return mean_borda.rank(method="min", ascending=False).astype(int) - def format_scores(score: float) -> float: return round(score * 100, 2) -def 
format_n_parameters(n_parameters) -> str: - if (n_parameters is None) or (not int(n_parameters)): - return "Unknown" - n_thousand = int(n_parameters // 1e3) - if n_thousand < 1: - return str(int(n_parameters)) - n_zeros = math.log10(n_thousand) - if n_zeros >= 6: - return str(n_thousand // (10**6)) + "B" - if n_zeros >= 3: - return str(n_thousand // (10**3)) + "M" - return str(n_thousand) + "K" - - -def split_on_capital(s: str) -> str: - """Splits on capital letters and joins with spaces""" - return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s)) - - def get_column_types(df: pd.DataFrame) -> list[str]: types = [] for column_name in df.columns: @@ -78,39 +39,6 @@ def get_column_widths(df: pd.DataFrame) -> list[str]: return widths -def get_means_per_types(per_task: pd.DataFrame): - task_names_per_type = defaultdict(list) - for task_name in per_task.columns: - task_type = get_task(task_name).metadata.type - task_names_per_type[task_type].append(task_name) - records = [] - for task_type, tasks in task_names_per_type.items(): - for model_name, scores in per_task.iterrows(): - records.append( - dict( - model_name=model_name, - task_type=task_type, - score=scores[tasks].mean(skipna=False), - ) - ) - return pd.DataFrame.from_records(records) - - -def failsafe_get_model_meta(model_name): - try: - return get_model_meta(model_name) - except Exception: - return None - - -def format_max_tokens(max_tokens: float | None) -> str: - if max_tokens is None: - return "Unknown" - if max_tokens == np.inf: - return "Infinite" - return str(int(max_tokens)) - - def format_zero_shot(zero_shot_percentage: int): if zero_shot_percentage == -1: return "⚠️ NA" @@ -128,119 +56,58 @@ def create_light_green_cmap(): return light_green_cmap -def scores_to_tables(scores_long: list[dict], search_query: str | None = None): - if not scores_long: - no_results_frame = pd.DataFrame( - {"No results": ["You can try relaxing your criteria"]} - ) - return gr.DataFrame(no_results_frame), gr.DataFrame(no_results_frame) - data = pd.DataFrame.from_records(scores_long) - per_task = data.pivot(index="model_name", columns="task_name", values="score") - mean_per_type = get_means_per_types(per_task) - mean_per_type = mean_per_type.pivot( - index="model_name", columns="task_type", values="score" - ) - mean_per_type.columns = [ - split_on_capital(column) for column in mean_per_type.columns - ] - to_remove = per_task.isna().all(axis="columns") - if search_query: - names = per_task.index.get_level_values("model_name") - names = pd.Series(names, index=per_task.index) - to_remove |= ~names.str.contains(search_query, regex=True) - if to_remove.all(): - no_results_frame = pd.DataFrame( - {"No results": ["You can try relaxing your criteria"]} - ) - return gr.DataFrame(no_results_frame), gr.DataFrame(no_results_frame) - models_to_remove = list(per_task[to_remove].index) - typed_mean = mean_per_type.mean(skipna=False, axis=1) - overall_mean = per_task.mean(skipna=False, axis=1) - joint_table = mean_per_type.copy() - per_task = per_task.drop(models_to_remove, axis=0) - joint_table = joint_table.drop(models_to_remove, axis=0) - joint_table.insert(0, "mean", overall_mean) - joint_table.insert(1, "mean_by_task_type", typed_mean) - joint_table["borda_rank"] = get_borda_rank(per_task) - joint_table = joint_table.sort_values("borda_rank", ascending=True) - per_task["borda_rank"] = joint_table["borda_rank"] - per_task = per_task.sort_values("borda_rank", ascending=True) - per_task = per_task.drop(columns=["borda_rank"]) - joint_table = 
joint_table.reset_index() - model_metas = joint_table["model_name"].map(failsafe_get_model_meta) - joint_table = joint_table[model_metas.notna()] - joint_table["model_link"] = model_metas.map(lambda m: m.reference) - joint_table.insert( - 1, - "Max Tokens", - model_metas.map(lambda m: format_max_tokens(m.max_tokens)), - ) - joint_table.insert( - 1, - "Embedding Dimensions", - model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"), - ) - joint_table.insert( - 1, - "Number of Parameters", - model_metas.map(lambda m: format_n_parameters(m.n_parameters)), - ) - joint_table.insert( - 1, - "Memory Usage (MB)", - model_metas.map( - lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown" - ), - ) - tasks = get_tasks(tasks=list(data["task_name"].unique())) - joint_table.insert( - 1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks)) - ) - joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) - # joint_table = joint_table[joint_table["Zero-shot"].notna()] - # Removing HF organization from model - joint_table["model_name"] = joint_table["model_name"].map( - lambda name: name.split("/")[-1] - ) - # Adding markdown link to model names - name_w_link = ( - "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")" - ) - joint_table["model_name"] = joint_table["model_name"].mask( - joint_table["model_link"].notna(), name_w_link - ) - joint_table = joint_table.drop(columns=["model_link"]) - joint_table = joint_table.rename( - columns={ - "model_name": "Model", - "mean_by_task_type": "Mean (TaskType)", - "mean": "Mean (Task)", - } - ) - per_task = per_task.reset_index() - per_task["model_name"] = per_task["model_name"].map( - lambda name: name.split("/")[-1] - ) - per_task = per_task.rename( - columns={ - "model_name": "Model", - } - ) - joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) - column_types = get_column_types(joint_table) - # setting model name column to markdown - column_types[1] = "markdown" - score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns] +def apply_summary_styling_from_benchmark( + benchmark_instance, benchmark_results +) -> gr.DataFrame: + """Apply styling to summary table created by the benchmark instance's _create_summary_table method. + + This supports polymorphism - different benchmark classes can have different table generation logic. + + Args: + benchmark_instance: The benchmark instance (could be Benchmark, RTEBBenchmark, etc.) + benchmark_results: BenchmarkResults object containing model results (may be pre-filtered) + + Returns: + Styled gr.DataFrame ready for display in the leaderboard + """ + # Use the instance method to support polymorphism + summary_df = benchmark_instance._create_summary_table(benchmark_results) + + # If it's a no-results DataFrame, return it as-is + if "No results" in summary_df.columns: + return gr.DataFrame(summary_df) + + # Apply the styling + return _apply_summary_table_styling(summary_df) + + +def apply_per_task_styling_from_benchmark( + benchmark_instance, benchmark_results +) -> gr.DataFrame: + """Apply styling to per-task table created by the benchmark instance's _create_per_task_table method. + + This supports polymorphism - different benchmark classes can have different table generation logic. + + Args: + benchmark_instance: The benchmark instance (could be Benchmark, RTEBBenchmark, etc.) 
+ benchmark_results: BenchmarkResults object containing model results (may be pre-filtered) + + Returns: + Styled gr.DataFrame ready for display in the leaderboard + """ + # Use the instance method to support polymorphism + per_task_df = benchmark_instance._create_per_task_table(benchmark_results) - return joint_table, per_task, score_columns, column_types + # If it's a no-results DataFrame, return it as-is + if "No results" in per_task_df.columns: + return gr.DataFrame(per_task_df) + # Apply the styling + return _apply_per_task_table_styling(per_task_df) -def apply_styling( - joint_table: pd.DataFrame, - per_task: pd.DataFrame, - score_columns: list[str], - column_types: list[str], -) -> tuple[gr.DataFrame, gr.DataFrame]: + +def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame: + """Apply styling to a raw summary DataFrame""" excluded_columns = [ "Rank (Borda)", "Model", @@ -249,18 +116,27 @@ def apply_styling( "Max Tokens", "Memory Usage (MB)", ] + gradient_columns = [ col for col in joint_table.columns if col not in excluded_columns ] light_green_cmap = create_light_green_cmap() + + # Determine score columns (before formatting) + score_columns = [ + col + for col in joint_table.columns + if col not in excluded_columns + ["Zero-shot"] + ] + numeric_data = joint_table.copy() + + # Format data for display joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) joint_table[score_columns] = joint_table[score_columns].map(format_scores) + joint_table_style = joint_table.style.format( - { - **dict.fromkeys(score_columns, "{:.2f}"), - "Rank (Borda)": "{:.0f}", - }, + {**dict.fromkeys(score_columns, "{:.2f}"), "Rank (Borda)": "{:.0f}"}, na_rep="", ) joint_table_style = joint_table_style.highlight_min( @@ -289,58 +165,45 @@ def apply_styling( vmax=100, gmap=gmap_values.loc[mask], ) + + column_types = get_column_types(joint_table_style.data) + # setting model name column to markdown + if len(column_types) > 1: + column_types[1] = "markdown" + + column_widths = get_column_widths(joint_table_style.data) + if len(column_widths) > 0: + column_widths[0] = "100px" + if len(column_widths) > 1: + column_widths[1] = "250px" + + return gr.DataFrame( + joint_table_style, + datatype=column_types, + interactive=False, + pinned_columns=3, + column_widths=column_widths, + wrap=True, + show_fullscreen_button=True, + show_copy_button=True, + show_search="filter", + ) + + +def _apply_per_task_table_styling(per_task: pd.DataFrame) -> gr.DataFrame: + """Apply styling to a raw per-task DataFrame""" task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 + per_task_style = per_task.style.format( "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") - # TODO: uncomment this when Gradio fixes it. 
- # The fix is already merged and contained in this release: https://github.com/gradio-app/gradio/pull/11032 - # It will be available in Gradio 5.25.3 - # for col in task_score_columns: - # if col != "Model": - # mask = per_task[col].notna() - # per_task_style = per_task_style.background_gradient( - # cmap=light_green_cmap, - # subset=pd.IndexSlice[mask, col], - # gmap=per_task[col].loc[mask], - # ) - column_widths = get_column_widths(joint_table_style.data) - column_widths[0] = "100px" - column_widths[1] = "250px" - return ( - gr.DataFrame( - joint_table_style, - datatype=column_types, - interactive=False, - pinned_columns=3, - column_widths=column_widths, - wrap=True, - show_fullscreen_button=True, - show_copy_button=True, - show_search="filter", - ), - gr.DataFrame( - per_task_style, - interactive=False, - pinned_columns=1, - show_fullscreen_button=True, - show_copy_button=True, - show_search="filter", - ), - ) - -def create_tables( - scores_long: list[dict], search_query: str | None = None -) -> tuple[gr.DataFrame, gr.DataFrame]: - result = scores_to_tables(scores_long, search_query) - # dataframe with No Results is returned, so no need to apply styling - if len(result) == 2: - joint_table, per_task = result - return joint_table, per_task - joint_table, per_task, score_columns, column_types = result - summary_table, per_task_table = apply_styling( - joint_table, per_task, score_columns, column_types + return gr.DataFrame( + per_task_style, + interactive=False, + pinned_columns=1, + show_fullscreen_button=True, + show_copy_button=True, + show_search="filter", ) - return summary_table, per_task_table diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index 4c83d3b156..679415b081 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -107,6 +107,7 @@ def filter_tasks( domains: list[TASK_DOMAIN] | None = None, task_types: list[TASK_TYPE] | None = None, modalities: list[MODALITIES] | None = None, + privacy: Literal["public", "private"] | None = None, ) -> ModelResult: # TODO: v2 see filter_tasks in BenchmarkResults - but can be moved to a private function or removed new_task_results = [] @@ -127,6 +128,10 @@ def filter_tasks( task_modalities = getattr(task_result, "modalities", []) if not any(modality in task_modalities for modality in modalities): continue + if (privacy is not None) and ( + task_result.task_is_public != (privacy == "public") + ): + continue new_task_results.append(task_result) return type(self).model_construct( model_name=self.model_name, @@ -395,6 +400,7 @@ def filter_tasks( domains: list[TASK_DOMAIN] | None = None, task_types: list[TASK_TYPE] | None = None, # type: ignore modalities: list[MODALITIES] | None = None, + privacy: Literal["public", "private"] | None = None, ) -> BenchmarkResults: # TODO: Same as filter_models model_results = [ @@ -404,6 +410,7 @@ def filter_tasks( domains=domains, task_types=task_types, modalities=modalities, + privacy=privacy, ) for res in self.model_results ] diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index c8218075c9..3aaf70fd79 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -228,6 +228,10 @@ def domains(self) -> list[str]: def task_type(self) -> str: return self.task.metadata.type + @property + def task_is_public(self) -> bool: + return getattr(self.task.metadata, "is_public", True) + def to_dict(self) -> dict: return self.model_dump() diff --git 
a/mteb/models/bmretriever_models.py b/mteb/models/bmretriever_models.py index c579682947..b14b710976 100644 --- a/mteb/models/bmretriever_models.py +++ b/mteb/models/bmretriever_models.py @@ -37,12 +37,20 @@ def __init__( self.model_name = model_name self.instruction_template = instruction_template self.apply_instruction_to_passages = apply_instruction_to_passages - self.add_eos_token = add_eos_token self.prompts_dict = prompts_dict + tokenizer_params = {} + if add_eos_token: + tokenizer_params["add_eos_token"] = add_eos_token + if max_seq_length is not None: + tokenizer_params["model_max_length"] = max_seq_length + if padding_side is not None: + tokenizer_params["padding_side"] = padding_side + + kwargs.setdefault("tokenizer_args", {}).update(tokenizer_params) + transformer = Transformer( model_name, - max_seq_length=max_seq_length, **kwargs, ) pooling = Pooling( @@ -50,12 +58,6 @@ def __init__( ) self.model = SentenceTransformer(modules=[transformer, pooling]) - if max_seq_length is not None: - self.model.max_seq_length = max_seq_length - - if padding_side is not None: - self.model.tokenizer.padding_side = padding_side - # https://huggingface.co/datasets/BMRetriever/biomed_retrieval_dataset BMRETRIEVER_TRAINING_DATA = { @@ -158,7 +160,7 @@ def __init__( loader=partial( BMRetrieverWrapper, model_name="BMRetriever/BMRetriever-7B", - config_args={"revision": "e3569bfbcfe3a1bc48c142e11a7b0f38e86065a3"}, + config_args={"revision": "13e6adb9273c5f254e037987d6b44e9e4b005b9a"}, model_args={"torch_dtype": torch.float32}, instruction_template=instruction_template, padding_side="left", @@ -168,7 +170,7 @@ def __init__( name="BMRetriever/BMRetriever-7B", languages=["eng-Latn"], open_weights=True, - revision="e3569bfbcfe3a1bc48c142e11a7b0f38e86065a3", + revision="13e6adb9273c5f254e037987d6b44e9e4b005b9a", release_date="2024-04-29", embed_dim=4096, n_parameters=7_110_660_096, diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index dbbfd35dfa..e076db7b7f 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -1,7 +1,7 @@ from __future__ import annotations from functools import partial -from typing import Any +from typing import Any, Literal, get_args import numpy as np import torch @@ -123,6 +123,13 @@ "zul-Latn", ] +EMBEDDING_TYPE = Literal[ + "float", + "int8", + "uint8", + "binary", +] + # Implementation follows https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/main/src/seb/registered_models/cohere_models.py class CohereTextEmbeddingModel(Wrapper): @@ -131,11 +138,16 @@ def __init__( model_name: str, sep: str = " ", model_prompts: dict[str, str] | None = None, + embedding_type: EMBEDDING_TYPE = "float", + output_dimension: int | None = None, **kwargs, ) -> None: self.model_name = model_name self.sep = sep self.model_prompts = self.validate_task_to_prompt_name(model_prompts) + assert embedding_type in get_args(EMBEDDING_TYPE) + self.embedding_type = embedding_type + self.output_dimension = output_dimension def _embed( self, @@ -160,11 +172,16 @@ def _embed( for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): while retries > 0: # Cohere's API is not always reliable try: - response = client.embed( - texts=batch, - model=self.model_name, - input_type=cohere_task_type, - ) + embed_kwargs = { + "texts": batch, + "model": self.model_name, + "input_type": cohere_task_type, + "embedding_types": [self.embedding_type], + } + if self.output_dimension is not None: + embed_kwargs["output_dimension"] = 
self.output_dimension + + response = client.embed(**embed_kwargs) break except Exception as e: print(f"Retrying... {retries} retries left.") @@ -172,9 +189,43 @@ def _embed( if retries == 0: raise e - all_embeddings.extend(torch.tensor(response.embeddings).numpy()) + # Get embeddings based on requested type + if self.embedding_type == "float": + embeddings = response.embeddings.float + elif self.embedding_type == "int8": + embeddings = response.embeddings.int8 + elif self.embedding_type == "uint8": + embeddings = response.embeddings.uint8 + elif self.embedding_type == "binary": + embeddings = response.embeddings.binary + else: + raise ValueError(f"Embedding type {self.embedding_type} not allowed") + all_embeddings.extend(torch.tensor(embeddings).numpy()) + + embeddings_array = np.array(all_embeddings) + + # Post-process embeddings based on type (similar to voyage_models.py) + primary_embedding_type = self.embedding_type + + if primary_embedding_type == "binary": + # Unpack bit-packed embeddings: each byte contains 8 embedding values + unpacked_embeddings = [] + for embedding in embeddings_array: + # Convert bytes to bits and unpack + unpacked = [] + for byte_val in embedding: + # Extract 8 bits from each byte (LSB first) + for bit_pos in range(8): + bit_val = (byte_val >> bit_pos) & 1 + # Convert 0/1 to -1/1 for binary (signed) + unpacked.append(1.0 if bit_val else -1.0) + unpacked_embeddings.append(unpacked) + embeddings_array = np.array(unpacked_embeddings, dtype=np.float32) + elif primary_embedding_type in ["int8", "uint8"]: + # Convert int8/uint8 embeddings to float32 + embeddings_array = embeddings_array.astype(np.float32) - return np.array(all_embeddings) + return embeddings_array def encode( self, diff --git a/mteb/models/cohere_v.py b/mteb/models/cohere_v.py index 22aa2c8d36..731d3addb9 100644 --- a/mteb/models/cohere_v.py +++ b/mteb/models/cohere_v.py @@ -5,7 +5,7 @@ import os import time from functools import partial -from typing import Any +from typing import Any, Literal, get_args import torch from PIL import Image @@ -16,6 +16,33 @@ from mteb.model_meta import ModelMeta from mteb.requires_package import requires_image_dependencies, requires_package + +def _post_process_embeddings( + embeddings_array: torch.Tensor, embedding_type: str +) -> torch.Tensor: + """Post-process embeddings based on type (similar to voyage_models.py)""" + if embedding_type == "binary": + # Unpack bit-packed embeddings: each byte contains 8 embedding values + unpacked_embeddings = [] + for embedding in embeddings_array: + # Convert bytes to bits and unpack + unpacked = [] + for byte_val in embedding: + # Extract 8 bits from each byte (LSB first) + for bit_pos in range(8): + bit_val = (byte_val >> bit_pos) & 1 + # Convert 0/1 to -1/1 for binary (signed) + unpacked.append(1.0 if bit_val else -1.0) + unpacked_embeddings.append(unpacked) + return torch.tensor(unpacked_embeddings, dtype=torch.float32) + elif embedding_type in ["int8", "uint8"]: + # Convert int8/uint8 embeddings to float32 + return embeddings_array.float() + else: + # For float and other types, return as-is + return embeddings_array + + all_languages = [ "afr-Latn", "amh-Ethi", @@ -128,6 +155,13 @@ "zul-Latn", ] +EMBEDDING_TYPE = Literal[ + "float", + "int8", + "uint8", + "binary", +] + def cohere_v_loader(**kwargs): model_name = kwargs.get("model_name", "Cohere") @@ -140,6 +174,8 @@ class CohereMultiModalModelWrapper: def __init__( self, model_name: str, + embedding_type: EMBEDDING_TYPE = "float", + output_dimension: int | None = None, 
**kwargs: Any, ): """Wrapper for Cohere multimodal embedding model, @@ -152,6 +188,9 @@ def __init__( from torchvision import transforms self.model_name = model_name + assert embedding_type in get_args(EMBEDDING_TYPE) + self.embedding_type = embedding_type + self.output_dimension = output_dimension api_key = os.getenv("COHERE_API_KEY") self.client = cohere.ClientV2(api_key) self.image_format = "JPEG" @@ -170,14 +209,39 @@ def get_text_embeddings( for i in tqdm(range(0, len(texts), batch_size)): batch_texts = texts[i : i + batch_size] - response = self.client.embed( - texts=batch_texts, - model=self.model_name, - input_type="search_document", - ) - all_text_embeddings.append(torch.tensor(response.embeddings.float)) + embed_kwargs = { + "texts": batch_texts, + "model": self.model_name, + "input_type": "search_document", + "embedding_types": [self.embedding_type], + } + if self.output_dimension is not None: + embed_kwargs["output_dimension"] = self.output_dimension + + response = self.client.embed(**embed_kwargs) + + # Get embeddings based on requested type + if self.embedding_type == "float": + embeddings = response.embeddings.float + elif self.embedding_type == "int8": + embeddings = response.embeddings.int8 + elif self.embedding_type == "uint8": + embeddings = response.embeddings.uint8 + elif self.embedding_type == "binary": + embeddings = response.embeddings.binary + else: + raise ValueError( + f"Embedding type {self.embedding_type} not allowed" + ) + all_text_embeddings.append(torch.tensor(embeddings)) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) + + # Post-process embeddings based on type + all_text_embeddings = _post_process_embeddings( + all_text_embeddings, self.embedding_type + ) + return all_text_embeddings def get_image_embeddings( @@ -206,15 +270,31 @@ def get_image_embeddings( image_base64 = ( f"data:{content_type};base64,{stringified_buffer}" ) - response = self.client.embed( - model=self.model_name, - input_type="image", - embedding_types=["float"], - images=[image_base64], - ) - all_image_embeddings.append( - torch.tensor(response.embeddings.float) - ) + embed_kwargs = { + "model": self.model_name, + "input_type": "image", + "embedding_types": [self.embedding_type], + "images": [image_base64], + } + if self.output_dimension is not None: + embed_kwargs["output_dimension"] = self.output_dimension + + response = self.client.embed(**embed_kwargs) + + # Get embeddings based on requested type + if self.embedding_type == "float": + embeddings = response.embeddings.float + elif self.embedding_type == "int8": + embeddings = response.embeddings.int8 + elif self.embedding_type == "uint8": + embeddings = response.embeddings.uint8 + elif self.embedding_type == "binary": + embeddings = response.embeddings.binary + else: + raise ValueError( + f"Embedding type {self.embedding_type} not allowed" + ) + all_image_embeddings.append(torch.tensor(embeddings)) time.sleep(1.5) else: for i in tqdm(range(0, len(images), batch_size)): @@ -231,17 +311,38 @@ def get_image_embeddings( image_base64 = ( f"data:{content_type};base64,{stringified_buffer}" ) - response = self.client.embed( - model=self.model_name, - input_type="image", - embedding_types=["float"], - images=[image_base64], - ) - all_image_embeddings.append( - torch.tensor(response.embeddings.float) - ) + embed_kwargs = { + "model": self.model_name, + "input_type": "image", + "embedding_types": [self.embedding_type], + "images": [image_base64], + } + if self.output_dimension is not None: + embed_kwargs["output_dimension"] = 
self.output_dimension + + response = self.client.embed(**embed_kwargs) + + # Get embeddings based on requested type + if self.embedding_type == "float": + embeddings = response.embeddings.float + elif self.embedding_type == "int8": + embeddings = response.embeddings.int8 + elif self.embedding_type == "uint8": + embeddings = response.embeddings.uint8 + elif self.embedding_type == "binary": + embeddings = response.embeddings.binary + else: + # Fallback for unknown types + embeddings = response.embeddings.float + all_image_embeddings.append(torch.tensor(embeddings)) time.sleep(1.5) all_image_embeddings = torch.cat(all_image_embeddings, dim=0) + + # Post-process embeddings based on type + all_image_embeddings = _post_process_embeddings( + all_image_embeddings, self.embedding_type + ) + return all_image_embeddings def calculate_probs(self, text_embeddings, image_embeddings): @@ -360,3 +461,49 @@ def get_fused_embeddings( use_instructions=False, training_datasets=None, ) + +cohere_embed_v4_multimodal_binary = ModelMeta( + loader=partial(cohere_v_loader, model_name="embed-v4.0", embedding_type="binary"), + name="Cohere/Cohere-embed-v4.0 (output_dtype=binary)", + languages=all_languages, + revision="1", + release_date="2024-12-01", + n_parameters=None, + memory_usage_mb=None, + max_tokens=128000, + embed_dim=1536, + license=None, + similarity_fn_name="cosine", + framework=[], + modalities=["image", "text"], + open_weights=False, + public_training_code=None, + public_training_data=None, + reference="https://docs.cohere.com/docs/embeddings", + use_instructions=False, + training_datasets=None, + adapted_from="Cohere/Cohere-embed-v4.0", +) + +cohere_embed_v4_multimodal_int8 = ModelMeta( + loader=partial(cohere_v_loader, model_name="embed-v4.0", embedding_type="int8"), + name="Cohere/Cohere-embed-v4.0 (output_dtype=int8)", + languages=all_languages, + revision="1", + release_date="2024-12-01", + n_parameters=None, + memory_usage_mb=None, + max_tokens=128000, + embed_dim=1536, + license=None, + similarity_fn_name="cosine", + framework=[], + modalities=["image", "text"], + open_weights=False, + public_training_code=None, + public_training_data=None, + reference="https://docs.cohere.com/docs/embeddings", + use_instructions=False, + training_datasets=None, + adapted_from="Cohere/Cohere-embed-v4.0", +) diff --git a/mteb/models/colpali_models.py b/mteb/models/colpali_models.py index 35396b92d6..24f668fbec 100644 --- a/mteb/models/colpali_models.py +++ b/mteb/models/colpali_models.py @@ -131,7 +131,7 @@ def calculate_probs(self, text_embeddings, image_embeddings): return scores.softmax(dim=-1) def similarity(self, a, b): - return self.processor.score(a, b, **self.processor_kwargs) + return self.processor.score(a, b, device=self.device) class ColPaliWrapper(ColPaliEngineWrapper): diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index c5be2a672d..4c085459f7 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -118,16 +118,22 @@ def __init__( "No instruction template provided. Instructions will be used as-is." 
) + tokenizer_params = {} + if add_eos_token: + tokenizer_params["add_eos_token"] = add_eos_token + if max_seq_length is not None: + # https://github.com/UKPLab/sentence-transformers/blob/7341bf155b4349b88690b78c84beb5aa658c439f/sentence_transformers/models/Transformer.py#L115 + tokenizer_params["model_max_length"] = max_seq_length + if padding_side is not None: + tokenizer_params["padding_side"] = padding_side + + kwargs.setdefault("tokenizer_kwargs", {}).update(tokenizer_params) + self.model_name = model_name self.model = SentenceTransformer(model_name, revision=revision, **kwargs) self.instruction_template = instruction_template self.apply_instruction_to_passages = apply_instruction_to_passages - self.add_eos_token = add_eos_token self.prompts_dict = prompts_dict - if max_seq_length is not None: - self.model.max_seq_length = max_seq_length - if padding_side is not None: - self.model.tokenizer.padding_side = padding_side def encode( self, @@ -137,15 +143,9 @@ def encode( prompt_type: PromptType | None = None, **kwargs: Any, ) -> np.ndarray: - if self.add_eos_token: - sentences = [ - example + self.model.tokenizer.eos_token for example in sentences - ] - instruction = self.get_task_instruction( task_name, prompt_type, self.prompts_dict ) - # to passage prompts won't be applied to passages if ( not self.apply_instruction_to_passages diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 9c3be1a76a..af492ef8e7 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -15,44 +15,215 @@ ) GIGA_task_prompts = { - "TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise\nquery: ", - "STS22": "Retrieve semantically similar text\nquery: ", - "RuSTSBenchmarkSTS": "Retrieve semantically similar text\nquery: ", - "RUParaPhraserSTS": "Retrieve semantically similar text\nquery: ", - "CEDRClassification": "Дан комментарий, определи выраженную в нем эмоцию (радость, грусть, удивление, страх, гнев или нейтрально) \nкомментарий: ", - "GeoreviewClassification": "Classify the organization rating based on the reviews\nquery: ", - "GeoreviewClusteringP2P": "Классифицируй рейтинг организации на основе отзыва \nотзыв: ", - "HeadlineClassification": "Классифицируй тему данного новостного заголовка \nзаголовок: ", - "InappropriatenessClassification": "Классифицируй данный комментарий как токсичный или не токсичный \nкомментарий: ", - "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text\nquery: ", - "MassiveIntentClassification": "Given a user utterance as query, find the user intents\nquery: ", - "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios\nquery: ", - "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment\nquery: ", - "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts\nquery: ", - "RuSciBenchGRNTIClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: ", - "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts\nquery: ", - "RuSciBenchOECDClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: ", - "SensitiveTopicsClassification": "Классифицируй чувствительную тему по запросу \nзапрос: ", + "TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise", + "RuSTSBenchmarkSTS": "Retrieve 
semantically similar text", + "RUParaPhraserSTS": "Retrieve semantically similar text", + "CEDRClassification": "Дан комментарий, определи выраженную в нем эмоцию (радость, грусть, удивление, страх, гнев или нейтрально)", + "GeoreviewClassification": "Classify the organization rating based on the reviews", + "GeoreviewClusteringP2P": "Классифицируй рейтинг организации на основе отзыва", + "HeadlineClassification": "Классифицируй тему данного новостного заголовка", + "InappropriatenessClassification": "Классифицируй данный комментарий как токсичный или не токсичный", + "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text", + "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment", + "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts", + "RuSciBenchGRNTIClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации", + "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts", + "RuSciBenchOECDClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации", + "SensitiveTopicsClassification": "Классифицируй чувствительную тему по запросу", "RuBQRetrieval": { - "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "query": "Given a question, retrieve Wikipedia passages that answer the question", "document": "", }, "RuBQReranking": { - "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "query": "Given a question, retrieve Wikipedia passages that answer the question", "document": "", }, "RiaNewsRetrieval": { - "query": "Given a news title, retrieve relevant news article\nquery: ", + "query": "Given a news title, retrieve relevant news article", "document": "", }, "MIRACLReranking": { - "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "query": "Given a question, retrieve Wikipedia passages that answer the question", "document": "", }, "MIRACLRetrieval": { - "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "query": "Given a question, retrieve Wikipedia passages that answer the question", "document": "", }, + "ArguAna": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "Given a search query, retrieve passages that answer the question", + }, + "CQADupstackAndroidRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackEnglishRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackGamingRetrieval": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "Given a search query, retrieve passages that answer the question", + }, + "CQADupstackGisRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackMathematicaRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackPhysicsRetrieval": { + "query": "Given a question, retrieve 
detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "",
+    },
+    "CQADupstackProgrammersRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "",
+    },
+    "CQADupstackStatsRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "",
+    },
+    "CQADupstackTexRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "",
+    },
+    "CQADupstackUnixRetrieval": {
+        "query": "Given a search query, retrieve passages that answer the question",
+        "document": "Given a search query, retrieve passages that answer the question",
+    },
+    "CQADupstackWebmastersRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "",
+    },
+    "CQADupstackWordpressRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "",
+    },
+    "ClimateFEVER": {
+        "query": "Given a claim about climate change, retrieve documents that support or refute the claim",
+        "document": "",
+    },
+    "ClimateFEVERHardNegatives": {
+        "query": "Given a search query, retrieve passages that answer the question",
+        "document": "",
+    },
+    "DBPedia": {
+        "query": "Given a query, retrieve relevant entity descriptions from DBPedia",
+        "document": "",
+    },
+    "FEVER": {
+        "query": "Given a claim, retrieve documents that support or refute the claim",
+        "document": "",
+    },
+    "FEVERHardNegatives": {
+        "query": "Given a search query, retrieve passages that answer the question",
+        "document": "",
+    },
+    "FiQA2018": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "HotpotQA": {
+        "query": "Given a multi-hop question, retrieve documents that can help answer the question",
+        "document": "",
+    },
+    "HotpotQAHardNegatives": {
+        "query": "Given a search query, retrieve passages that answer the question",
+        "document": "",
+    },
+    "MSMARCO": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "NFCorpus": {
+        "query": "Given a question, retrieve relevant documents that best answer the question",
+        "document": "",
+    },
+    "NQ": {
+        "query": "Given a question, retrieve Wikipedia passages that answer the question",
+        "document": "",
+    },
+    "QuoraRetrieval": {
+        "query": "Given a question, retrieve questions that are semantically equivalent to the given question",
+        "document": "",
+    },
+    "SCIDOCS": {
+        "query": "Given a search query, retrieve passages that answer the question",
+        "document": "",
+    },
+    "SciFact": {
+        "query": "Given a scientific claim, retrieve documents that support or refute the claim",
+        "document": "",
+    },
+    "TRECCOVID": {
+        "query": "Given a search query, retrieve passages that answer the question",
+        "document": "",
+    },
+    "Touche2020": {
+        "query": "Given a question, retrieve detailed and persuasive arguments that answer the question",
+        "document": "",
+    },
+    "Touche2020Retrieval.v3": {
+        "query": "Given a search query, retrieve passages that answer the question",
+        "document": "",
+    },
+    "BIOSSES": "Retrieve semantically similar text",
+    "SICK-R": "Retrieve
semantically similar text", + "STS12": "Retrieve semantically similar text", + "STS13": "Retrieve semantically similar text", + "STS14": "Retrieve semantically similar text", + "STS15": "Retrieve semantically similar text", + "STS16": "Retrieve semantically similar text", + "STS17": "Retrieve semantically similar text", + "STS22": "Retrieve semantically similar text", + "STS22.v2": "Retrieve semantically similar text", + "STSBenchmark": "Retrieve semantically similar text", + "SummEval": "Given a news summary, retrieve other semantically similar summaries", + "SummEvalSummarization.v2": "Given a news summary, retrieve other semantically similar summaries", + "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual", + "AmazonPolarityClassification": "Classify Amazon reviews into positive or negative sentiment", + "AmazonReviewsClassification": "Classify the given Amazon review into its appropriate rating category", + "Banking77Classification": "Given a online banking query, find the corresponding intents", + "EmotionClassification": "Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", + "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation", + "MTOPIntentClassification": "Classify the intent of the given utterance in task-oriented conversation", + "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic", + "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral", + "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "ArXivHierarchicalClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArXivHierarchicalClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", + "BiorxivClusteringP2P.v2": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstract", + "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", + "MedrxivClusteringP2P.v2": "Identify the main category of Medrxiv papers based on the titles and abstract", + "MedrxivClusteringS2S.v2": "Identify the main category of Medrxiv papers based on the titles", + "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles", + "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", + "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the titles", + "StackExchangeClusteringP2P": "Identify the topic or theme of 
StackExchange posts based on the given paragraphs", + "StackExchangeClustering.v2": "Identify the topic or theme of StackExchange posts based on the titles", + "StackExchangeClusteringP2P.v2": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", + "TwentyNewsgroupsClustering.v2": "Identify the topic or theme of the given news articles", + "AskUbuntuDupQuestions": { + "query": "Retrieve duplicate questions from AskUbuntu forum", + "document": "Retrieve duplicate questions from AskUbuntu forum", + }, + "MindSmallReranking": "Given a search query, retrieve passages that answer the question", + "SciDocsRR": "Given a title of a scientific paper, retrieve the titles of other relevant papers", + "StackOverflowDupQuestions": "Retrieve duplicate questions from StackOverflow forum", + "SprintDuplicateQuestions": "Retrieve duplicate questions from Sprint forum", + "TwitterSemEval2015": "Retrieve tweets that are semantically similar to the given tweet", + "TwitterURLCorpus": "Retrieve tweets that are semantically similar to the given tweet", } rubert_tiny = ModelMeta( @@ -633,12 +804,12 @@ loader=partial( # type: ignore InstructSentenceTransformerWrapper, model_name="ai-sage/Giga-Embeddings-instruct", - revision="40b27667b9ad586d7812675df76e5062ccc80b0e", - instruction_template="{instruction}", - max_seq_length=512, - apply_instruction_to_passages=False, - prompts_dict=GIGA_task_prompts, + revision="0ad5b29bfecd806cecc9d66b927d828a736594dc", trust_remote_code=True, + instruction_template="Instruct: {instruction}\nQuery: ", + max_seq_length=4096, + apply_instruction_to_passages=True, + prompts_dict=GIGA_task_prompts, model_kwargs={ "torch_dtype": torch.bfloat16, }, @@ -646,8 +817,8 @@ name="ai-sage/Giga-Embeddings-instruct", languages=["eng-Latn", "rus-Cyrl"], open_weights=True, - revision="40b27667b9ad586d7812675df76e5062ccc80b0e", - release_date="2025-06-05", + revision="0ad5b29bfecd806cecc9d66b927d828a736594dc", + release_date="2025-09-23", n_parameters=3_227_176_961, memory_usage_mb=12865, embed_dim=2048, diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index aaee6ae9da..5e0d882c82 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -205,6 +205,33 @@ def _batched_encode( PromptType.document.value: "document", } +voyage_3_large = ModelMeta( + name="voyageai/voyage-3-large", # Date of publication of this post https://blog.voyageai.com/2025/01/07/voyage-3-large/ + revision="1", + release_date="2025-01-07", + languages=None, # supported languages not specified + loader=partial( # type: ignore + VoyageWrapper, + model_name="voyage-3-large", + max_tokens=32000, + model_prompts=model_prompts, + ), + max_tokens=32000, + embed_dim=1024, + open_weights=False, + n_parameters=None, + memory_usage_mb=None, + license=None, + reference="https://blog.voyageai.com/2025/01/07/voyage-3-large/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, +) + + voyage_3_5 = ModelMeta( name="voyageai/voyage-3.5", revision="1", diff --git a/mteb/models/wrapper.py b/mteb/models/wrapper.py index ccbdc59713..0c0112fbc4 100644 --- a/mteb/models/wrapper.py +++ b/mteb/models/wrapper.py @@ -200,5 +200,5 @@ def get_task_instruction( ) -> str: instruction = self.get_instruction(task_name, prompt_type, prompts_dict) if self.instruction_template: 
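# Illustrative sketch, not part of the patch: how the GIGA_task_prompts entries
# above are meant to be consumed. String entries apply to every input of a task,
# dict entries are looked up per prompt type ("query" vs "document"), and the
# chosen instruction is pushed through the new template
# "Instruct: {instruction}\nQuery: ". This mirrors the wrapper behaviour
# configured in this diff but is not the actual mteb implementation.
from __future__ import annotations


def resolve_giga_prompt(
    task_name: str,
    prompt_type: str | None,
    prompts: dict,
    template: str = "Instruct: {instruction}\nQuery: ",
) -> str:
    entry = prompts.get(task_name, "")
    if isinstance(entry, dict):
        # Asymmetric tasks: separate instructions for queries and documents.
        entry = entry.get(prompt_type or "query", "")
    if not entry:
        # Empty document-side instructions stay empty (documents embedded as-is).
        return ""
    return template.format(instruction=entry)


# Example outputs of this sketch:
# resolve_giga_prompt("RuBQRetrieval", "query", GIGA_task_prompts)
#   -> "Instruct: Given a question, retrieve Wikipedia passages that answer the question\nQuery: "
# resolve_giga_prompt("RuBQRetrieval", "document", GIGA_task_prompts)
#   -> ""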
- return self.format_instruction(instruction) + return self.format_instruction(instruction, prompt_type) return instruction diff --git a/mteb/tasks/MultiLabelClassification/__init__.py b/mteb/tasks/MultiLabelClassification/__init__.py index 096f96a880..a998fb6ac9 100644 --- a/mteb/tasks/MultiLabelClassification/__init__.py +++ b/mteb/tasks/MultiLabelClassification/__init__.py @@ -8,3 +8,5 @@ from .rus.CEDRClassification import * from .rus.ru_toixic_multilabelclassification_okmlcup import * from .rus.SensitiveTopicsClassification import * +from .swe.SwedishPatentCPCGroupClassification import * +from .swe.SwedishPatentCPCSubclassClassification import * diff --git a/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCGroupClassification.py b/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCGroupClassification.py new file mode 100644 index 0000000000..2eb93c61d0 --- /dev/null +++ b/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCGroupClassification.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskMultilabelClassification import ( + AbsTaskMultilabelClassification, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SwedishPatentCPCGroupClassification(AbsTaskMultilabelClassification): + metadata = TaskMetadata( + name="SwedishPatentCPCGroupClassification", + description="""This dataset contains historical Swedish patent documents (1885-1972) classified according to the Cooperative Patent Classification (CPC) system at the group level. Each document can have multiple labels, making this a challenging multi-label classification task with significant class imbalance and data sparsity characteristics. The dataset includes patent claims text extracted from digitally recreated versions of historical Swedish patents, generated using Optical Character Recognition (OCR) from original paper documents. 
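# Illustrative sketch, not part of the patch: the mteb/models/wrapper.py change
# above forwards `prompt_type` into `format_instruction`, which is what allows
# an instruction template to distinguish queries from documents. A template
# written against that two-argument call could look like this (illustration
# only; PromptType is assumed importable from mteb.encoder_interface, as used
# elsewhere in this diff):
from __future__ import annotations

from mteb.encoder_interface import PromptType


def asymmetric_template(instruction: str, prompt_type: PromptType | None) -> str:
    # Embed documents without any instruction prefix, queries with one.
    if prompt_type == PromptType.document:
        return ""
    return f"Instruct: {instruction}\nQuery: "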
The text quality varies due to OCR limitations, but all CPC labels were manually assigned by patent engineers at PRV (Swedish Patent and Registration Office), ensuring high reliability for machine learning applications.""", + reference="https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254", + type="MultilabelClassification", + category="s2s", + modalities=["text"], + eval_splits=["train"], + eval_langs=["swe-Latn"], + main_score="accuracy", + dataset={ + "path": "atheer2104/swedish-patent-cpc-group-new", + "revision": "d1980d69e2fcf11e912025ba6bb1e3afe6b9168a", + }, + date=("1885-01-01", "1972-01-01"), + domains=["Legal", "Government"], + task_subtypes=[], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@mastersthesis{Salim1987995, + author = {Salim, Atheer}, + institution = {KTH, School of Electrical Engineering and Computer Science (EECS)}, + keywords = {Multi-label Text Classification, Machine Learning, Patent Classification, Deep Learning, Natural Language Processing, Textklassificering med flera Klasser, Maskininlärning, Patentklassificering, Djupinlärning, Språkteknologi}, + number = {2025:571}, + pages = {70}, + school = {KTH, School of Electrical Engineering and Computer Science (EECS)}, + series = {TRITA-EECS-EX}, + title = {Machine Learning for Classifying Historical Swedish Patents : A Comparison of Textual and Combined Data Approaches}, + url = {https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254}, + year = {2025}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"], n_samples=8192 + ) + + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], n_samples=2048 + ) diff --git a/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCSubclassClassification.py b/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCSubclassClassification.py new file mode 100644 index 0000000000..f9c0670c74 --- /dev/null +++ b/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCSubclassClassification.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskMultilabelClassification import ( + AbsTaskMultilabelClassification, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SwedishPatentCPCSubclassClassification(AbsTaskMultilabelClassification): + metadata = TaskMetadata( + name="SwedishPatentCPCSubclassClassification", + description="""This dataset contains historical Swedish patent documents (1885-1972) classified according to the Cooperative Patent Classification (CPC) system. Each document can have multiple labels, making this a multi-label classification task with significant implications for patent retrieval and prior art search. + The dataset includes patent claims text extracted from digitally recreated versions of historical Swedish patents, generated using Optical Character Recognition (OCR) from original paper documents. 
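# Illustrative usage sketch, not part of the patch: running the two Swedish
# patent tasks added in this diff through mteb's standard evaluation API once
# the patch is applied (the model id below is only a placeholder; any
# text-embedding model works the same way):
import mteb
from sentence_transformers import SentenceTransformer

tasks = mteb.get_tasks(
    tasks=[
        "SwedishPatentCPCGroupClassification",
        "SwedishPatentCPCSubclassClassification",
    ]
)
model = SentenceTransformer("KBLab/sentence-bert-swedish-cased")
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results/swedish-patent-cpc")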
The text quality varies due to OCR limitations, but all CPC labels were manually assigned by patent engineers at PRV (Swedish Patent and Registration Office), ensuring high reliability for machine learning applications.""", + reference="https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254", + type="MultilabelClassification", + category="s2s", + modalities=["text"], + eval_splits=["train"], + eval_langs=["swe-Latn"], + main_score="accuracy", + dataset={ + "path": "atheer2104/swedish-patent-cpc-subclass-new", + "revision": "114fcab0a716a27cf3f54a7ebd6e08f45f62de88", + }, + date=("1885-01-01", "1972-01-01"), + domains=["Legal", "Government"], + task_subtypes=[], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@mastersthesis{Salim1987995, + author = {Salim, Atheer}, + institution = {KTH, School of Electrical Engineering and Computer Science (EECS)}, + keywords = {Multi-label Text Classification, Machine Learning, Patent Classification, Deep Learning, Natural Language Processing, Textklassificering med flera Klasser, Maskininlärning, Patentklassificering, Djupinlärning, Språkteknologi}, + number = {2025:571}, + pages = {70}, + school = {KTH, School of Electrical Engineering and Computer Science (EECS)}, + series = {TRITA-EECS-EX}, + title = {Machine Learning for Classifying Historical Swedish Patents : A Comparison of Textual and Combined Data Approaches}, + url = {https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254}, + year = {2025}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"], n_samples=8192 + ) + + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], n_samples=2048 + ) diff --git a/mteb/tasks/MultiLabelClassification/swe/__init__.py b/mteb/tasks/MultiLabelClassification/swe/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/pyproject.toml b/pyproject.toml index 96110d5d6c..55da5ac840 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.57" +version = "1.38.61" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ diff --git a/scripts/make_leaderboard.py b/scripts/make_leaderboard.py index 4e322b3210..fff52536c6 100644 --- a/scripts/make_leaderboard.py +++ b/scripts/make_leaderboard.py @@ -7,7 +7,10 @@ import pandas as pd import mteb -from mteb.leaderboard.table import create_tables +from mteb.leaderboard.table import ( + apply_per_task_styling_from_benchmark, + apply_summary_styling_from_benchmark, +) from mteb.load_results import load_results logging.basicConfig(level=logging.INFO) @@ -60,11 +63,14 @@ def load_leaderboard( base_results=benchmark_results ).join_revisions() - # Convert scores into long format - scores_long = benchmark_results_filtered.get_scores(format="long") - # Convert scores into leaderboard tables - summary_gr_df, per_task_gr_df = create_tables(scores_long=scores_long) + loaded_benchmark = mteb.get_benchmark(benchmark.name) + summary_gr_df = apply_summary_styling_from_benchmark( + loaded_benchmark, benchmark_results_filtered + ) + per_task_gr_df = apply_per_task_styling_from_benchmark( + loaded_benchmark, benchmark_results_filtered + ) # Convert Gradio DataFrames to Pandas summary_df = pd.DataFrame(