From 82d9e297eb08a1d9c1f6b1cf854bc316ac978fe5 Mon Sep 17 00:00:00 2001 From: Yongbin Choi Date: Sat, 27 Sep 2025 08:35:26 +0900 Subject: [PATCH 01/20] fix(models): ensure prompt_type is passed to format_instruction (#3216) --- mteb/models/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/wrapper.py b/mteb/models/wrapper.py index ccbdc59713..0c0112fbc4 100644 --- a/mteb/models/wrapper.py +++ b/mteb/models/wrapper.py @@ -200,5 +200,5 @@ def get_task_instruction( ) -> str: instruction = self.get_instruction(task_name, prompt_type, prompts_dict) if self.instruction_template: - return self.format_instruction(instruction) + return self.format_instruction(instruction, prompt_type) return instruction From d0d427de5fa73239a7c0f1811e171919e1772997 Mon Sep 17 00:00:00 2001 From: semantic-release Date: Sat, 27 Sep 2025 00:58:41 +0000 Subject: [PATCH 02/20] 1.38.58 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 96110d5d6c..891cfa51ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.57" +version = "1.38.58" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 08bba494f03ab0933aa1fcdab59b848002069d30 Mon Sep 17 00:00:00 2001 From: fzoll <5575946+fzoll@users.noreply.github.com> Date: Sat, 27 Sep 2025 11:48:55 +0200 Subject: [PATCH 03/20] Adding Cohere's output_dimension and embedding_type parameter (#3204) * Adding Cohere's output_dimension and embedding_type parameter Cohere's embed-v4 binary and int8 * Correcting due to comments --- mteb/models/cohere_models.py | 67 ++++++++++-- mteb/models/cohere_v.py | 197 ++++++++++++++++++++++++++++++----- 2 files changed, 231 insertions(+), 33 deletions(-) diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index dbbfd35dfa..e076db7b7f 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -1,7 +1,7 @@ from __future__ import annotations from functools import partial -from typing import Any +from typing import Any, Literal, get_args import numpy as np import torch @@ -123,6 +123,13 @@ "zul-Latn", ] +EMBEDDING_TYPE = Literal[ + "float", + "int8", + "uint8", + "binary", +] + # Implementation follows https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/main/src/seb/registered_models/cohere_models.py class CohereTextEmbeddingModel(Wrapper): @@ -131,11 +138,16 @@ def __init__( model_name: str, sep: str = " ", model_prompts: dict[str, str] | None = None, + embedding_type: EMBEDDING_TYPE = "float", + output_dimension: int | None = None, **kwargs, ) -> None: self.model_name = model_name self.sep = sep self.model_prompts = self.validate_task_to_prompt_name(model_prompts) + assert embedding_type in get_args(EMBEDDING_TYPE) + self.embedding_type = embedding_type + self.output_dimension = output_dimension def _embed( self, @@ -160,11 +172,16 @@ def _embed( for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): while retries > 0: # Cohere's API is not always reliable try: - response = client.embed( - texts=batch, - model=self.model_name, - input_type=cohere_task_type, - ) + embed_kwargs = { + "texts": batch, + "model": self.model_name, + "input_type": cohere_task_type, + "embedding_types": [self.embedding_type], + } + if self.output_dimension is not None: + embed_kwargs["output_dimension"] = self.output_dimension + + response = client.embed(**embed_kwargs) break except Exception as e: print(f"Retrying... {retries} retries left.") @@ -172,9 +189,43 @@ def _embed( if retries == 0: raise e - all_embeddings.extend(torch.tensor(response.embeddings).numpy()) + # Get embeddings based on requested type + if self.embedding_type == "float": + embeddings = response.embeddings.float + elif self.embedding_type == "int8": + embeddings = response.embeddings.int8 + elif self.embedding_type == "uint8": + embeddings = response.embeddings.uint8 + elif self.embedding_type == "binary": + embeddings = response.embeddings.binary + else: + raise ValueError(f"Embedding type {self.embedding_type} not allowed") + all_embeddings.extend(torch.tensor(embeddings).numpy()) + + embeddings_array = np.array(all_embeddings) + + # Post-process embeddings based on type (similar to voyage_models.py) + primary_embedding_type = self.embedding_type + + if primary_embedding_type == "binary": + # Unpack bit-packed embeddings: each byte contains 8 embedding values + unpacked_embeddings = [] + for embedding in embeddings_array: + # Convert bytes to bits and unpack + unpacked = [] + for byte_val in embedding: + # Extract 8 bits from each byte (LSB first) + for bit_pos in range(8): + bit_val = (byte_val >> bit_pos) & 1 + # Convert 0/1 to -1/1 for binary (signed) + unpacked.append(1.0 if bit_val else -1.0) + unpacked_embeddings.append(unpacked) + embeddings_array = np.array(unpacked_embeddings, dtype=np.float32) + elif primary_embedding_type in ["int8", "uint8"]: + # Convert int8/uint8 embeddings to float32 + embeddings_array = embeddings_array.astype(np.float32) - return np.array(all_embeddings) + return embeddings_array def encode( self, diff --git a/mteb/models/cohere_v.py b/mteb/models/cohere_v.py index 22aa2c8d36..731d3addb9 100644 --- a/mteb/models/cohere_v.py +++ b/mteb/models/cohere_v.py @@ -5,7 +5,7 @@ import os import time from functools import partial -from typing import Any +from typing import Any, Literal, get_args import torch from PIL import Image @@ -16,6 +16,33 @@ from mteb.model_meta import ModelMeta from mteb.requires_package import requires_image_dependencies, requires_package + +def _post_process_embeddings( + embeddings_array: torch.Tensor, embedding_type: str +) -> torch.Tensor: + """Post-process embeddings based on type (similar to voyage_models.py)""" + if embedding_type == "binary": + # Unpack bit-packed embeddings: each byte contains 8 embedding values + unpacked_embeddings = [] + for embedding in embeddings_array: + # Convert bytes to bits and unpack + unpacked = [] + for byte_val in embedding: + # Extract 8 bits from each byte (LSB first) + for bit_pos in range(8): + bit_val = (byte_val >> bit_pos) & 1 + # Convert 0/1 to -1/1 for binary (signed) + unpacked.append(1.0 if bit_val else -1.0) + unpacked_embeddings.append(unpacked) + return torch.tensor(unpacked_embeddings, dtype=torch.float32) + elif embedding_type in ["int8", "uint8"]: + # Convert int8/uint8 embeddings to float32 + return embeddings_array.float() + else: + # For float and other types, return as-is + return embeddings_array + + all_languages = [ "afr-Latn", "amh-Ethi", @@ -128,6 +155,13 @@ "zul-Latn", ] +EMBEDDING_TYPE = Literal[ + "float", + "int8", + "uint8", + "binary", +] + def cohere_v_loader(**kwargs): model_name = kwargs.get("model_name", "Cohere") @@ -140,6 +174,8 @@ class CohereMultiModalModelWrapper: def __init__( self, model_name: str, + embedding_type: EMBEDDING_TYPE = "float", + output_dimension: int | None = None, **kwargs: Any, ): """Wrapper for Cohere multimodal embedding model, @@ -152,6 +188,9 @@ def __init__( from torchvision import transforms self.model_name = model_name + assert embedding_type in get_args(EMBEDDING_TYPE) + self.embedding_type = embedding_type + self.output_dimension = output_dimension api_key = os.getenv("COHERE_API_KEY") self.client = cohere.ClientV2(api_key) self.image_format = "JPEG" @@ -170,14 +209,39 @@ def get_text_embeddings( for i in tqdm(range(0, len(texts), batch_size)): batch_texts = texts[i : i + batch_size] - response = self.client.embed( - texts=batch_texts, - model=self.model_name, - input_type="search_document", - ) - all_text_embeddings.append(torch.tensor(response.embeddings.float)) + embed_kwargs = { + "texts": batch_texts, + "model": self.model_name, + "input_type": "search_document", + "embedding_types": [self.embedding_type], + } + if self.output_dimension is not None: + embed_kwargs["output_dimension"] = self.output_dimension + + response = self.client.embed(**embed_kwargs) + + # Get embeddings based on requested type + if self.embedding_type == "float": + embeddings = response.embeddings.float + elif self.embedding_type == "int8": + embeddings = response.embeddings.int8 + elif self.embedding_type == "uint8": + embeddings = response.embeddings.uint8 + elif self.embedding_type == "binary": + embeddings = response.embeddings.binary + else: + raise ValueError( + f"Embedding type {self.embedding_type} not allowed" + ) + all_text_embeddings.append(torch.tensor(embeddings)) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) + + # Post-process embeddings based on type + all_text_embeddings = _post_process_embeddings( + all_text_embeddings, self.embedding_type + ) + return all_text_embeddings def get_image_embeddings( @@ -206,15 +270,31 @@ def get_image_embeddings( image_base64 = ( f"data:{content_type};base64,{stringified_buffer}" ) - response = self.client.embed( - model=self.model_name, - input_type="image", - embedding_types=["float"], - images=[image_base64], - ) - all_image_embeddings.append( - torch.tensor(response.embeddings.float) - ) + embed_kwargs = { + "model": self.model_name, + "input_type": "image", + "embedding_types": [self.embedding_type], + "images": [image_base64], + } + if self.output_dimension is not None: + embed_kwargs["output_dimension"] = self.output_dimension + + response = self.client.embed(**embed_kwargs) + + # Get embeddings based on requested type + if self.embedding_type == "float": + embeddings = response.embeddings.float + elif self.embedding_type == "int8": + embeddings = response.embeddings.int8 + elif self.embedding_type == "uint8": + embeddings = response.embeddings.uint8 + elif self.embedding_type == "binary": + embeddings = response.embeddings.binary + else: + raise ValueError( + f"Embedding type {self.embedding_type} not allowed" + ) + all_image_embeddings.append(torch.tensor(embeddings)) time.sleep(1.5) else: for i in tqdm(range(0, len(images), batch_size)): @@ -231,17 +311,38 @@ def get_image_embeddings( image_base64 = ( f"data:{content_type};base64,{stringified_buffer}" ) - response = self.client.embed( - model=self.model_name, - input_type="image", - embedding_types=["float"], - images=[image_base64], - ) - all_image_embeddings.append( - torch.tensor(response.embeddings.float) - ) + embed_kwargs = { + "model": self.model_name, + "input_type": "image", + "embedding_types": [self.embedding_type], + "images": [image_base64], + } + if self.output_dimension is not None: + embed_kwargs["output_dimension"] = self.output_dimension + + response = self.client.embed(**embed_kwargs) + + # Get embeddings based on requested type + if self.embedding_type == "float": + embeddings = response.embeddings.float + elif self.embedding_type == "int8": + embeddings = response.embeddings.int8 + elif self.embedding_type == "uint8": + embeddings = response.embeddings.uint8 + elif self.embedding_type == "binary": + embeddings = response.embeddings.binary + else: + # Fallback for unknown types + embeddings = response.embeddings.float + all_image_embeddings.append(torch.tensor(embeddings)) time.sleep(1.5) all_image_embeddings = torch.cat(all_image_embeddings, dim=0) + + # Post-process embeddings based on type + all_image_embeddings = _post_process_embeddings( + all_image_embeddings, self.embedding_type + ) + return all_image_embeddings def calculate_probs(self, text_embeddings, image_embeddings): @@ -360,3 +461,49 @@ def get_fused_embeddings( use_instructions=False, training_datasets=None, ) + +cohere_embed_v4_multimodal_binary = ModelMeta( + loader=partial(cohere_v_loader, model_name="embed-v4.0", embedding_type="binary"), + name="Cohere/Cohere-embed-v4.0 (output_dtype=binary)", + languages=all_languages, + revision="1", + release_date="2024-12-01", + n_parameters=None, + memory_usage_mb=None, + max_tokens=128000, + embed_dim=1536, + license=None, + similarity_fn_name="cosine", + framework=[], + modalities=["image", "text"], + open_weights=False, + public_training_code=None, + public_training_data=None, + reference="https://docs.cohere.com/docs/embeddings", + use_instructions=False, + training_datasets=None, + adapted_from="Cohere/Cohere-embed-v4.0", +) + +cohere_embed_v4_multimodal_int8 = ModelMeta( + loader=partial(cohere_v_loader, model_name="embed-v4.0", embedding_type="int8"), + name="Cohere/Cohere-embed-v4.0 (output_dtype=int8)", + languages=all_languages, + revision="1", + release_date="2024-12-01", + n_parameters=None, + memory_usage_mb=None, + max_tokens=128000, + embed_dim=1536, + license=None, + similarity_fn_name="cosine", + framework=[], + modalities=["image", "text"], + open_weights=False, + public_training_code=None, + public_training_data=None, + reference="https://docs.cohere.com/docs/embeddings", + use_instructions=False, + training_datasets=None, + adapted_from="Cohere/Cohere-embed-v4.0", +) From e863bc1db388361996409ebb3950f560ae7b0ff5 Mon Sep 17 00:00:00 2001 From: Atheer Date: Sat, 27 Sep 2025 17:10:21 +0200 Subject: [PATCH 04/20] dataset: add swedish cpc patent classifications to mteb (#3072) * feat: add swedish cpc patent classifications to mteb * fix: formatting and init imports * fix: update mteb task according to feedback * fix: perform citation and code formatting * fix: add train and test split for both datasets --- .../MultiLabelClassification/__init__.py | 2 + .../SwedishPatentCPCGroupClassification.py | 54 ++++++++++++++++++ .../SwedishPatentCPCSubclassClassification.py | 55 +++++++++++++++++++ .../MultiLabelClassification/swe/__init__.py | 0 4 files changed, 111 insertions(+) create mode 100644 mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCGroupClassification.py create mode 100644 mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCSubclassClassification.py create mode 100644 mteb/tasks/MultiLabelClassification/swe/__init__.py diff --git a/mteb/tasks/MultiLabelClassification/__init__.py b/mteb/tasks/MultiLabelClassification/__init__.py index 096f96a880..a998fb6ac9 100644 --- a/mteb/tasks/MultiLabelClassification/__init__.py +++ b/mteb/tasks/MultiLabelClassification/__init__.py @@ -8,3 +8,5 @@ from .rus.CEDRClassification import * from .rus.ru_toixic_multilabelclassification_okmlcup import * from .rus.SensitiveTopicsClassification import * +from .swe.SwedishPatentCPCGroupClassification import * +from .swe.SwedishPatentCPCSubclassClassification import * diff --git a/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCGroupClassification.py b/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCGroupClassification.py new file mode 100644 index 0000000000..2eb93c61d0 --- /dev/null +++ b/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCGroupClassification.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskMultilabelClassification import ( + AbsTaskMultilabelClassification, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SwedishPatentCPCGroupClassification(AbsTaskMultilabelClassification): + metadata = TaskMetadata( + name="SwedishPatentCPCGroupClassification", + description="""This dataset contains historical Swedish patent documents (1885-1972) classified according to the Cooperative Patent Classification (CPC) system at the group level. Each document can have multiple labels, making this a challenging multi-label classification task with significant class imbalance and data sparsity characteristics. The dataset includes patent claims text extracted from digitally recreated versions of historical Swedish patents, generated using Optical Character Recognition (OCR) from original paper documents. The text quality varies due to OCR limitations, but all CPC labels were manually assigned by patent engineers at PRV (Swedish Patent and Registration Office), ensuring high reliability for machine learning applications.""", + reference="https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254", + type="MultilabelClassification", + category="s2s", + modalities=["text"], + eval_splits=["train"], + eval_langs=["swe-Latn"], + main_score="accuracy", + dataset={ + "path": "atheer2104/swedish-patent-cpc-group-new", + "revision": "d1980d69e2fcf11e912025ba6bb1e3afe6b9168a", + }, + date=("1885-01-01", "1972-01-01"), + domains=["Legal", "Government"], + task_subtypes=[], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@mastersthesis{Salim1987995, + author = {Salim, Atheer}, + institution = {KTH, School of Electrical Engineering and Computer Science (EECS)}, + keywords = {Multi-label Text Classification, Machine Learning, Patent Classification, Deep Learning, Natural Language Processing, Textklassificering med flera Klasser, Maskininlärning, Patentklassificering, Djupinlärning, Språkteknologi}, + number = {2025:571}, + pages = {70}, + school = {KTH, School of Electrical Engineering and Computer Science (EECS)}, + series = {TRITA-EECS-EX}, + title = {Machine Learning for Classifying Historical Swedish Patents : A Comparison of Textual and Combined Data Approaches}, + url = {https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254}, + year = {2025}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"], n_samples=8192 + ) + + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], n_samples=2048 + ) diff --git a/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCSubclassClassification.py b/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCSubclassClassification.py new file mode 100644 index 0000000000..f9c0670c74 --- /dev/null +++ b/mteb/tasks/MultiLabelClassification/swe/SwedishPatentCPCSubclassClassification.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskMultilabelClassification import ( + AbsTaskMultilabelClassification, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SwedishPatentCPCSubclassClassification(AbsTaskMultilabelClassification): + metadata = TaskMetadata( + name="SwedishPatentCPCSubclassClassification", + description="""This dataset contains historical Swedish patent documents (1885-1972) classified according to the Cooperative Patent Classification (CPC) system. Each document can have multiple labels, making this a multi-label classification task with significant implications for patent retrieval and prior art search. + The dataset includes patent claims text extracted from digitally recreated versions of historical Swedish patents, generated using Optical Character Recognition (OCR) from original paper documents. The text quality varies due to OCR limitations, but all CPC labels were manually assigned by patent engineers at PRV (Swedish Patent and Registration Office), ensuring high reliability for machine learning applications.""", + reference="https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254", + type="MultilabelClassification", + category="s2s", + modalities=["text"], + eval_splits=["train"], + eval_langs=["swe-Latn"], + main_score="accuracy", + dataset={ + "path": "atheer2104/swedish-patent-cpc-subclass-new", + "revision": "114fcab0a716a27cf3f54a7ebd6e08f45f62de88", + }, + date=("1885-01-01", "1972-01-01"), + domains=["Legal", "Government"], + task_subtypes=[], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@mastersthesis{Salim1987995, + author = {Salim, Atheer}, + institution = {KTH, School of Electrical Engineering and Computer Science (EECS)}, + keywords = {Multi-label Text Classification, Machine Learning, Patent Classification, Deep Learning, Natural Language Processing, Textklassificering med flera Klasser, Maskininlärning, Patentklassificering, Djupinlärning, Språkteknologi}, + number = {2025:571}, + pages = {70}, + school = {KTH, School of Electrical Engineering and Computer Science (EECS)}, + series = {TRITA-EECS-EX}, + title = {Machine Learning for Classifying Historical Swedish Patents : A Comparison of Textual and Combined Data Approaches}, + url = {https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254}, + year = {2025}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"], n_samples=8192 + ) + + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], n_samples=2048 + ) diff --git a/mteb/tasks/MultiLabelClassification/swe/__init__.py b/mteb/tasks/MultiLabelClassification/swe/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 8c180d4ed5a9d437c7740e746cf960602790be29 Mon Sep 17 00:00:00 2001 From: Yong woo Song Date: Sun, 28 Sep 2025 00:11:19 +0900 Subject: [PATCH 05/20] fix: AttributeError in ColPaliEngineWrapper similarity method (#3177) * fix: delete kwargs for similarity score in ColPaliEngineWrapper for method behavior * chore: fix colpali_models similarity handle device --- mteb/models/colpali_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/colpali_models.py b/mteb/models/colpali_models.py index 35396b92d6..24f668fbec 100644 --- a/mteb/models/colpali_models.py +++ b/mteb/models/colpali_models.py @@ -131,7 +131,7 @@ def calculate_probs(self, text_embeddings, image_embeddings): return scores.softmax(dim=-1) def similarity(self, a, b): - return self.processor.score(a, b, **self.processor_kwargs) + return self.processor.score(a, b, device=self.device) class ColPaliWrapper(ColPaliEngineWrapper): From 0aacba4bfa6f630bd57daf898927e2a69cfc801c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 27 Sep 2025 15:14:00 +0000 Subject: [PATCH 06/20] Update tasks & benchmarks tables --- docs/tasks.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 74f7a9a6d8..3419e0d9b2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -889,6 +889,8 @@ The following tables give you an overview of the tasks in MTEB. | [SwahiliNewsClassification.v2](https://huggingface.co/datasets/Mollel/SwahiliNewsClassification) (Davis et al., 2020) | ['swa'] | Classification | s2s | [News, Written] | None | None | | [SweFaqRetrieval](https://spraakbanken.gu.se/en/resources/superlim) (Berdi{\v{c, 2023) | ['swe'] | Retrieval | s2s | [Government, Non-fiction, Written] | None | None | | [SweRecClassification.v2](https://aclanthology.org/2023.nodalida-1.20/) (Nielsen et al., 2023) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | +| [SwedishPatentCPCGroupClassification](https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254) (Salim et al., 2025) | ['swe'] | MultilabelClassification | s2s | [Government, Legal] | None | None | +| [SwedishPatentCPCSubclassClassification](https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254) (Salim et al., 2025) | ['swe'] | MultilabelClassification | s2s | [Government, Legal] | None | None | | [SwedishSentimentClassification.v2](https://huggingface.co/datasets/swedish_reviews) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | | [SwednClusteringP2P](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [SwednClusteringS2S](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | s2s | [News, Non-fiction, Written] | None | None | @@ -1937,7 +1939,7 @@ The following tables give you an overview of the tasks in MTEB. | suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | -| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 30 | | swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | | swp | Suau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -2138,7 +2140,7 @@ The following tables give you an overview of the tasks in MTEB. | zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 0 | 55 | 49 | 1496 | 872 | 321 | 7 | 137 | 22 | 5 | 0 | 3 | 29 | 96 | 4 | 68 | 702 | 91 | 2 | 2 | 6 | 7 | 37 | 24 | +| Total | None | None | None | 0 | 55 | 49 | 1496 | 872 | 321 | 7 | 137 | 22 | 5 | 0 | 3 | 31 | 96 | 4 | 68 | 702 | 91 | 2 | 2 | 6 | 7 | 37 | 24 | From 2e292cf5764dbeebf9b0b5846d13e95fbf978e32 Mon Sep 17 00:00:00 2001 From: semantic-release Date: Sat, 27 Sep 2025 16:29:49 +0000 Subject: [PATCH 07/20] 1.38.59 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 891cfa51ec..ed0ae71ab6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.58" +version = "1.38.59" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From f58ac2bb21b3c3cef7787335025f435adf2b8855 Mon Sep 17 00:00:00 2001 From: Yongbin Choi Date: Sun, 28 Sep 2025 04:40:49 +0900 Subject: [PATCH 08/20] fix: prevent EOS token truncation (#3218) * fix(models): prevent EOS token truncation for BMRetriever * refactor(models): refactor tokenizer setup in `InstructSentenceTransformerWrapper` * fix(models): correct eos token handling in `BMRetrieverWrapper` --- mteb/models/bmretriever_models.py | 22 ++++++++++++---------- mteb/models/instruct_wrapper.py | 22 +++++++++++----------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/mteb/models/bmretriever_models.py b/mteb/models/bmretriever_models.py index c579682947..b14b710976 100644 --- a/mteb/models/bmretriever_models.py +++ b/mteb/models/bmretriever_models.py @@ -37,12 +37,20 @@ def __init__( self.model_name = model_name self.instruction_template = instruction_template self.apply_instruction_to_passages = apply_instruction_to_passages - self.add_eos_token = add_eos_token self.prompts_dict = prompts_dict + tokenizer_params = {} + if add_eos_token: + tokenizer_params["add_eos_token"] = add_eos_token + if max_seq_length is not None: + tokenizer_params["model_max_length"] = max_seq_length + if padding_side is not None: + tokenizer_params["padding_side"] = padding_side + + kwargs.setdefault("tokenizer_args", {}).update(tokenizer_params) + transformer = Transformer( model_name, - max_seq_length=max_seq_length, **kwargs, ) pooling = Pooling( @@ -50,12 +58,6 @@ def __init__( ) self.model = SentenceTransformer(modules=[transformer, pooling]) - if max_seq_length is not None: - self.model.max_seq_length = max_seq_length - - if padding_side is not None: - self.model.tokenizer.padding_side = padding_side - # https://huggingface.co/datasets/BMRetriever/biomed_retrieval_dataset BMRETRIEVER_TRAINING_DATA = { @@ -158,7 +160,7 @@ def __init__( loader=partial( BMRetrieverWrapper, model_name="BMRetriever/BMRetriever-7B", - config_args={"revision": "e3569bfbcfe3a1bc48c142e11a7b0f38e86065a3"}, + config_args={"revision": "13e6adb9273c5f254e037987d6b44e9e4b005b9a"}, model_args={"torch_dtype": torch.float32}, instruction_template=instruction_template, padding_side="left", @@ -168,7 +170,7 @@ def __init__( name="BMRetriever/BMRetriever-7B", languages=["eng-Latn"], open_weights=True, - revision="e3569bfbcfe3a1bc48c142e11a7b0f38e86065a3", + revision="13e6adb9273c5f254e037987d6b44e9e4b005b9a", release_date="2024-04-29", embed_dim=4096, n_parameters=7_110_660_096, diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index c5be2a672d..4c085459f7 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -118,16 +118,22 @@ def __init__( "No instruction template provided. Instructions will be used as-is." ) + tokenizer_params = {} + if add_eos_token: + tokenizer_params["add_eos_token"] = add_eos_token + if max_seq_length is not None: + # https://github.com/UKPLab/sentence-transformers/blob/7341bf155b4349b88690b78c84beb5aa658c439f/sentence_transformers/models/Transformer.py#L115 + tokenizer_params["model_max_length"] = max_seq_length + if padding_side is not None: + tokenizer_params["padding_side"] = padding_side + + kwargs.setdefault("tokenizer_kwargs", {}).update(tokenizer_params) + self.model_name = model_name self.model = SentenceTransformer(model_name, revision=revision, **kwargs) self.instruction_template = instruction_template self.apply_instruction_to_passages = apply_instruction_to_passages - self.add_eos_token = add_eos_token self.prompts_dict = prompts_dict - if max_seq_length is not None: - self.model.max_seq_length = max_seq_length - if padding_side is not None: - self.model.tokenizer.padding_side = padding_side def encode( self, @@ -137,15 +143,9 @@ def encode( prompt_type: PromptType | None = None, **kwargs: Any, ) -> np.ndarray: - if self.add_eos_token: - sentences = [ - example + self.model.tokenizer.eos_token for example in sentences - ] - instruction = self.get_task_instruction( task_name, prompt_type, self.prompts_dict ) - # to passage prompts won't be applied to passages if ( not self.apply_instruction_to_passages From 3e86531d02460c1f219f41a5c3f82e81a5c43650 Mon Sep 17 00:00:00 2001 From: semantic-release Date: Sat, 27 Sep 2025 19:42:49 +0000 Subject: [PATCH 09/20] 1.38.60 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ed0ae71ab6..34551aeeec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.59" +version = "1.38.60" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 15f99091e740bea471264f8b202b1fb30bf0d023 Mon Sep 17 00:00:00 2001 From: Egor <31567312+ekolodin@users.noreply.github.com> Date: Mon, 29 Sep 2025 12:15:46 +0300 Subject: [PATCH 10/20] Update giga embeddings (#3210) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update giga embeddings * update giga embeddings * 3b-september-2025 * fixed * lint * Update mteb/models/ru_sentence_models.py Co-authored-by: Roman Solomatin * change revision due to flash-attn dependency * change apply_instruction_to_passages --------- Co-authored-by: Kolodin Egor Co-authored-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Co-authored-by: Roman Solomatin Co-authored-by: Неизвестный Пользователь722497 --- mteb/models/ru_sentence_models.py | 231 ++++++++++++++++++++++++++---- 1 file changed, 201 insertions(+), 30 deletions(-) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 9c3be1a76a..af492ef8e7 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -15,44 +15,215 @@ ) GIGA_task_prompts = { - "TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise\nquery: ", - "STS22": "Retrieve semantically similar text\nquery: ", - "RuSTSBenchmarkSTS": "Retrieve semantically similar text\nquery: ", - "RUParaPhraserSTS": "Retrieve semantically similar text\nquery: ", - "CEDRClassification": "Дан комментарий, определи выраженную в нем эмоцию (радость, грусть, удивление, страх, гнев или нейтрально) \nкомментарий: ", - "GeoreviewClassification": "Classify the organization rating based on the reviews\nquery: ", - "GeoreviewClusteringP2P": "Классифицируй рейтинг организации на основе отзыва \nотзыв: ", - "HeadlineClassification": "Классифицируй тему данного новостного заголовка \nзаголовок: ", - "InappropriatenessClassification": "Классифицируй данный комментарий как токсичный или не токсичный \nкомментарий: ", - "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text\nquery: ", - "MassiveIntentClassification": "Given a user utterance as query, find the user intents\nquery: ", - "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios\nquery: ", - "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment\nquery: ", - "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts\nquery: ", - "RuSciBenchGRNTIClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: ", - "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts\nquery: ", - "RuSciBenchOECDClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: ", - "SensitiveTopicsClassification": "Классифицируй чувствительную тему по запросу \nзапрос: ", + "TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise", + "RuSTSBenchmarkSTS": "Retrieve semantically similar text", + "RUParaPhraserSTS": "Retrieve semantically similar text", + "CEDRClassification": "Дан комментарий, определи выраженную в нем эмоцию (радость, грусть, удивление, страх, гнев или нейтрально)", + "GeoreviewClassification": "Classify the organization rating based on the reviews", + "GeoreviewClusteringP2P": "Классифицируй рейтинг организации на основе отзыва", + "HeadlineClassification": "Классифицируй тему данного новостного заголовка", + "InappropriatenessClassification": "Классифицируй данный комментарий как токсичный или не токсичный", + "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text", + "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment", + "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts", + "RuSciBenchGRNTIClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации", + "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts", + "RuSciBenchOECDClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации", + "SensitiveTopicsClassification": "Классифицируй чувствительную тему по запросу", "RuBQRetrieval": { - "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "query": "Given a question, retrieve Wikipedia passages that answer the question", "document": "", }, "RuBQReranking": { - "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "query": "Given a question, retrieve Wikipedia passages that answer the question", "document": "", }, "RiaNewsRetrieval": { - "query": "Given a news title, retrieve relevant news article\nquery: ", + "query": "Given a news title, retrieve relevant news article", "document": "", }, "MIRACLReranking": { - "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "query": "Given a question, retrieve Wikipedia passages that answer the question", "document": "", }, "MIRACLRetrieval": { - "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ", + "query": "Given a question, retrieve Wikipedia passages that answer the question", "document": "", }, + "ArguAna": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "Given a search query, retrieve passages that answer the question", + }, + "CQADupstackAndroidRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackEnglishRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackGamingRetrieval": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "Given a search query, retrieve passages that answer the question", + }, + "CQADupstackGisRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackMathematicaRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackPhysicsRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackProgrammersRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackStatsRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackTexRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackUnixRetrieval": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "Given a search query, retrieve passages that answer the question", + }, + "CQADupstackWebmastersRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "CQADupstackWordpressRetrieval": { + "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "document": "", + }, + "ClimateFEVER": { + "query": "Given a claim about climate change, retrieve documents that support or refute the claim", + "document": "", + }, + "ClimateFEVERHardNegatives": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "", + }, + "DBPedia": { + "query": "Given a query, retrieve relevant entity descriptions from DBPedia", + "document": "", + }, + "FEVER": { + "query": "Given a claim, retrieve documents that support or refute the claim", + "document": "", + }, + "FEVERHardNegatives": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "", + }, + "FiQA2018": { + "query": "Given a web search query, retrieve relevant passages that answer the query", + "document": "", + }, + "HotpotQA": { + "query": "Given a multi-hop question, retrieve documents that can help answer the question", + "document": "", + }, + "HotpotQAHardNegatives": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "", + }, + "MSMARCO": { + "query": "Given a web search query, retrieve relevant passages that answer the query", + "document": "", + }, + "NFdocument": { + "query": "Given a question, retrieve relevant documents that best answer the question", + "document": "", + }, + "NQ": { + "query": "Given a question, retrieve Wikipedia passages that answer the question", + "document": "", + }, + "QuoraRetrieval": { + "query": "Given a question, retrieve questions that are semantically equivalent to the given question", + "document": "", + }, + "SCIDOCS": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "", + }, + "SciFact": { + "query": "Given a scientific claim, retrieve documents that support or refute the claim", + "document": "", + }, + "TRECCOVID": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "", + }, + "Touche2020": { + "query": "Given a question, retrieve detailed and persuasive arguments that answer the question", + "document": "", + }, + "Touche2020Retrieval.v3": { + "query": "Given a search query, retrieve passages that answer the question", + "document": "", + }, + "BIOSSES": "Retrieve semantically similar text", + "SICK-R": "Retrieve semantically similar text", + "STS12": "Retrieve semantically similar text", + "STS13": "Retrieve semantically similar text", + "STS14": "Retrieve semantically similar text", + "STS15": "Retrieve semantically similar text", + "STS16": "Retrieve semantically similar text", + "STS17": "Retrieve semantically similar text", + "STS22": "Retrieve semantically similar text", + "STS22.v2": "Retrieve semantically similar text", + "STSBenchmark": "Retrieve semantically similar text", + "SummEval": "Given a news summary, retrieve other semantically similar summaries", + "SummEvalSummarization.v2": "Given a news summary, retrieve other semantically similar summaries", + "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual", + "AmazonPolarityClassification": "Classify Amazon reviews into positive or negative sentiment", + "AmazonReviewsClassification": "Classify the given Amazon review into its appropriate rating category", + "Banking77Classification": "Given a online banking query, find the corresponding intents", + "EmotionClassification": "Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", + "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation", + "MTOPIntentClassification": "Classify the intent of the given utterance in task-oriented conversation", + "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic", + "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral", + "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "ArXivHierarchicalClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArXivHierarchicalClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", + "BiorxivClusteringP2P.v2": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstract", + "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", + "MedrxivClusteringP2P.v2": "Identify the main category of Medrxiv papers based on the titles and abstract", + "MedrxivClusteringS2S.v2": "Identify the main category of Medrxiv papers based on the titles", + "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles", + "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", + "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the titles", + "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "StackExchangeClustering.v2": "Identify the topic or theme of StackExchange posts based on the titles", + "StackExchangeClusteringP2P.v2": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", + "TwentyNewsgroupsClustering.v2": "Identify the topic or theme of the given news articles", + "AskUbuntuDupQuestions": { + "query": "Retrieve duplicate questions from AskUbuntu forum", + "document": "Retrieve duplicate questions from AskUbuntu forum", + }, + "MindSmallReranking": "Given a search query, retrieve passages that answer the question", + "SciDocsRR": "Given a title of a scientific paper, retrieve the titles of other relevant papers", + "StackOverflowDupQuestions": "Retrieve duplicate questions from StackOverflow forum", + "SprintDuplicateQuestions": "Retrieve duplicate questions from Sprint forum", + "TwitterSemEval2015": "Retrieve tweets that are semantically similar to the given tweet", + "TwitterURLCorpus": "Retrieve tweets that are semantically similar to the given tweet", } rubert_tiny = ModelMeta( @@ -633,12 +804,12 @@ loader=partial( # type: ignore InstructSentenceTransformerWrapper, model_name="ai-sage/Giga-Embeddings-instruct", - revision="40b27667b9ad586d7812675df76e5062ccc80b0e", - instruction_template="{instruction}", - max_seq_length=512, - apply_instruction_to_passages=False, - prompts_dict=GIGA_task_prompts, + revision="0ad5b29bfecd806cecc9d66b927d828a736594dc", trust_remote_code=True, + instruction_template="Instruct: {instruction}\nQuery: ", + max_seq_length=4096, + apply_instruction_to_passages=True, + prompts_dict=GIGA_task_prompts, model_kwargs={ "torch_dtype": torch.bfloat16, }, @@ -646,8 +817,8 @@ name="ai-sage/Giga-Embeddings-instruct", languages=["eng-Latn", "rus-Cyrl"], open_weights=True, - revision="40b27667b9ad586d7812675df76e5062ccc80b0e", - release_date="2025-06-05", + revision="0ad5b29bfecd806cecc9d66b927d828a736594dc", + release_date="2025-09-23", n_parameters=3_227_176_961, memory_usage_mb=12865, embed_dim=2048, From cb03bd4edbdc626401f5877f7630df2ec3c6fc46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AC=91=E5=B0=BF=E4=BC=8A=E4=BA=BA?= <44760272+q275343119@users.noreply.github.com> Date: Tue, 30 Sep 2025 00:05:04 +0800 Subject: [PATCH 11/20] fix: Refactor split create_tables into static Benchmark methods (#3126) * feat - Split create_tables into static Benchmark methods * feat - format * Update mteb/leaderboard/table.py Co-authored-by: Kenneth Enevoldsen * feat - remove search query;take benchmark result as input;addressing the circular import, * feat - format * Update mteb/benchmarks/benchmark.py Co-authored-by: Kenneth Enevoldsen * Update mteb/benchmarks/benchmark.py Co-authored-by: Kenneth Enevoldsen * feat - use to_dataframe;clean table.py;move creat_table * feat - fix circular import * feat - clean-up * feat - format --------- Co-authored-by: Kenneth Enevoldsen --- mteb/benchmarks/_create_table.py | 256 ++++++++++++++++++++++++ mteb/benchmarks/benchmark.py | 17 ++ mteb/leaderboard/app.py | 70 +++++-- mteb/leaderboard/table.py | 331 +++++++++---------------------- scripts/make_leaderboard.py | 16 +- 5 files changed, 435 insertions(+), 255 deletions(-) create mode 100644 mteb/benchmarks/_create_table.py diff --git a/mteb/benchmarks/_create_table.py b/mteb/benchmarks/_create_table.py new file mode 100644 index 0000000000..08d3c4ef1d --- /dev/null +++ b/mteb/benchmarks/_create_table.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +import math +import re +from collections import defaultdict + +import numpy as np +import pandas as pd + +from mteb.load_results.benchmark_results import BenchmarkResults +from mteb.overview import get_task, get_tasks + + +def _borda_count(scores: pd.Series) -> pd.Series: + n = len(scores) + ranks = scores.rank(method="average", ascending=False) + counts = n - ranks + return counts + + +def _get_borda_rank(score_table: pd.DataFrame) -> pd.Series: + borda_counts = score_table.apply(_borda_count, axis="index") + mean_borda = borda_counts.sum(axis=1) + return mean_borda.rank(method="min", ascending=False).astype(int) + + +def _split_on_capital(s: str) -> str: + """Splits on capital letters and joins with spaces""" + return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s)) + + +def _format_n_parameters(n_parameters) -> str: + if (n_parameters is None) or (not int(n_parameters)): + return "Unknown" + n_thousand = int(n_parameters // 1e3) + if n_thousand < 1: + return str(int(n_parameters)) + n_zeros = math.log10(n_thousand) + if n_zeros >= 6: + return str(n_thousand // (10**6)) + "B" + if n_zeros >= 3: + return str(n_thousand // (10**3)) + "M" + return str(n_thousand) + "K" + + +def _format_max_tokens(max_tokens: float | None) -> str: + if max_tokens is None: + return "Unknown" + if max_tokens == np.inf: + return "Infinite" + return str(int(max_tokens)) + + +def _failsafe_get_model_meta(model_name): + try: + from mteb.models.overview import get_model_meta + + return get_model_meta(model_name) + except Exception: + return None + + +def _get_means_per_types(per_task: pd.DataFrame): + task_names_per_type = defaultdict(list) + for task_name in per_task.columns: + task_type = get_task(task_name).metadata.type + task_names_per_type[task_type].append(task_name) + records = [] + for task_type, tasks in task_names_per_type.items(): + for model_name, scores in per_task.iterrows(): + records.append( + dict( + model_name=model_name, + task_type=task_type, + score=scores[tasks].mean(skipna=False), + ) + ) + return pd.DataFrame.from_records(records) + + +def _create_summary_table_from_benchmark_results( + benchmark_results: BenchmarkResults, +) -> pd.DataFrame: + """Create summary table from BenchmarkResults. + + Returns a DataFrame with one row per model containing summary statistics + and task type averages. + + Args: + benchmark_results: BenchmarkResults object containing model results + + Returns: + DataFrame with model summaries, ready for styling in the leaderboard + """ + data = benchmark_results.to_dataframe(format="long") + + if data.empty: + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + # Convert to DataFrame and pivot + per_task = data.pivot(index="model_name", columns="task_name", values="score") + + # Remove models with no scores + to_remove = per_task.isna().all(axis="columns") + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + models_to_remove = list(per_task[to_remove].index) + per_task = per_task.drop(models_to_remove, axis=0) + + # Calculate means by task type + mean_per_type = _get_means_per_types(per_task) + mean_per_type = mean_per_type.pivot( + index="model_name", columns="task_type", values="score" + ) + mean_per_type.columns = [ + _split_on_capital(column) for column in mean_per_type.columns + ] + + # Calculate overall means + typed_mean = mean_per_type.mean(skipna=False, axis=1) + overall_mean = per_task.mean(skipna=False, axis=1) + + # Build joint table + joint_table = mean_per_type.copy() + joint_table = joint_table.drop(models_to_remove, axis=0) + joint_table.insert(0, "mean", overall_mean) + joint_table.insert(1, "mean_by_task_type", typed_mean) + joint_table["borda_rank"] = _get_borda_rank(per_task) + joint_table = joint_table.sort_values("borda_rank", ascending=True) + joint_table = joint_table.reset_index() + + # Add model metadata + model_metas = joint_table["model_name"].map(_failsafe_get_model_meta) + joint_table = joint_table[model_metas.notna()] + joint_table["model_link"] = model_metas.map(lambda m: m.reference) + + # Insert model metadata columns + joint_table.insert( + 1, + "Max Tokens", + model_metas.map(lambda m: _format_max_tokens(m.max_tokens)), + ) + joint_table.insert( + 1, + "Embedding Dimensions", + model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"), + ) + joint_table.insert( + 1, + "Number of Parameters", + model_metas.map(lambda m: _format_n_parameters(m.n_parameters)), + ) + joint_table.insert( + 1, + "Memory Usage (MB)", + model_metas.map( + lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown" + ), + ) + + # Add zero-shot percentage + tasks = get_tasks(tasks=list(data["task_name"].unique())) + joint_table.insert( + 1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks)) + ) + joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) + + # Clean up model names (remove HF organization) + joint_table["model_name"] = joint_table["model_name"].map( + lambda name: name.split("/")[-1] + ) + + # Add markdown links to model names + name_w_link = ( + "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")" + ) + joint_table["model_name"] = joint_table["model_name"].mask( + joint_table["model_link"].notna(), name_w_link + ) + joint_table = joint_table.drop(columns=["model_link"]) + + # Rename columns + joint_table = joint_table.rename( + columns={ + "model_name": "Model", + "mean_by_task_type": "Mean (TaskType)", + "mean": "Mean (Task)", + } + ) + + # Move borda rank to front + joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) + + return joint_table + + +def _create_per_task_table_from_benchmark_results( + benchmark_results: BenchmarkResults, +) -> pd.DataFrame: + """Create per-task table from BenchmarkResults. + + Returns a DataFrame with one row per model and one column per task. + + Args: + benchmark_results: BenchmarkResults object containing model results + + Returns: + DataFrame with per-task scores, ready for styling in the leaderboard + """ + # Get scores in long format + data = benchmark_results.to_dataframe(format="long") + + if data.empty: + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + # Convert to DataFrame and pivot + per_task = data.pivot(index="model_name", columns="task_name", values="score") + + # Remove models with no scores + to_remove = per_task.isna().all(axis="columns") + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + models_to_remove = list(per_task[to_remove].index) + per_task = per_task.drop(models_to_remove, axis=0) + + # Add borda rank and sort + per_task["borda_rank"] = _get_borda_rank(per_task) + per_task = per_task.sort_values("borda_rank", ascending=True) + per_task = per_task.drop(columns=["borda_rank"]) + per_task = per_task.reset_index() + + # Clean up model names (remove HF organization) + per_task["model_name"] = per_task["model_name"].map( + lambda name: name.split("/")[-1] + ) + per_task = per_task.rename( + columns={ + "model_name": "Model", + } + ) + + return per_task diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index 37b654ac92..e48b455ad3 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -4,8 +4,13 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Annotated +import pandas as pd from pydantic import AnyUrl, BeforeValidator, TypeAdapter +from mteb.benchmarks._create_table import ( + _create_per_task_table_from_benchmark_results, + _create_summary_table_from_benchmark_results, +) from mteb.load_results.load_results import load_results if TYPE_CHECKING: @@ -72,3 +77,15 @@ def load_results( results = base_results.select_tasks(self.tasks) self.results_cache[base_results] = results return results + + def _create_summary_table( + self, benchmark_results: BenchmarkResults + ) -> pd.DataFrame: + """Create summary table. Called by the leaderboard app.""" + return _create_summary_table_from_benchmark_results(benchmark_results) + + def _create_per_task_table( + self, benchmark_results: BenchmarkResults + ) -> pd.DataFrame: + """Create per-task table. Called by the leaderboard app.""" + return _create_per_task_table_from_benchmark_results(benchmark_results) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 0ec8b91fde..88f56ffd22 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -25,7 +25,10 @@ make_selector, ) from mteb.leaderboard.figures import performance_size_plot, radar_chart -from mteb.leaderboard.table import create_tables +from mteb.leaderboard.table import ( + apply_per_task_styling_from_benchmark, + apply_summary_styling_from_benchmark, +) from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ logger = logging.getLogger(__name__) @@ -236,10 +239,21 @@ def get_leaderboard_app() -> gr.Blocks: max_model_size=MAX_MODEL_SIZE, zero_shot_setting="allow_all", ) + default_filtered_scores = [ + entry for entry in default_scores if entry["model_name"] in filtered_models + ] + + # Filter BenchmarkResults based on default filtered models (as required by Kenneth) + filtered_model_names = [entry["model_name"] for entry in default_filtered_scores] + filtered_benchmark_results = default_results.select_models(filtered_model_names) - summary_table, per_task_table = create_tables( - [entry for entry in default_scores if entry["model_name"] in filtered_models] + summary_table = apply_summary_styling_from_benchmark( + default_benchmark, filtered_benchmark_results + ) + per_task_table = apply_per_task_styling_from_benchmark( + default_benchmark, filtered_benchmark_results ) + lang_select = gr.Dropdown( LANGUAGE, value=sorted(default_results.languages), @@ -774,19 +788,43 @@ def update_tables( tasks = set(tasks) benchmark = mteb.get_benchmark(benchmark_name) benchmark_tasks = {task.metadata.name for task in benchmark.tasks} - if (benchmark_tasks != tasks) or (models_to_keep is not None): - filtered_scores = [] - for entry in scores: - if entry["task_name"] not in tasks: - continue - if (models_to_keep is not None) and ( - entry["model_name"] not in models_to_keep - ): - continue - filtered_scores.append(entry) - else: - filtered_scores = scores - summary, per_task = create_tables(filtered_scores) + + # Extract filtered model and task names from scores (respects UI filters) + filtered_model_names = set() + filtered_task_names = set() + + for entry in scores: + if entry["task_name"] not in tasks: + continue + if (models_to_keep is not None) and ( + entry["model_name"] not in models_to_keep + ): + continue + filtered_model_names.add(entry["model_name"]) + filtered_task_names.add(entry["task_name"]) + + # Create filtered BenchmarkResults as required by Kenneth + benchmark_results = all_benchmark_results[benchmark_name] + filtered_benchmark_results = benchmark_results + + # Apply task filtering if needed + if filtered_task_names != benchmark_tasks: + filtered_benchmark_results = filtered_benchmark_results.filter_tasks( + task_names=list(filtered_task_names) + ) + + # Apply model filtering if needed + if filtered_model_names: + filtered_benchmark_results = filtered_benchmark_results.select_models( + list(filtered_model_names) + ) + + summary = apply_summary_styling_from_benchmark( + benchmark, filtered_benchmark_results + ) + per_task = apply_per_task_styling_from_benchmark( + benchmark, filtered_benchmark_results + ) elapsed = time.time() - start_time logger.debug(f"update_tables callback: {elapsed}s") return summary, per_task diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 5286680dc9..623e508caa 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -1,9 +1,5 @@ from __future__ import annotations -import math -import re -from collections import defaultdict - import gradio as gr import matplotlib.pyplot as plt import numpy as np @@ -11,46 +7,11 @@ from matplotlib.colors import LinearSegmentedColormap from pandas.api.types import is_numeric_dtype -from mteb.models.overview import get_model_meta -from mteb.overview import get_task, get_tasks - - -def borda_count(scores: pd.Series) -> pd.Series: - n = len(scores) - ranks = scores.rank(method="average", ascending=False) - counts = n - ranks - return counts - - -def get_borda_rank(score_table: pd.DataFrame) -> pd.Series: - borda_counts = score_table.apply(borda_count, axis="index") - mean_borda = borda_counts.sum(axis=1) - return mean_borda.rank(method="min", ascending=False).astype(int) - def format_scores(score: float) -> float: return round(score * 100, 2) -def format_n_parameters(n_parameters) -> str: - if (n_parameters is None) or (not int(n_parameters)): - return "Unknown" - n_thousand = int(n_parameters // 1e3) - if n_thousand < 1: - return str(int(n_parameters)) - n_zeros = math.log10(n_thousand) - if n_zeros >= 6: - return str(n_thousand // (10**6)) + "B" - if n_zeros >= 3: - return str(n_thousand // (10**3)) + "M" - return str(n_thousand) + "K" - - -def split_on_capital(s: str) -> str: - """Splits on capital letters and joins with spaces""" - return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s)) - - def get_column_types(df: pd.DataFrame) -> list[str]: types = [] for column_name in df.columns: @@ -78,39 +39,6 @@ def get_column_widths(df: pd.DataFrame) -> list[str]: return widths -def get_means_per_types(per_task: pd.DataFrame): - task_names_per_type = defaultdict(list) - for task_name in per_task.columns: - task_type = get_task(task_name).metadata.type - task_names_per_type[task_type].append(task_name) - records = [] - for task_type, tasks in task_names_per_type.items(): - for model_name, scores in per_task.iterrows(): - records.append( - dict( - model_name=model_name, - task_type=task_type, - score=scores[tasks].mean(skipna=False), - ) - ) - return pd.DataFrame.from_records(records) - - -def failsafe_get_model_meta(model_name): - try: - return get_model_meta(model_name) - except Exception: - return None - - -def format_max_tokens(max_tokens: float | None) -> str: - if max_tokens is None: - return "Unknown" - if max_tokens == np.inf: - return "Infinite" - return str(int(max_tokens)) - - def format_zero_shot(zero_shot_percentage: int): if zero_shot_percentage == -1: return "⚠️ NA" @@ -128,119 +56,58 @@ def create_light_green_cmap(): return light_green_cmap -def scores_to_tables(scores_long: list[dict], search_query: str | None = None): - if not scores_long: - no_results_frame = pd.DataFrame( - {"No results": ["You can try relaxing your criteria"]} - ) - return gr.DataFrame(no_results_frame), gr.DataFrame(no_results_frame) - data = pd.DataFrame.from_records(scores_long) - per_task = data.pivot(index="model_name", columns="task_name", values="score") - mean_per_type = get_means_per_types(per_task) - mean_per_type = mean_per_type.pivot( - index="model_name", columns="task_type", values="score" - ) - mean_per_type.columns = [ - split_on_capital(column) for column in mean_per_type.columns - ] - to_remove = per_task.isna().all(axis="columns") - if search_query: - names = per_task.index.get_level_values("model_name") - names = pd.Series(names, index=per_task.index) - to_remove |= ~names.str.contains(search_query, regex=True) - if to_remove.all(): - no_results_frame = pd.DataFrame( - {"No results": ["You can try relaxing your criteria"]} - ) - return gr.DataFrame(no_results_frame), gr.DataFrame(no_results_frame) - models_to_remove = list(per_task[to_remove].index) - typed_mean = mean_per_type.mean(skipna=False, axis=1) - overall_mean = per_task.mean(skipna=False, axis=1) - joint_table = mean_per_type.copy() - per_task = per_task.drop(models_to_remove, axis=0) - joint_table = joint_table.drop(models_to_remove, axis=0) - joint_table.insert(0, "mean", overall_mean) - joint_table.insert(1, "mean_by_task_type", typed_mean) - joint_table["borda_rank"] = get_borda_rank(per_task) - joint_table = joint_table.sort_values("borda_rank", ascending=True) - per_task["borda_rank"] = joint_table["borda_rank"] - per_task = per_task.sort_values("borda_rank", ascending=True) - per_task = per_task.drop(columns=["borda_rank"]) - joint_table = joint_table.reset_index() - model_metas = joint_table["model_name"].map(failsafe_get_model_meta) - joint_table = joint_table[model_metas.notna()] - joint_table["model_link"] = model_metas.map(lambda m: m.reference) - joint_table.insert( - 1, - "Max Tokens", - model_metas.map(lambda m: format_max_tokens(m.max_tokens)), - ) - joint_table.insert( - 1, - "Embedding Dimensions", - model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"), - ) - joint_table.insert( - 1, - "Number of Parameters", - model_metas.map(lambda m: format_n_parameters(m.n_parameters)), - ) - joint_table.insert( - 1, - "Memory Usage (MB)", - model_metas.map( - lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown" - ), - ) - tasks = get_tasks(tasks=list(data["task_name"].unique())) - joint_table.insert( - 1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks)) - ) - joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) - # joint_table = joint_table[joint_table["Zero-shot"].notna()] - # Removing HF organization from model - joint_table["model_name"] = joint_table["model_name"].map( - lambda name: name.split("/")[-1] - ) - # Adding markdown link to model names - name_w_link = ( - "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")" - ) - joint_table["model_name"] = joint_table["model_name"].mask( - joint_table["model_link"].notna(), name_w_link - ) - joint_table = joint_table.drop(columns=["model_link"]) - joint_table = joint_table.rename( - columns={ - "model_name": "Model", - "mean_by_task_type": "Mean (TaskType)", - "mean": "Mean (Task)", - } - ) - per_task = per_task.reset_index() - per_task["model_name"] = per_task["model_name"].map( - lambda name: name.split("/")[-1] - ) - per_task = per_task.rename( - columns={ - "model_name": "Model", - } - ) - joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) - column_types = get_column_types(joint_table) - # setting model name column to markdown - column_types[1] = "markdown" - score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns] +def apply_summary_styling_from_benchmark( + benchmark_instance, benchmark_results +) -> gr.DataFrame: + """Apply styling to summary table created by the benchmark instance's _create_summary_table method. + + This supports polymorphism - different benchmark classes can have different table generation logic. + + Args: + benchmark_instance: The benchmark instance (could be Benchmark, RTEBBenchmark, etc.) + benchmark_results: BenchmarkResults object containing model results (may be pre-filtered) + + Returns: + Styled gr.DataFrame ready for display in the leaderboard + """ + # Use the instance method to support polymorphism + summary_df = benchmark_instance._create_summary_table(benchmark_results) + + # If it's a no-results DataFrame, return it as-is + if "No results" in summary_df.columns: + return gr.DataFrame(summary_df) + + # Apply the styling + return _apply_summary_table_styling(summary_df) + + +def apply_per_task_styling_from_benchmark( + benchmark_instance, benchmark_results +) -> gr.DataFrame: + """Apply styling to per-task table created by the benchmark instance's _create_per_task_table method. + + This supports polymorphism - different benchmark classes can have different table generation logic. + + Args: + benchmark_instance: The benchmark instance (could be Benchmark, RTEBBenchmark, etc.) + benchmark_results: BenchmarkResults object containing model results (may be pre-filtered) + + Returns: + Styled gr.DataFrame ready for display in the leaderboard + """ + # Use the instance method to support polymorphism + per_task_df = benchmark_instance._create_per_task_table(benchmark_results) - return joint_table, per_task, score_columns, column_types + # If it's a no-results DataFrame, return it as-is + if "No results" in per_task_df.columns: + return gr.DataFrame(per_task_df) + # Apply the styling + return _apply_per_task_table_styling(per_task_df) -def apply_styling( - joint_table: pd.DataFrame, - per_task: pd.DataFrame, - score_columns: list[str], - column_types: list[str], -) -> tuple[gr.DataFrame, gr.DataFrame]: + +def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame: + """Apply styling to a raw summary DataFrame""" excluded_columns = [ "Rank (Borda)", "Model", @@ -249,18 +116,27 @@ def apply_styling( "Max Tokens", "Memory Usage (MB)", ] + gradient_columns = [ col for col in joint_table.columns if col not in excluded_columns ] light_green_cmap = create_light_green_cmap() + + # Determine score columns (before formatting) + score_columns = [ + col + for col in joint_table.columns + if col not in excluded_columns + ["Zero-shot"] + ] + numeric_data = joint_table.copy() + + # Format data for display joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) joint_table[score_columns] = joint_table[score_columns].map(format_scores) + joint_table_style = joint_table.style.format( - { - **dict.fromkeys(score_columns, "{:.2f}"), - "Rank (Borda)": "{:.0f}", - }, + {**dict.fromkeys(score_columns, "{:.2f}"), "Rank (Borda)": "{:.0f}"}, na_rep="", ) joint_table_style = joint_table_style.highlight_min( @@ -289,58 +165,45 @@ def apply_styling( vmax=100, gmap=gmap_values.loc[mask], ) + + column_types = get_column_types(joint_table_style.data) + # setting model name column to markdown + if len(column_types) > 1: + column_types[1] = "markdown" + + column_widths = get_column_widths(joint_table_style.data) + if len(column_widths) > 0: + column_widths[0] = "100px" + if len(column_widths) > 1: + column_widths[1] = "250px" + + return gr.DataFrame( + joint_table_style, + datatype=column_types, + interactive=False, + pinned_columns=3, + column_widths=column_widths, + wrap=True, + show_fullscreen_button=True, + show_copy_button=True, + show_search="filter", + ) + + +def _apply_per_task_table_styling(per_task: pd.DataFrame) -> gr.DataFrame: + """Apply styling to a raw per-task DataFrame""" task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 + per_task_style = per_task.style.format( "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") - # TODO: uncomment this when Gradio fixes it. - # The fix is already merged and contained in this release: https://github.com/gradio-app/gradio/pull/11032 - # It will be available in Gradio 5.25.3 - # for col in task_score_columns: - # if col != "Model": - # mask = per_task[col].notna() - # per_task_style = per_task_style.background_gradient( - # cmap=light_green_cmap, - # subset=pd.IndexSlice[mask, col], - # gmap=per_task[col].loc[mask], - # ) - column_widths = get_column_widths(joint_table_style.data) - column_widths[0] = "100px" - column_widths[1] = "250px" - return ( - gr.DataFrame( - joint_table_style, - datatype=column_types, - interactive=False, - pinned_columns=3, - column_widths=column_widths, - wrap=True, - show_fullscreen_button=True, - show_copy_button=True, - show_search="filter", - ), - gr.DataFrame( - per_task_style, - interactive=False, - pinned_columns=1, - show_fullscreen_button=True, - show_copy_button=True, - show_search="filter", - ), - ) - -def create_tables( - scores_long: list[dict], search_query: str | None = None -) -> tuple[gr.DataFrame, gr.DataFrame]: - result = scores_to_tables(scores_long, search_query) - # dataframe with No Results is returned, so no need to apply styling - if len(result) == 2: - joint_table, per_task = result - return joint_table, per_task - joint_table, per_task, score_columns, column_types = result - summary_table, per_task_table = apply_styling( - joint_table, per_task, score_columns, column_types + return gr.DataFrame( + per_task_style, + interactive=False, + pinned_columns=1, + show_fullscreen_button=True, + show_copy_button=True, + show_search="filter", ) - return summary_table, per_task_table diff --git a/scripts/make_leaderboard.py b/scripts/make_leaderboard.py index 4e322b3210..fff52536c6 100644 --- a/scripts/make_leaderboard.py +++ b/scripts/make_leaderboard.py @@ -7,7 +7,10 @@ import pandas as pd import mteb -from mteb.leaderboard.table import create_tables +from mteb.leaderboard.table import ( + apply_per_task_styling_from_benchmark, + apply_summary_styling_from_benchmark, +) from mteb.load_results import load_results logging.basicConfig(level=logging.INFO) @@ -60,11 +63,14 @@ def load_leaderboard( base_results=benchmark_results ).join_revisions() - # Convert scores into long format - scores_long = benchmark_results_filtered.get_scores(format="long") - # Convert scores into leaderboard tables - summary_gr_df, per_task_gr_df = create_tables(scores_long=scores_long) + loaded_benchmark = mteb.get_benchmark(benchmark.name) + summary_gr_df = apply_summary_styling_from_benchmark( + loaded_benchmark, benchmark_results_filtered + ) + per_task_gr_df = apply_per_task_styling_from_benchmark( + loaded_benchmark, benchmark_results_filtered + ) # Convert Gradio DataFrames to Pandas summary_df = pd.DataFrame( From a52723aeef6dc471e00221f5d4a2e25c99875396 Mon Sep 17 00:00:00 2001 From: semantic-release Date: Mon, 29 Sep 2025 16:52:01 +0000 Subject: [PATCH 12/20] 1.38.61 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 34551aeeec..55da5ac840 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.60" +version = "1.38.61" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 4f5868491c09506d256ebaff0c07e02fec0f70b4 Mon Sep 17 00:00:00 2001 From: fzoll <5575946+fzoll@users.noreply.github.com> Date: Mon, 29 Sep 2025 23:42:20 +0200 Subject: [PATCH 13/20] Extending the RTEB benchmark (#3223) Adding another voyageai model --- mteb/benchmarks/benchmarks/rteb_benchmarks.py | 3 ++- mteb/models/voyage_models.py | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/mteb/benchmarks/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py index 1bdc14a814..d124b4c208 100644 --- a/mteb/benchmarks/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/benchmarks/rteb_benchmarks.py @@ -31,6 +31,7 @@ "FreshStackRetrieval", "ChatDoctorRetrieval", "CUREv1", + "MIRACLRetrievalHardNegatives", # Closed datasets "Code1Retrieval", "JapaneseCode1Retrieval", @@ -47,7 +48,7 @@ "JapaneseLegal1Retrieval", ], ), - description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 28 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across English, French, German, and Japanese languages.", + description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 29 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across English, French, German, and Japanese languages.", citation=RTEB_CITATION, contacts=["fzowl"], ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index aaee6ae9da..5e0d882c82 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -205,6 +205,33 @@ def _batched_encode( PromptType.document.value: "document", } +voyage_3_large = ModelMeta( + name="voyageai/voyage-3-large", # Date of publication of this post https://blog.voyageai.com/2025/01/07/voyage-3-large/ + revision="1", + release_date="2025-01-07", + languages=None, # supported languages not specified + loader=partial( # type: ignore + VoyageWrapper, + model_name="voyage-3-large", + max_tokens=32000, + model_prompts=model_prompts, + ), + max_tokens=32000, + embed_dim=1024, + open_weights=False, + n_parameters=None, + memory_usage_mb=None, + license=None, + reference="https://blog.voyageai.com/2025/01/07/voyage-3-large/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, +) + + voyage_3_5 = ModelMeta( name="voyageai/voyage-3.5", revision="1", From 7f5990a226d807b7e271e6375e56fbd0037efb32 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 29 Sep 2025 21:45:13 +0000 Subject: [PATCH 14/20] Update tasks & benchmarks tables --- docs/benchmarks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 15ac01dae4..777a05c238 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -50,7 +50,7 @@ The following table gives you an overview of the benchmarks in MTEB. | RTEB(Code, beta) | RTEB Code | 8 | Retrieval: 8 | [Programming, Written] | eng,go,javascript,jpn,python,sql | | RTEB(Health, beta) | RTEB Healthcare | 4 | Retrieval: 4 | [Academic, Medical, Written] | deu,eng,fra,spa | | RTEB(Law, beta) | RTEB Legal | 7 | Retrieval: 7 | [Legal, Written] | deu,eng,fra,jpn | -| RTEB(beta) | RTEB Retrieval Embedding Benchmark | 28 | Retrieval: 28 | [Academic, Encyclopaedic, Financial, Legal, Medical, Non-fiction, Programming, Written] | deu,eng,fra,go,javascript,jpn,python,spa,sql | +| RTEB(beta) | RTEB Retrieval Embedding Benchmark | 29 | Retrieval: 29 | [Academic, Encyclopaedic, Financial, Legal, Medical, Non-fiction, Programming, Written] | ara,ben,deu,eng,fas,fin,fra,go,hin,ind,javascript,jpn,kor,python,rus,spa,sql,swa,tel,tha,yor,zho | | RTEB(deu, beta) | RTEB German | 4 | Retrieval: 4 | [Legal, Medical, Non-fiction, Written] | deu | | RTEB(eng, beta) | RTEB English | 20 | Retrieval: 20 | [Academic, Financial, Legal, Medical, Non-fiction, Programming, Written] | eng,fra,go,javascript,python,spa,sql | | RTEB(fin, beta) | RTEB Finance | 7 | Retrieval: 7 | [Financial, Non-fiction, Written] | eng | From e196ab94aff1793824a0c3256f2a7b9b10a07acd Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Tue, 30 Sep 2025 14:19:23 +0800 Subject: [PATCH 15/20] feat - filter_by_privacy --- mteb/load_results/benchmark_results.py | 7 +++++++ mteb/load_results/task_results.py | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index 4c83d3b156..261e8f6b00 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -107,6 +107,7 @@ def filter_tasks( domains: list[TASK_DOMAIN] | None = None, task_types: list[TASK_TYPE] | None = None, modalities: list[MODALITIES] | None = None, + privacy: Literal["public", "private"] | None = None, ) -> ModelResult: # TODO: v2 see filter_tasks in BenchmarkResults - but can be moved to a private function or removed new_task_results = [] @@ -127,6 +128,10 @@ def filter_tasks( task_modalities = getattr(task_result, "modalities", []) if not any(modality in task_modalities for modality in modalities): continue + if (privacy is not None) and ( + task_result.is_public != (privacy == "public") + ): + continue new_task_results.append(task_result) return type(self).model_construct( model_name=self.model_name, @@ -395,6 +400,7 @@ def filter_tasks( domains: list[TASK_DOMAIN] | None = None, task_types: list[TASK_TYPE] | None = None, # type: ignore modalities: list[MODALITIES] | None = None, + privacy: Literal["public", "private"] | None = None, ) -> BenchmarkResults: # TODO: Same as filter_models model_results = [ @@ -404,6 +410,7 @@ def filter_tasks( domains=domains, task_types=task_types, modalities=modalities, + privacy=privacy, ) for res in self.model_results ] diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index c8218075c9..4c0dc53d47 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -146,6 +146,7 @@ class TaskResult(BaseModel): scores: dict[Split, list[ScoresDict]] evaluation_time: float | None kg_co2_emissions: float | None = None + is_public: bool = True @classmethod def from_task_results( @@ -175,6 +176,7 @@ def from_task_results( scores=flat_scores, evaluation_time=evaluation_time, kg_co2_emissions=kg_co2_emissions, + is_pubic=task.metadata.is_public, ) @field_validator("scores") @@ -228,6 +230,10 @@ def domains(self) -> list[str]: def task_type(self) -> str: return self.task.metadata.type + @property + def task_is_public(self) -> bool: + return self.task.metadata.is_public + def to_dict(self) -> dict: return self.model_dump() From 3fa3ccd2ff98a797bea836782921916074097879 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Tue, 30 Sep 2025 14:37:02 +0800 Subject: [PATCH 16/20] feat - add new fields for rteb part --- mteb/benchmarks/_create_table.py | 124 ++++++++++++++++++ mteb/benchmarks/benchmark.py | 9 ++ mteb/benchmarks/benchmarks/rteb_benchmarks.py | 20 +-- 3 files changed, 143 insertions(+), 10 deletions(-) diff --git a/mteb/benchmarks/_create_table.py b/mteb/benchmarks/_create_table.py index 08d3c4ef1d..f4b3dd890d 100644 --- a/mteb/benchmarks/_create_table.py +++ b/mteb/benchmarks/_create_table.py @@ -254,3 +254,127 @@ def _create_per_task_table_from_benchmark_results( ) return per_task + + +def _create_summary_table_mean_public_private( + benchmark_results: BenchmarkResults, +) -> pd.DataFrame: + """Create summary table from BenchmarkResults. + + Returns a DataFrame with one row per model containing summary statistics + and task type averages. + + Args: + benchmark_results: BenchmarkResults object containing model results + + Returns: + DataFrame with model summaries, ready for styling in the leaderboard + """ + data = benchmark_results.to_dataframe(format="long") + + if data.empty: + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + public_task_name = benchmark_results.filter_tasks(privacy="public").task_names + private_task_name = benchmark_results.filter_tasks(privacy="private").task_names + # Convert to DataFrame and pivot + per_task = data.pivot(index="model_name", columns="task_name", values="score") + + # Remove models with no scores + to_remove = per_task.isna().all(axis="columns") + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return no_results_frame + + models_to_remove = list(per_task[to_remove].index) + per_task = per_task.drop(models_to_remove, axis=0) + + # Calculate means by task type + mean_per_type = _get_means_per_types(per_task) + mean_per_type = mean_per_type.pivot( + index="model_name", columns="task_type", values="score" + ) + mean_per_type.columns = [ + _split_on_capital(column) for column in mean_per_type.columns + ] + + # Calculate overall means + public_mean = per_task[public_task_name].mean(skipna=False, axis=1) + private_mean = per_task[private_task_name].mean(skipna=False, axis=1) + + # Build joint table + joint_table = mean_per_type.copy() + joint_table = joint_table.drop(models_to_remove, axis=0) + joint_table.insert(0, "mean(public)", public_mean) + joint_table.insert(1, "mean(private)", private_mean) + joint_table["borda_rank"] = _get_borda_rank(per_task) + joint_table = joint_table.sort_values("borda_rank", ascending=True) + joint_table = joint_table.reset_index() + + # Add model metadata + model_metas = joint_table["model_name"].map(_failsafe_get_model_meta) + joint_table = joint_table[model_metas.notna()] + joint_table["model_link"] = model_metas.map(lambda m: m.reference) + + # Insert model metadata columns + joint_table.insert( + 1, + "Max Tokens", + model_metas.map(lambda m: _format_max_tokens(m.max_tokens)), + ) + joint_table.insert( + 1, + "Embedding Dimensions", + model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"), + ) + joint_table.insert( + 1, + "Number of Parameters", + model_metas.map(lambda m: _format_n_parameters(m.n_parameters)), + ) + joint_table.insert( + 1, + "Memory Usage (MB)", + model_metas.map( + lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown" + ), + ) + + # Add zero-shot percentage + tasks = get_tasks(tasks=list(data["task_name"].unique())) + joint_table.insert( + 1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks)) + ) + joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) + + # Clean up model names (remove HF organization) + joint_table["model_name"] = joint_table["model_name"].map( + lambda name: name.split("/")[-1] + ) + + # Add markdown links to model names + name_w_link = ( + "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")" + ) + joint_table["model_name"] = joint_table["model_name"].mask( + joint_table["model_link"].notna(), name_w_link + ) + joint_table = joint_table.drop(columns=["model_link"]) + + # Rename columns + joint_table = joint_table.rename( + columns={ + "model_name": "Model", + "mean(public)": "Mean (Public)", + "mean(private)": "Mean (Private)", + } + ) + + # Move borda rank to front + joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) + + return joint_table diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index e48b455ad3..c6a570be8b 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -10,6 +10,7 @@ from mteb.benchmarks._create_table import ( _create_per_task_table_from_benchmark_results, _create_summary_table_from_benchmark_results, + _create_summary_table_mean_public_private, ) from mteb.load_results.load_results import load_results @@ -89,3 +90,11 @@ def _create_per_task_table( ) -> pd.DataFrame: """Create per-task table. Called by the leaderboard app.""" return _create_per_task_table_from_benchmark_results(benchmark_results) + + +class RtebBenchmark(Benchmark): + def _create_summary_table( + self, benchmark_results: BenchmarkResults + ) -> pd.DataFrame: + """Create summary table. Called by the leaderboard app.""" + return _create_summary_table_mean_public_private(benchmark_results) diff --git a/mteb/benchmarks/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py index d124b4c208..5136b9dbe8 100644 --- a/mteb/benchmarks/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/benchmarks/rteb_benchmarks.py @@ -1,7 +1,7 @@ # RTEB Benchmarks - Retrieval Embedding Benchmark from __future__ import annotations -from mteb.benchmarks.benchmark import Benchmark +from mteb.benchmarks.benchmark import RtebBenchmark from mteb.overview import get_tasks RTEB_CITATION = r"""@article{rteb2024, @@ -10,7 +10,7 @@ year = {2024}, }""" -RTEB_MAIN = Benchmark( +RTEB_MAIN = RtebBenchmark( name="RTEB(beta)", display_name="RTEB Retrieval Embedding Benchmark", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg", @@ -53,7 +53,7 @@ contacts=["fzowl"], ) -RTEB_ENGLISH = Benchmark( +RTEB_ENGLISH = RtebBenchmark( name="RTEB(eng, beta)", display_name="RTEB English", icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", @@ -88,7 +88,7 @@ contacts=["fzowl"], ) -RTEB_FRENCH = Benchmark( +RTEB_FRENCH = RtebBenchmark( name="RTEB(fr, beta)", display_name="RTEB French", icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", @@ -106,7 +106,7 @@ contacts=["fzowl"], ) -RTEB_GERMAN = Benchmark( +RTEB_GERMAN = RtebBenchmark( name="RTEB(deu, beta)", display_name="RTEB German", icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", @@ -124,7 +124,7 @@ contacts=["fzowl"], ) -RTEB_JAPANESE = Benchmark( +RTEB_JAPANESE = RtebBenchmark( name="RTEB(jpn, beta)", display_name="RTEB Japanese", icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", @@ -140,7 +140,7 @@ contacts=["fzowl"], ) -RTEB_FINANCE = Benchmark( +RTEB_FINANCE = RtebBenchmark( name="RTEB(fin, beta)", display_name="RTEB Finance", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-price-tag.svg", @@ -161,7 +161,7 @@ contacts=["fzowl"], ) -RTEB_LEGAL = Benchmark( +RTEB_LEGAL = RtebBenchmark( name="RTEB(Law, beta)", display_name="RTEB Legal", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", @@ -182,7 +182,7 @@ contacts=["fzowl"], ) -RTEB_CODE = Benchmark( +RTEB_CODE = RtebBenchmark( name="RTEB(Code, beta)", display_name="RTEB Code", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", @@ -204,7 +204,7 @@ contacts=["fzowl"], ) -RTEB_HEALTHCARE = Benchmark( +RTEB_HEALTHCARE = RtebBenchmark( name="RTEB(Health, beta)", display_name="RTEB Healthcare", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", From cefd826a096fe4a9b47ffe5dcc9fac88c23000ef Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Tue, 30 Sep 2025 16:17:26 +0800 Subject: [PATCH 17/20] feat - getattr --- mteb/load_results/task_results.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 4c0dc53d47..bffb687435 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -168,6 +168,8 @@ def from_task_results( "languages": eval_langs, } flat_scores[split].append(_scores) + # AttributeError: 'Namespace' object has no attribute 'is_public' + is_public = getattr(task_meta, "is_public", True) return TaskResult( dataset_revision=task.metadata.revision, @@ -176,7 +178,7 @@ def from_task_results( scores=flat_scores, evaluation_time=evaluation_time, kg_co2_emissions=kg_co2_emissions, - is_pubic=task.metadata.is_public, + is_pubic=is_public, ) @field_validator("scores") From fb5902ad6bfdb9b9cac9c21458f88a6b31e71e17 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Tue, 30 Sep 2025 17:37:44 +0800 Subject: [PATCH 18/20] feat - adjust privacy filter logic --- mteb/load_results/benchmark_results.py | 2 +- mteb/load_results/task_results.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index 261e8f6b00..679415b081 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -129,7 +129,7 @@ def filter_tasks( if not any(modality in task_modalities for modality in modalities): continue if (privacy is not None) and ( - task_result.is_public != (privacy == "public") + task_result.task_is_public != (privacy == "public") ): continue new_task_results.append(task_result) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index bffb687435..4d3fe8b570 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -146,7 +146,6 @@ class TaskResult(BaseModel): scores: dict[Split, list[ScoresDict]] evaluation_time: float | None kg_co2_emissions: float | None = None - is_public: bool = True @classmethod def from_task_results( @@ -234,7 +233,7 @@ def task_type(self) -> str: @property def task_is_public(self) -> bool: - return self.task.metadata.is_public + return getattr(self.task.metadata, "is_public", True) def to_dict(self) -> dict: return self.model_dump() From 4dec6d48cdb108d7efae4e4c7c006d1bd36cc6a3 Mon Sep 17 00:00:00 2001 From: smile Date: Tue, 30 Sep 2025 21:44:21 +0800 Subject: [PATCH 19/20] feat - enhance summary table column renaming and add 'is_public' field mapping --- mteb/benchmarks/_create_table.py | 16 +++++++++------- mteb/leaderboard/app.py | 2 ++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/mteb/benchmarks/_create_table.py b/mteb/benchmarks/_create_table.py index f4b3dd890d..e977b6e2fc 100644 --- a/mteb/benchmarks/_create_table.py +++ b/mteb/benchmarks/_create_table.py @@ -366,13 +366,15 @@ def _create_summary_table_mean_public_private( joint_table = joint_table.drop(columns=["model_link"]) # Rename columns - joint_table = joint_table.rename( - columns={ - "model_name": "Model", - "mean(public)": "Mean (Public)", - "mean(private)": "Mean (Private)", - } - ) + rename_dict = { + "model_name": "Model", + "mean(public)": "Mean (Public)", + "mean(private)": "Mean (Private)", + } + # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task) + if "Retrieval" in joint_table.columns: + rename_dict["Retrieval"] = "Mean (Task)" + joint_table = joint_table.rename(columns=rename_dict) # Move borda rank to front joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 88f56ffd22..c37a8edc79 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -123,6 +123,7 @@ def update_task_info(task_names: str) -> gr.DataFrame: "reference", "main_score", "modalities", + "is_public", ] ) df["languages"] = df["languages"].map(format_list) @@ -138,6 +139,7 @@ def update_task_info(task_names: str) -> gr.DataFrame: "domains": "Domains", "main_score": "Metric", "modalities": "Modality", + "is_public": "Public", } ) df = df.drop(columns="reference") From 73f144ce863b8e6e957d7ef9023cc3b9e4649bff Mon Sep 17 00:00:00 2001 From: ethan Date: Wed, 1 Oct 2025 00:00:44 +0800 Subject: [PATCH 20/20] fix: remove unused 'is_public' attribute from TaskResult --- mteb/load_results/task_results.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 4d3fe8b570..3aaf70fd79 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -167,8 +167,6 @@ def from_task_results( "languages": eval_langs, } flat_scores[split].append(_scores) - # AttributeError: 'Namespace' object has no attribute 'is_public' - is_public = getattr(task_meta, "is_public", True) return TaskResult( dataset_revision=task.metadata.revision, @@ -177,7 +175,6 @@ def from_task_results( scores=flat_scores, evaluation_time=evaluation_time, kg_co2_emissions=kg_co2_emissions, - is_pubic=is_public, ) @field_validator("scores")