From a09734fbeb32a6486446a69df11f9b10d248799b Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 11 Nov 2025 18:58:49 +0300 Subject: [PATCH 01/32] add pytyped --- mteb/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mteb/py.typed diff --git a/mteb/py.typed b/mteb/py.typed new file mode 100644 index 0000000000..e69de29bb2 From 98eab298b5eef4a25e1dcff9f5b0ce10db98f95f Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 11 Nov 2025 19:17:43 +0300 Subject: [PATCH 02/32] start typing --- mteb/abstasks/_stratification.py | 6 +++--- mteb/languages/language_scripts.py | 2 +- pyproject.toml | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mteb/abstasks/_stratification.py b/mteb/abstasks/_stratification.py index e647717a4d..954afdfab0 100644 --- a/mteb/abstasks/_stratification.py +++ b/mteb/abstasks/_stratification.py @@ -211,8 +211,8 @@ def _prepare_stratification( rows = sp.lil_matrix(y).rows rows_used = dict.fromkeys(range(self.n_samples), False) all_combinations = [] - per_row_combinations = [[] for i in range(self.n_samples)] - samples_with_combination = {} + per_row_combinations: list[list[int]] = [[] for i in range(self.n_samples)] + samples_with_combination: dict[str, int] = {} folds = [[] for _ in range(self.n_splits)] # type: ignore # for every row @@ -229,7 +229,7 @@ def _prepare_stratification( all_combinations.append(combination) per_row_combinations[sample_index].append(combination) - all_combinations = [list(x) for x in set(all_combinations)] + all_combinations: list[list[int]] = [list(x) for x in set(all_combinations)] self.desired_samples_per_combination_per_fold = { combination: np.array( diff --git a/mteb/languages/language_scripts.py b/mteb/languages/language_scripts.py index b8f05492f0..f0a6f2f9cc 100644 --- a/mteb/languages/language_scripts.py +++ b/mteb/languages/language_scripts.py @@ -3,7 +3,7 @@ 
from typing_extensions import Self -from mteb.languages import check_language_code +from mteb.languages.check_language_code import check_language_code @dataclass diff --git a/pyproject.toml b/pyproject.toml index bcb15dc97a..9edd842544 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,9 +127,9 @@ typing = [ "types-simplejson>=3.20.0.20250822", "types-tqdm>=4.67.0.20250809", "types-tensorflow>=2.18.0.20250809", - # stubs require python >=3.10 - # "pandas-stubs>=2.3.2.250827", - # "scipy-stubs>=1.15.3.0", + "pandas-stubs>=2.3.2.250926", + "scipy-stubs>=1.15.3.0", + "types-defusedxml>=0.7.0.20250822", ] dev = [ {include-group = "lint"}, From e028aea938776dde48d710b414ffd2c49ef2096b Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 11 Nov 2025 22:57:23 +0300 Subject: [PATCH 03/32] finish evaluators --- mteb/cli/build_cli.py | 10 ++-- mteb/cli/generate_model_card.py | 10 ++-- mteb/deprecated_evaluator.py | 81 ++++++++++++++++++++------------- mteb/evaluate.py | 62 +++++++++++++++---------- mteb/models/instruct_wrapper.py | 13 ++++-- 5 files changed, 105 insertions(+), 71 deletions(-) diff --git a/mteb/cli/build_cli.py b/mteb/cli/build_cli.py index ac551a57fc..4e959d6522 100644 --- a/mteb/cli/build_cli.py +++ b/mteb/cli/build_cli.py @@ -7,12 +7,12 @@ from rich.logging import RichHandler import mteb +from mteb.abstasks.abstask import AbsTask from mteb.cache import ResultCache +from mteb.cli._display_tasks import _display_benchmarks, _display_tasks from mteb.cli.generate_model_card import generate_model_card from mteb.evaluate import OverwriteStrategy -from ._display_tasks import _display_benchmarks, _display_tasks - logger = logging.getLogger(__name__) @@ -53,7 +53,7 @@ def run(args: argparse.Namespace) -> None: if args.benchmarks: benchmarks = mteb.get_benchmarks(names=args.benchmarks) - tasks = [t for b in benchmarks for t in b.tasks] + tasks = tuple(t for b in benchmarks for t in b.tasks) else: tasks = 
mteb.get_tasks( categories=args.categories, @@ -285,9 +285,9 @@ def _create_meta(args: argparse.Namespace) -> None: "Output path already exists, use --overwrite to overwrite." ) - tasks = [] + tasks: list[AbsTask] = [] if tasks_names is not None: - tasks = mteb.get_tasks(tasks_names) + tasks = list(mteb.get_tasks(tasks_names)) if benchmarks is not None: benchmarks = mteb.get_benchmarks(benchmarks) for benchmark in benchmarks: diff --git a/mteb/cli/generate_model_card.py b/mteb/cli/generate_model_card.py index c50e90a6f8..3c77d33635 100644 --- a/mteb/cli/generate_model_card.py +++ b/mteb/cli/generate_model_card.py @@ -1,4 +1,5 @@ import logging +from collections.abc import Sequence from pathlib import Path from huggingface_hub import ModelCard, ModelCardData, repo_exists @@ -12,7 +13,7 @@ def generate_model_card( model_name: str, - tasks: list[AbsTask] | None = None, + tasks: Sequence[AbsTask] | None = None, existing_model_card_id_or_path: str | Path | None = None, results_cache: ResultCache = ResultCache(), output_path: Path = Path("model_card.md"), @@ -47,8 +48,8 @@ def generate_model_card( for task_result in models_results.task_results: eval_results.extend(task_result.get_hf_eval_results()) - existing_model_card_data = ( - existing_model_card.data if existing_model_card else ModelCardData() + existing_model_card_data: ModelCardData = ( + existing_model_card.data if existing_model_card else ModelCardData() # type: ignore[assignment] ) if existing_model_card_data.eval_results is None: @@ -88,7 +89,8 @@ def generate_model_card( benchmark_results, existing_model_card ) - if push_to_hub: + if push_to_hub and existing_model_card_id_or_path: + existing_model_card_id_or_path = str(existing_model_card_id_or_path) if repo_exists(existing_model_card_id_or_path): existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token) else: diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index c3f7712cf3..f88a81192c 100644 --- 
a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -8,11 +8,12 @@ from collections.abc import Iterable from copy import deepcopy from datetime import datetime -from itertools import chain from pathlib import Path from time import time -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast +from mteb import CrossEncoderProtocol +from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.abstasks.task_metadata import TaskCategory, TaskType from mteb.models.get_model_meta import ( _model_meta_from_cross_encoder, @@ -70,13 +71,20 @@ def __init__( """ from mteb.benchmarks import Benchmark - self.tasks = list(tasks) - if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark): + if isinstance(tasks, list) and all( + isinstance(task, Benchmark) for task in tasks + ): self.benchmarks = tasks - self.tasks = list(chain.from_iterable(self.tasks)) + self.tasks = [task for bench in tasks for task in bench.tasks] + elif isinstance(tasks, list) and all( + isinstance(task, AbsTask) for task in tasks + ): + self.tasks = list(tasks) + else: + raise ValueError("tasks must be a list of AbsTask or Benchmark instances.") self.err_logs_path = Path(err_logs_path) - self.last_evaluated_splits = {} + self._last_evaluated_splits: dict[str, list[str]] = {} @property def available_tasks(self) -> list[str]: @@ -89,7 +97,7 @@ def available_task_types(self) -> list[TaskType]: return sorted({x.metadata.type for x in self.tasks}) @property - def available_task_categories(self) -> set[TaskCategory]: + def available_task_categories(self) -> set[TaskCategory | None]: """Set of available task categories.""" return {x.metadata.category for x in self.tasks} @@ -236,13 +244,14 @@ def _merge_results( merged_kg_co2_emissions = None if existing_kg_co2_emissions and new_kg_co2_emissions: merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions + existing_evaluation_time = existing_results.evaluation_time or 0 + new_evaluation_time 
= new_results.evaluation_time or 0 merged_results = TaskResult( dataset_revision=new_results.dataset_revision, task_name=new_results.task_name, mteb_version=new_results.mteb_version, scores=merged_scores, - evaluation_time=existing_results.evaluation_time - + new_results.evaluation_time, + evaluation_time=existing_evaluation_time + new_evaluation_time, kg_co2_emissions=merged_kg_co2_emissions, ) @@ -311,13 +320,16 @@ def run( elif verbosity == 3: datasets.logging.set_verbosity(logging.DEBUG) - meta = self.create_model_meta(model) - output_path = self._create_output_folder(meta, output_folder) - + mteb_model: MTEBModels if isinstance(model, SentenceTransformer): - model = SentenceTransformerEncoderWrapper(model) + mteb_model = cast(EncoderProtocol, SentenceTransformerEncoderWrapper(model)) elif isinstance(model, CrossEncoder): - model = CrossEncoderWrapper(model) + mteb_model = cast(CrossEncoderProtocol, CrossEncoderWrapper(model)) + else: + mteb_model = cast(MTEBModels, model) + + meta = self.create_model_meta(mteb_model) + output_path = self._create_output_folder(meta, output_folder) # Disable co2_tracker for API models if "API" in meta.framework: @@ -338,18 +350,20 @@ def run( ) # save them in case we re-use the object (e.g. for reranking) # To evaluate missing splits, we keep track of the task name and the corresponding splits. 
- self.last_evaluated_splits = {} + self._last_evaluated_splits = {} while len(self.tasks) > 0: task = self.tasks[0] logger.info( f"\n\n********************** Evaluating {task.metadata.name} **********************" ) + save_path: Path | None = None if task.is_aggregate: - self_ = MTEB(tasks=task.metadata.tasks) - task_results = self_.run( - model, + aggregated_task = cast(AbsTaskAggregate, task) + self_ = MTEB(tasks=aggregated_task.metadata.tasks) + aggregated_task_results = self_.run( + mteb_model, verbosity=verbosity - 1, output_folder=output_folder, eval_splits=eval_splits, @@ -360,11 +374,13 @@ def run( encode_kwargs=encode_kwargs, **kwargs, ) - new_results = task.combine_task_results(task_results) + new_results = aggregated_task.combine_task_results( + aggregated_task_results + ) evaluation_results.append(new_results) if output_path: - save_path = output_path / f"{task.metadata.name}.json" + save_path = output_path / f"{aggregated_task.metadata.name}.json" new_results.to_disk(save_path) del self.tasks[0] continue @@ -387,7 +403,6 @@ def run( task_subsets = task.hf_subsets existing_results = None - save_path = None final_splits_to_run = task_eval_splits missing_evaluations = self._get_missing_evaluations( existing_results, @@ -437,7 +452,7 @@ def run( logger.info( f"No splits to evaluate for {task.metadata.name}. Skipping evaluation." 
) - self.last_evaluated_splits[task.metadata.name] = [] + self._last_evaluated_splits[task.metadata.name] = [] del self.tasks[0] continue @@ -445,11 +460,11 @@ def run( task.check_if_dataset_is_superseded() task.load_data() - task_results = {} + task_results: dict[str, dict[str, dict[str, Any]]] = {} evaluation_time = 0 kg_co2_emissions: int | None = 0 if co2_tracker else None - self.last_evaluated_splits[task.metadata.name] = [] + self._last_evaluated_splits[task.metadata.name] = [] for split in final_splits_to_run: info = missing_evaluations[split] @@ -470,7 +485,9 @@ def run( if co2_tracker: try: - from codecarbon import EmissionsTracker + from codecarbon import ( + EmissionsTracker, # type: ignore[import-not-found] + ) except ImportError: raise ImportError( "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions." @@ -486,7 +503,7 @@ def run( ) as tracker: results, tick, tock = self._run_eval( task, - model, + mteb_model, split, encode_kwargs=encode_kwargs, subsets_to_run=subsets_to_run, @@ -499,7 +516,7 @@ def run( else: results, tick, tock = self._run_eval( task, - model, + mteb_model, split, subsets_to_run=subsets_to_run, encode_kwargs=encode_kwargs, @@ -515,7 +532,7 @@ def run( if verbosity >= 1: logger.info(f"Scores: {task_results[split]}") - self.last_evaluated_splits[task.metadata.name].append(split) + self._last_evaluated_splits[task.metadata.name].append(split) # Create new TaskResult new_results = TaskResult.from_task_results( @@ -526,14 +543,14 @@ def run( ) # Merge with existing if needed - if output_path and save_path.exists(): + if output_path and save_path and save_path.exists(): existing_results = TaskResult.from_disk(save_path) if existing_results: merged_results = self._merge_results(existing_results, new_results) else: merged_results = new_results - if output_path: + if output_path and save_path: merged_results.to_disk(save_path) evaluation_results.append(merged_results) @@ -608,7 +625,7 
@@ def _get_last_evaluated_splits(self) -> dict[str, list[str]]: Tasks with empty lists indicate that results already existed and no splits were evaluated. """ return deepcopy( - {task: list(splits) for task, splits in self.last_evaluated_splits.items()} + {task: list(splits) for task, splits in self._last_evaluated_splits.items()} ) @staticmethod @@ -665,7 +682,7 @@ def _get_missing_evaluations( return missing_evaluations @staticmethod - def _get_model_meta(model: EncoderProtocol) -> ModelMeta: + def _get_model_meta(model: MTEBModels) -> ModelMeta: from sentence_transformers import CrossEncoder, SentenceTransformer if isinstance(model, CrossEncoder): diff --git a/mteb/evaluate.py b/mteb/evaluate.py index 12d5745adf..c9f8bf1cbd 100644 --- a/mteb/evaluate.py +++ b/mteb/evaluate.py @@ -9,6 +9,7 @@ from tqdm.auto import tqdm +from mteb import Benchmark from mteb._helpful_enum import HelpfulStrEnum from mteb.abstasks import AbsTaskRetrieval from mteb.abstasks.abstask import AbsTask @@ -86,27 +87,28 @@ def _sanitize_model( ) -> tuple[MTEBModels | ModelMeta, ModelMeta, ModelName, Revision]: from sentence_transformers import CrossEncoder, SentenceTransformer + wrapped: MTEBModels | ModelMeta if isinstance(model, SentenceTransformer): - _mdl = SentenceTransformerEncoderWrapper(model) - meta = _mdl.mteb_model_meta - _mdl = cast(EncoderProtocol, _mdl) - model = _mdl + wrapper = SentenceTransformerEncoderWrapper(model) + meta = wrapper.mteb_model_meta + wrapped = cast(EncoderProtocol, wrapper) elif isinstance(model, CrossEncoder): - _mdl = CrossEncoderWrapper(model) - _mdl = cast(CrossEncoderProtocol, _mdl) - meta = _mdl.mteb_model_meta - model = _mdl + cross_encoder_wrapper = CrossEncoderWrapper(model) + meta = cross_encoder_wrapper.mteb_model_meta + wrapped = cast(CrossEncoderProtocol, cross_encoder_wrapper) elif hasattr(model, "mteb_model_meta"): - meta = model.mteb_model_meta # type: ignore[attr-defined] + meta = getattr(model, "mteb_model_meta") if not isinstance(meta, 
ModelMeta): meta = _create_empty_model_meta() + wrapped = cast(MTEBModels | ModelMeta, model) else: meta = _create_empty_model_meta() if not isinstance(model, ModelMeta) else model + wrapped = meta model_name = cast(str, meta.name) model_revision = cast(str, meta.revision) - return model, meta, model_name, model_revision + return wrapped, meta, model_name, model_revision def _evaluate_task( @@ -161,7 +163,7 @@ def _evaluate_task( if not data_loaded: task.load_data() - evaluation_time = 0 + evaluation_time = 0.0 for split, hf_subsets in splits.items(): tick = time() @@ -194,7 +196,7 @@ def _evaluate_task( def _check_model_modalities( model: ModelMeta, - tasks: AbsTask | Iterable[AbsTask], + tasks: AbsTask | Benchmark | Iterable[AbsTask | Benchmark], ) -> None: """Check that model modalities are compatible with task modalities. @@ -208,12 +210,21 @@ def _check_model_modalities( return model_modalities = set(model.modalities) + check_tasks: Iterable[AbsTask] = [] if isinstance(tasks, AbsTask): - tasks = [tasks] + check_tasks = [tasks] + elif isinstance(tasks, list) and all(isinstance(task, Benchmark) for task in tasks): + benchmarks = cast(Iterable[Benchmark], tasks) + check_tasks = [task for benchmark in benchmarks for task in benchmark.tasks] + elif isinstance(tasks, Benchmark): + benchmark = cast(Benchmark, tasks) + check_tasks = benchmark.tasks + else: + check_tasks = cast(Iterable[AbsTask], tasks) warnings, errors = [], [] - for task in tasks: + for task in check_tasks: # only retrieval tasks have different modalities for query and document and can be run with partial overlaps if isinstance(task, AbsTaskRetrieval): query_mods = set(task.metadata.get_modalities(PromptType.query)) @@ -258,7 +269,7 @@ def _check_model_modalities( def evaluate( model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder, - tasks: AbsTask | Iterable[AbsTask], + tasks: AbsTask | Benchmark | Iterable[AbsTask | Benchmark], *, co2_tracker: bool | None = None, raise_error: bool = 
True, @@ -330,10 +341,10 @@ def evaluate( # AbsTaskAggregate is a special case where we have to run multiple tasks and combine the results if isinstance(tasks, AbsTaskAggregate): - task = cast(AbsTaskAggregate, tasks) + aggregated_task = cast(AbsTaskAggregate, tasks) results = evaluate( model, - task.metadata.tasks, + aggregated_task.metadata.tasks, co2_tracker=co2_tracker, raise_error=raise_error, encode_kwargs=encode_kwargs, @@ -342,7 +353,7 @@ def evaluate( prediction_folder=prediction_folder, show_progress_bar=show_progress_bar, ) - result = task.combine_task_results(results.task_results) + result = aggregated_task.combine_task_results(results.task_results) return ModelResult( model_name=results.model_name, model_revision=results.model_revision, @@ -352,7 +363,8 @@ def evaluate( if isinstance(tasks, AbsTask): task = tasks else: - results = [] + tasks = cast(Iterable[AbsTask], tasks) + evaluate_results = [] tasks_tqdm = tqdm( tasks, desc="Evaluating tasks", @@ -371,20 +383,20 @@ def evaluate( prediction_folder=prediction_folder, show_progress_bar=False, ) - results.extend(_res.task_results) + evaluate_results.extend(_res.task_results) return ModelResult( model_name=_res.model_name, model_revision=_res.model_revision, - task_results=results, + task_results=evaluate_results, ) overwrite_strategy = OverwriteStrategy.from_str(overwrite_strategy) - existing_results = None + existing_results: TaskResult | None = None if cache and overwrite_strategy != OverwriteStrategy.ALWAYS: - results = cache.load_task_result(task.metadata.name, meta) - if results: - existing_results = results + cache_results = cache.load_task_result(task.metadata.name, meta) + if cache_results: + existing_results = cache_results if ( existing_results diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 2a5aae5595..92dc63bfd1 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -17,7 +17,7 @@ def instruct_wrapper( model_name_or_path: 
str, mode: str, - instruction_template: str | Callable[[str], str] | None = None, + instruction_template: str | Callable[[str, PromptType], str] | None = None, **kwargs, ): """Instruct wrapper for models. Uses GritLM to pass instructions to the model. @@ -82,8 +82,11 @@ def encode( logger.info( f"Using instruction: '{instruction}' for task: '{task_metadata.name}'" ) - embeddings = super().encode( - _inputs, instruction=instruction, *args, **kwargs + embeddings = super().encode( # type: ignore[safe-super] + _inputs, + instruction=instruction, + *args, + **kwargs, # type: ignore[arg-type] ) if isinstance(embeddings, torch.Tensor): # sometimes in kwargs can be return_tensors=True @@ -140,7 +143,7 @@ def __init__( ) self.instruction_template = instruction_template - tokenizer_params = {} + tokenizer_params: dict[str, Any] = {} if add_eos_token: tokenizer_params["add_eos_token"] = add_eos_token if max_seq_length is not None: @@ -189,7 +192,7 @@ def encode( The encoded input in a numpy array or torch tensor of the shape (Number of sentences) x (Embedding dimension). 
""" sentences = [text for batch in inputs for text in batch["text"]] - instruction = self.get_task_instruction(task_metadata, prompt_type) + instruction: str | None = self.get_task_instruction(task_metadata, prompt_type) # to passage prompts won't be applied to passages if ( From 86e7efd7745030f2f31d9a9aebcedce3a3f15bad Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 12 Nov 2025 00:58:17 +0300 Subject: [PATCH 04/32] add more types --- mteb/_create_dataloaders.py | 8 +-- .../imagetext_pairclassification_evaluator.py | 5 +- mteb/abstasks/abstask.py | 18 ++++-- .../image/image_text_pair_classification.py | 6 +- mteb/abstasks/task_metadata.py | 10 +-- mteb/abstasks/text/bitext_mining.py | 62 +++++++++++++------ mteb/abstasks/text/reranking.py | 8 ++- mteb/cache.py | 17 +++-- mteb/deprecated_evaluator.py | 8 +-- mteb/filter_tasks.py | 46 +++----------- mteb/get_tasks.py | 22 +++---- mteb/load_results.py | 18 +++--- mteb/models/instruct_wrapper.py | 4 +- mteb/results/benchmark_results.py | 24 ++++--- mteb/results/task_result.py | 4 +- 15 files changed, 133 insertions(+), 127 deletions(-) diff --git a/mteb/_create_dataloaders.py b/mteb/_create_dataloaders.py index 0a8d6725b9..b523cd8f5d 100644 --- a/mteb/_create_dataloaders.py +++ b/mteb/_create_dataloaders.py @@ -115,7 +115,7 @@ def _create_text_dataloader_for_queries( def _convert_conv_history_to_query( - row: dict[str, list[str] | Conversation], + row: dict[str, str | list[str] | Conversation], ) -> dict[str, str | Conversation]: """Convert a conversation history to a single query string. 
@@ -130,10 +130,10 @@ def _convert_conv_history_to_query( conversation = row["text"] # if it's a list of strings, just join them if isinstance(conversation, list) and isinstance(conversation[0], str): - conversation = cast(list[str], conversation) - conv_str = "; ".join(conversation) + conversation_ = cast(list[str], conversation) + conv_str = "; ".join(conversation_) current_conversation = [ - ConversationTurn(role="user", content=message) for message in conversation + ConversationTurn(role="user", content=message) for message in conversation_ ] if not _warned_about_user_role: logger.warning( diff --git a/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py b/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py index 1b25ae62cc..33dd7b2fa6 100644 --- a/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +++ b/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py @@ -1,4 +1,5 @@ import logging +from collections.abc import Sequence from typing import Any import torch @@ -56,8 +57,8 @@ class ImageTextPairClassificationEvaluator(Evaluator): def __init__( self, dataset, - images_column_names: str | list[str], - texts_column_names: str | list[str], + images_column_names: str | Sequence[str], + texts_column_names: str | Sequence[str], num_images_per_sample: int, num_texts_per_sample: int, task_metadata: TaskMetadata, diff --git a/mteb/abstasks/abstask.py b/mteb/abstasks/abstask.py index e4dbfa3c9c..1d82f596b6 100644 --- a/mteb/abstasks/abstask.py +++ b/mteb/abstasks/abstask.py @@ -1,10 +1,10 @@ import json import logging from abc import ABC, abstractmethod -from collections.abc import Sequence +from collections.abc import Mapping, Sequence from copy import copy from pathlib import Path -from typing import Any, cast +from typing import Any, TypedDict, cast import numpy as np from datasets import ClassLabel, Dataset, DatasetDict, load_dataset @@ -60,6 +60,12 @@ def _multilabel_subsampling( return dataset_dict +class 
AbsMetrics(TypedDict): + """The abstract class for the metrics returned by the tasks""" + + ... + + class AbsTask(ABC): """The abstract class for the tasks @@ -123,7 +129,7 @@ def evaluate( encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, **kwargs: Any, - ) -> dict[HFSubset, ScoresDict]: + ) -> Mapping[HFSubset, ScoresDict]: """Evaluates an MTEB compatible model on the task. Args: @@ -198,12 +204,12 @@ def _evaluate_subset( model: EncoderProtocol, data_split: Dataset, *, - encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, + encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, **kwargs: Any, - ) -> ScoresDict: + ) -> AbsMetrics: raise NotImplementedError( "If you are using the default evaluate method, you must implement _evaluate_subset method." ) @@ -499,7 +505,7 @@ def filter_languages( self.hf_subsets = subsets_to_keep return self - def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None: + def _add_main_score(self, scores: dict[HFSubset, ScoresDict | AbsMetrics]) -> None: scores["main_score"] = scores[self.metadata.main_score] def _upload_dataset_to_hub( diff --git a/mteb/abstasks/image/image_text_pair_classification.py b/mteb/abstasks/image/image_text_pair_classification.py index 829a70a45e..0cae5dde64 100644 --- a/mteb/abstasks/image/image_text_pair_classification.py +++ b/mteb/abstasks/image/image_text_pair_classification.py @@ -1,7 +1,7 @@ import logging from collections.abc import Sequence from pathlib import Path -from typing import Any, TypedDict +from typing import Any import torch from datasets import Dataset, concatenate_datasets @@ -11,7 +11,7 @@ calculate_image_statistics, calculate_text_statistics, ) -from mteb.abstasks.abstask import AbsTask +from mteb.abstasks.abstask import AbsMetrics, AbsTask from mteb.models.models_protocols import EncoderProtocol from mteb.types.statistics import ( ImageStatistics, @@ -36,7 +36,7 @@ class 
ImageTextPairClassificationDescriptiveStatistics(SplitDescriptiveStatistic image_statistics: ImageStatistics -class ImageTextPairClassificationMetrics(TypedDict): +class ImageTextPairClassificationMetrics(AbsMetrics): """ImageTextPairClassification metrics. Attributes: diff --git a/mteb/abstasks/task_metadata.py b/mteb/abstasks/task_metadata.py index f970b25966..c6ac1b00b6 100644 --- a/mteb/abstasks/task_metadata.py +++ b/mteb/abstasks/task_metadata.py @@ -150,7 +150,7 @@ "InstructionReranking", ) + MIEB_TASK_TYPE -TaskType = Literal[_TASK_TYPE] +TaskType = Literal[_TASK_TYPE] # type: ignore[valid-type] """The type of the task. E.g. includes "Classification", "Retrieval" and "Clustering".""" @@ -193,7 +193,9 @@ PromptDict = TypedDict( - "PromptDict", {prompt_type.value: str for prompt_type in PromptType}, total=False + "PromptDict", + {prompt_type.value: str for prompt_type in PromptType}, + total=False, # type: ignore[misc] ) """A dictionary containing the prompt used for the task. @@ -447,7 +449,7 @@ def get_modalities(self, prompt_type: PromptType | None = None) -> list[Modaliti Raises: ValueError: If the prompt type is not recognized. 
""" - if prompt_type is None: + if prompt_type is None or self.category is None: return self.modalities query_modalities, doc_modalities = self.category.split("2") category_to_modality: dict[str, Modalities] = { @@ -711,7 +713,7 @@ def _hf_languages(self) -> list[str]: readme_langs.append(lang_name) return sorted(set(readme_langs)) - def _hf_license(self) -> str: + def _hf_license(self) -> str | None: dataset_license = self.license if dataset_license: license_mapping = { diff --git a/mteb/abstasks/text/bitext_mining.py b/mteb/abstasks/text/bitext_mining.py index 961c5caf60..6112f391a1 100644 --- a/mteb/abstasks/text/bitext_mining.py +++ b/mteb/abstasks/text/bitext_mining.py @@ -1,15 +1,16 @@ import logging from collections import defaultdict from pathlib import Path -from typing import Any, ClassVar, TypedDict +from typing import Any, ClassVar, cast from datasets import Dataset, DatasetDict from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score from mteb._evaluators import BitextMiningEvaluator from mteb.abstasks._statistics_calculation import calculate_text_statistics -from mteb.abstasks.abstask import AbsTask +from mteb.abstasks.abstask import AbsMetrics, AbsTask from mteb.models import EncoderProtocol, MTEBModels +from mteb.models.models_protocols import CrossEncoderProtocol, SearchProtocol from mteb.types import HFSubset, ScoresDict from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics @@ -36,7 +37,7 @@ class BitextDescriptiveStatistics(SplitDescriptiveStatistics): sentence2_statistics: TextStatistics -class BitextMiningMetrics(TypedDict): +class BitextMiningMetrics(AbsMetrics): """Metrics for BitextMining tasks Attributes: @@ -78,6 +79,23 @@ def evaluate( **kwargs: Any, ) -> dict[HFSubset, ScoresDict]: """Added load for "parallel" datasets""" + if isinstance(model, CrossEncoderProtocol) and not self._support_cross_encoder: + raise TypeError( + f"Model {model} is a CrossEncoder, but this task 
{self.metadata.name} does not support CrossEncoders. " + "Please use a Encoder model instead." + ) + + # encoders might implement search protocols + if ( + isinstance(model, SearchProtocol) + and not isinstance(model, EncoderProtocol) + and not self._support_search + ): + raise TypeError( + f"Model {model} is a SearchProtocol, but this task {self.metadata.name} does not support Search. " + "Please use a Encoder model instead." + ) + if not self.data_loaded: self.load_data() @@ -87,11 +105,16 @@ def evaluate( if subsets_to_run is not None: hf_subsets = [s for s in hf_subsets if s in subsets_to_run] - scores = {} + encoder_model = cast(EncoderProtocol, model) + + if self.dataset is None: + raise ValueError("Dataset is not loaded.") + + scores: dict[str, BitextMiningMetrics] = {} if self.parallel_subsets: - scores = self._evaluate_subset( - model, - self.dataset[split], # type: ignore + scores = self._evaluate_subset( # type: ignore[assignment] + encoder_model, + self.dataset[split], parallel=True, hf_split=split, hf_subset="parallel", @@ -109,8 +132,8 @@ def evaluate( data_split = self.dataset[split] else: data_split = self.dataset[hf_subset][split] - scores[hf_subset] = self._evaluate_subset( - model, + scores[hf_subset] = self._evaluate_subset( # type: ignore[assignment] + encoder_model, data_split, hf_split=split, hf_subset=hf_subset, @@ -124,21 +147,21 @@ def evaluate( def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]: pairs = self._DEFAULT_PAIR if parallel: - pairs = [langpair.split("-") for langpair in self.hf_subsets] + pairs = [langpair.split("-") for langpair in self.hf_subsets] # type: ignore[misc] return pairs - def _evaluate_subset( + def _evaluate_subset( # type: ignore[override] self, model: EncoderProtocol, data_split: Dataset, *, hf_split: str, hf_subset: str, - parallel: bool = False, encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, + parallel: bool = False, **kwargs, - ) -> ScoresDict: + ) -> BitextMiningMetrics | 
dict[str, BitextMiningMetrics]: pairs = self._get_pairs(parallel) evaluator = BitextMiningEvaluator( @@ -250,8 +273,11 @@ def _calculate_descriptive_statistics_from_split( ) def _push_dataset_to_hub(self, repo_name: str) -> None: + if self.dataset is None: + raise ValueError("Dataset is not loaded.") + if self.metadata.is_multilingual: - dataset = defaultdict(dict) + dataset: dict[str, dict[str, list[str]]] = defaultdict(dict) for config in self.metadata.eval_langs: logger.info(f"Converting {config} of {self.metadata.name}") @@ -266,10 +292,10 @@ def _push_dataset_to_hub(self, repo_name: str) -> None: for split in self.dataset[config]: dataset[split][lang_1] = self.dataset[config][split][sent_1] dataset[split][lang_2] = self.dataset[config][split][sent_2] - for split in dataset: - dataset[split] = Dataset.from_dict(dataset[split]) - dataset = DatasetDict(dataset) - dataset.push_to_hub(repo_name) + dataset_dict = DatasetDict( + {split: Dataset.from_dict(dataset[split]) for split in dataset} + ) + dataset_dict.push_to_hub(repo_name) else: sentences = {} for split in self.dataset: diff --git a/mteb/abstasks/text/reranking.py b/mteb/abstasks/text/reranking.py index 13ceebdd77..e675a11084 100644 --- a/mteb/abstasks/text/reranking.py +++ b/mteb/abstasks/text/reranking.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -OLD_FORMAT_RERANKING_TASKS = [] +OLD_FORMAT_RERANKING_TASKS: list[str] = [] @deprecated( @@ -105,7 +105,9 @@ def transform_old_dataset_format(self, given_dataset: Dataset | None = None): ) given_dataset = copy(given_dataset) - self.dataset = defaultdict(lambda: defaultdict(dict)) + self.dataset: dict[str, dict[str, RetrievalSplitData]] = defaultdict( + lambda: defaultdict(dict) # type: ignore[arg-type] + ) hf_subsets = self.hf_subsets @@ -127,7 +129,7 @@ def transform_old_dataset_format(self, given_dataset: Dataset | None = None): for split in cur_dataset: corpus = [] queries = [] - relevant_docs = defaultdict(dict) + relevant_docs: dict[str, 
dict[str, int]] = defaultdict(dict) top_ranked = defaultdict(list) # Create an enumerated dataset to pass indices diff --git a/mteb/cache.py b/mteb/cache.py index 6175b20c54..7540c2aa9c 100644 --- a/mteb/cache.py +++ b/mteb/cache.py @@ -412,7 +412,8 @@ def _filter_paths_by_model_and_revision( if (p.parent.parent.name, p.parent.name) in name_and_revision ] - model_names = {m.replace("/", "__").replace(" ", "_") for m in models} + str_models = cast(Sequence[str], models) + model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models} return [p for p in paths if p.parent.parent.name in model_names] @staticmethod @@ -475,7 +476,7 @@ def load_results( ) models_results = defaultdict(list) - task_names = {} + task_names: dict[str, AbsTask | None] = {} if tasks is not None: for task in tasks: if isinstance(task, AbsTask): @@ -493,9 +494,9 @@ def load_results( ) if validate_and_filter: - task = task_names[task_result.task_name] + task_instance = task_names[task_result.task_name] try: - task_result.validate_and_filter_scores(task=task) + task_result.validate_and_filter_scores(task=task_instance) except Exception as e: logger.info( f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}" @@ -505,7 +506,7 @@ def load_results( models_results[(model_name, revision)].append(task_result) # create BenchmarkResults object - models_results = [ + models_results_object = [ ModelResult( model_name=model_name, model_revision=revision, @@ -514,8 +515,6 @@ def load_results( for (model_name, revision), task_results in models_results.items() ] - benchmark_results = BenchmarkResults( - model_results=models_results, + return BenchmarkResults( + model_results=models_results_object, ) - - return benchmark_results diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index f88a81192c..d2742844fb 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -5,7 +5,7 @@ import os import sys import traceback -from 
collections.abc import Iterable +from collections.abc import Iterable, Mapping from copy import deepcopy from datetime import datetime from pathlib import Path @@ -460,7 +460,7 @@ def run( task.check_if_dataset_is_superseded() task.load_data() - task_results: dict[str, dict[str, dict[str, Any]]] = {} + task_results: dict[str, Mapping[str, dict[str, Any]]] = {} evaluation_time = 0 kg_co2_emissions: int | None = 0 if co2_tracker else None @@ -485,8 +485,8 @@ def run( if co2_tracker: try: - from codecarbon import ( - EmissionsTracker, # type: ignore[import-not-found] + from codecarbon import ( # type: ignore[import-not-found] + EmissionsTracker, ) except ImportError: raise ImportError( diff --git a/mteb/filter_tasks.py b/mteb/filter_tasks.py index 759e1f03d8..4a051a7f69 100644 --- a/mteb/filter_tasks.py +++ b/mteb/filter_tasks.py @@ -2,7 +2,7 @@ import logging from collections.abc import Sequence -from typing import overload +from typing import TypeVar from mteb.abstasks import ( AbsTask, @@ -32,54 +32,23 @@ def _check_is_valid_language(lang: str) -> None: ) -@overload -def filter_tasks( - tasks: Sequence[AbsTask], - *, - languages: list[str] | None = None, - script: list[str] | None = None, - domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore - categories: list[TaskCategory] | None = None, - modalities: list[Modalities] | None = None, - exclusive_modality_filter: bool = False, - exclude_superseded: bool = False, - exclude_aggregate: bool = False, - exclude_private: bool = False, -) -> list[AbsTask]: ... 
- - -@overload -def filter_tasks( - tasks: Sequence[type[AbsTask]], - *, - languages: list[str] | None = None, - script: list[str] | None = None, - domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore - categories: list[TaskCategory] | None = None, - modalities: list[Modalities] | None = None, - exclusive_modality_filter: bool = False, - exclude_superseded: bool = False, - exclude_aggregate: bool = False, - exclude_private: bool = False, -) -> list[type[AbsTask]]: ... +T = TypeVar("T", AbsTask, type[AbsTask]) def filter_tasks( - tasks: Sequence[AbsTask] | Sequence[type[AbsTask]], + tasks: Sequence[T], *, languages: list[str] | None = None, script: list[str] | None = None, domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore + task_types: list[TaskType] | None = None, categories: list[TaskCategory] | None = None, modalities: list[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_superseded: bool = False, exclude_aggregate: bool = False, exclude_private: bool = False, -) -> list[AbsTask] | list[type[AbsTask]]: +) -> list[T]: """Filter tasks based on the specified criteria. Args: @@ -92,7 +61,6 @@ def filter_tasks( task_types: A string specifying the type of task e.g. "Classification" or "Retrieval". If None, all tasks are included. categories: A list of task categories these include "t2t" (text to text), "t2i" (text to image). See TaskMetadata for the full list. exclude_superseded: A boolean flag to exclude datasets which are superseded by another. - eval_splits: A list of evaluation splits to include. If None, all splits are included. modalities: A list of modalities to include. If None, all modalities are included. exclusive_modality_filter: If True, only keep tasks where _all_ filter modalities are included in the task's modalities and ALL task modalities are in filter modalities (exact match). 
@@ -113,12 +81,12 @@ def filter_tasks( """ langs_to_keep = None if languages: - [_check_is_valid_language(lang) for lang in languages] + [_check_is_valid_language(lang) for lang in languages] # type: ignore[func-returns-value] langs_to_keep = set(languages) script_to_keep = None if script: - [_check_is_valid_script(s) for s in script] + [_check_is_valid_script(s) for s in script] # type: ignore[func-returns-value] script_to_keep = set(script) domains_to_keep = None diff --git a/mteb/get_tasks.py b/mteb/get_tasks.py index a36495beb8..741b4c40aa 100644 --- a/mteb/get_tasks.py +++ b/mteb/get_tasks.py @@ -22,12 +22,11 @@ def _gather_tasks() -> tuple[type[AbsTask], ...]: import mteb.tasks as tasks - tasks = [ + return tuple( t for t in tasks.__dict__.values() if isinstance(t, type) and issubclass(t, AbsTask) - ] - return tuple(tasks) + ) def _create_name_to_task_mapping( @@ -194,8 +193,7 @@ def to_latex( string with a LaTeX table. """ if include_citation_in_name and "name" in properties: - properties += ["intext_citation"] - df = self.to_dataframe(properties) + df = self.to_dataframe(tuple(properties) + ("intext_citation",)) df["name"] = df["name"] + " " + df["intext_citation"] df = df.drop(columns=["intext_citation"]) else: @@ -287,7 +285,7 @@ def get_tasks( ] return MTEBTasks(_tasks) - _tasks = filter_tasks( + tasks_: list[type[AbsTask]] = filter_tasks( TASK_LIST, languages=languages, script=script, @@ -300,12 +298,12 @@ def get_tasks( exclude_aggregate=exclude_aggregate, exclude_private=exclude_private, ) - _tasks = [ - cls().filter_languages(languages, script).filter_eval_splits(eval_splits) - for cls in _tasks - ] - - return MTEBTasks(_tasks) + return MTEBTasks( + [ + cls().filter_languages(languages, script).filter_eval_splits(eval_splits) + for cls in tasks_ + ] + ) _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"} diff --git a/mteb/load_results.py b/mteb/load_results.py index 4108e0b066..572c8a1547 100644 --- a/mteb/load_results.py +++ 
b/mteb/load_results.py @@ -83,21 +83,21 @@ def load_results( if models is not None: models_to_keep = {} - for model_path in models: - if isinstance(model_path, ModelMeta): - models_to_keep[model_path.name] = model_path.revision + for model in models: + if isinstance(model, ModelMeta): + models_to_keep[model.name] = model.revision else: - models_to_keep[model_path] = None + models_to_keep[model] = None else: models_to_keep = None - task_names = {} + task_names: dict[str, AbsTask | None] = {} if tasks is not None: - for task in tasks: - if isinstance(task, AbsTask): - task_names[task.metadata.name] = task + for task_ in tasks: + if isinstance(task_, AbsTask): + task_names[task_.metadata.name] = task_ else: - task_names[task] = None + task_names[task_] = None model_results = [] for model_path in model_paths: diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 92dc63bfd1..49dd942ff8 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -83,10 +83,10 @@ def encode( f"Using instruction: '{instruction}' for task: '{task_metadata.name}'" ) embeddings = super().encode( # type: ignore[safe-super] - _inputs, + _inputs, # type: ignore[arg-type] instruction=instruction, *args, - **kwargs, # type: ignore[arg-type] + **kwargs, ) if isinstance(embeddings, torch.Tensor): # sometimes in kwargs can be return_tensors=True diff --git a/mteb/results/benchmark_results.py b/mteb/results/benchmark_results.py index e5b42f4609..4109c4e512 100644 --- a/mteb/results/benchmark_results.py +++ b/mteb/results/benchmark_results.py @@ -3,7 +3,7 @@ import warnings from collections.abc import Callable, Iterable, Iterator, Sequence from pathlib import Path -from typing import Any, Literal +from typing import Any, Literal, cast, override import pandas as pd from packaging.version import InvalidVersion, Version @@ -60,7 +60,7 @@ def _filter_tasks( task_types: list[TaskType] | None = None, # type: ignore modalities: list[Modalities] | None = 
None, is_public: bool | None = None, - ) -> Self: + ) -> "BenchmarkResults": # TODO: Same as filter_models model_results = [ res._filter_tasks( @@ -77,7 +77,7 @@ def _filter_tasks( model_results=[res for res in model_results if res.task_results] ) - def select_tasks(self, tasks: Sequence[AbsTask]) -> Self: + def select_tasks(self, tasks: Sequence[AbsTask]) -> "BenchmarkResults": """Select tasks from the benchmark results. Args: @@ -95,7 +95,7 @@ def select_models( self, names: list[str] | list[ModelMeta], revisions: list[str | None] | None = None, - ) -> Self: + ) -> "BenchmarkResults": """Get models by name and revision. Args: @@ -108,7 +108,7 @@ def select_models( models_res = [] _revisions = revisions if revisions is not None else [None] * len(names) - name_rev = {} + name_rev: dict[str, str | None] = {} if len(names) != len(_revisions): raise ValueError( @@ -117,9 +117,12 @@ def select_models( for name, revision in zip(names, _revisions): if isinstance(name, ModelMeta): + if name.name is None: + raise ValueError("ModelMeta must have a name.") name_rev[name.name] = name.revision else: - name_rev[name] = revision + name_ = cast(str, name) + name_rev[name_] = revision for model_res in self.model_results: model_name = model_res.model_name @@ -139,7 +142,7 @@ def _filter_models( n_parameters_range: tuple[int | None, int | None] = (None, None), use_instructions: bool | None = None, zero_shot_on: list[AbsTask] | None = None, - ) -> Self: + ) -> "BenchmarkResults": # mostly a utility function for the leaderboard app. # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter. # interface would then be the same as the get_models function @@ -162,7 +165,7 @@ def _filter_models( return type(self).model_construct(model_results=new_model_results) - def join_revisions(self) -> Self: + def join_revisions(self) -> "BenchmarkResults": """Join revisions of the same model. 
In case of conflicts, the following rules are applied: @@ -361,7 +364,8 @@ def to_dataframe( format=format, ) - def __iter__(self) -> Iterator[ModelResult]: + @override + def __iter__(self) -> Iterator[ModelResult]: # type: ignore[override] return iter(self.model_results) def __getitem__(self, index: int) -> ModelResult: @@ -383,7 +387,7 @@ def to_disk(self, path: Path | str) -> None: out_file.write(self.model_dump_json(indent=2)) @classmethod - def from_validated(cls, **data) -> Self: + def from_validated(cls, **data) -> "BenchmarkResults": """Create BenchmarkResults from validated data. Args: diff --git a/mteb/results/task_result.py b/mteb/results/task_result.py index 611156b5ff..c413bf8d9e 100644 --- a/mteb/results/task_result.py +++ b/mteb/results/task_result.py @@ -4,7 +4,7 @@ import logging from argparse import Namespace from collections import defaultdict -from collections.abc import Callable, Iterable +from collections.abc import Callable, Iterable, Mapping from functools import cached_property from importlib.metadata import version from pathlib import Path @@ -166,7 +166,7 @@ class TaskResult(BaseModel): def from_task_results( cls, task: AbsTask | type[AbsTask], - scores: dict[SplitName, dict[HFSubset, ScoresDict]], + scores: dict[SplitName, Mapping[HFSubset, ScoresDict]], evaluation_time: float, kg_co2_emissions: float | None = None, ) -> Self: From 84ab864d1f96446a959d18cc9cd03c0d72e0a9ce Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 12 Nov 2025 19:26:57 +0300 Subject: [PATCH 05/32] Update mteb/results/benchmark_results.py Co-authored-by: Kenneth Enevoldsen --- mteb/results/benchmark_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/results/benchmark_results.py b/mteb/results/benchmark_results.py index 4109c4e512..f882fc1b57 100644 --- a/mteb/results/benchmark_results.py +++ b/mteb/results/benchmark_results.py @@ -118,7 +118,7 @@ def select_models( for name, revision in zip(names, _revisions): if 
isinstance(name, ModelMeta): if name.name is None: - raise ValueError("ModelMeta must have a name.") + raise ValueError("name in ModelMeta is None. It must be a string.") name_rev[name.name] = name.revision else: name_ = cast(str, name) From 21ff289e373b3c2bb46eb27246329282001a3957 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 9 Dec 2025 00:22:41 +0300 Subject: [PATCH 06/32] apply comments --- mteb/evaluate.py | 12 ++++++------ mteb/models/instruct_wrapper.py | 3 ++- mteb/results/benchmark_results.py | 17 +++++++++-------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/mteb/evaluate.py b/mteb/evaluate.py index d73d8e91bd..0af40e0a93 100644 --- a/mteb/evaluate.py +++ b/mteb/evaluate.py @@ -89,28 +89,28 @@ def _sanitize_model( ) -> tuple[MTEBModels | ModelMeta, ModelMeta, ModelName, Revision]: from sentence_transformers import CrossEncoder, SentenceTransformer - wrapped: MTEBModels | ModelMeta + wrapped_model: MTEBModels | ModelMeta if isinstance(model, SentenceTransformer): wrapper = SentenceTransformerEncoderWrapper(model) meta = wrapper.mteb_model_meta - wrapped = cast(EncoderProtocol, wrapper) + wrapped_model = cast(EncoderProtocol, wrapper) elif isinstance(model, CrossEncoder): cross_encoder_wrapper = CrossEncoderWrapper(model) meta = cross_encoder_wrapper.mteb_model_meta - wrapped = cast(CrossEncoderProtocol, cross_encoder_wrapper) + wrapped_model = cast(CrossEncoderProtocol, cross_encoder_wrapper) elif hasattr(model, "mteb_model_meta"): meta = getattr(model, "mteb_model_meta") if not isinstance(meta, ModelMeta): meta = _create_empty_model_meta() - wrapped = cast(MTEBModels | ModelMeta, model) + wrapped_model = cast(MTEBModels | ModelMeta, model) else: meta = _create_empty_model_meta() if not isinstance(model, ModelMeta) else model - wrapped = meta + wrapped_model = meta model_name = cast(str, meta.name) model_revision = cast(str, meta.revision) - return wrapped, meta, model_name, 
model_revision + return wrapped_model, meta, model_name, model_revision def _evaluate_task( diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 6af30c56e8..2cbbe5c101 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -195,7 +195,8 @@ def encode( The encoded input in a numpy array or torch tensor of the shape (Number of sentences) x (Embedding dimension). """ sentences = [text for batch in inputs for text in batch["text"]] - instruction: str | None = self.get_task_instruction(task_metadata, prompt_type) + instruction: str | None + instruction = self.get_task_instruction(task_metadata, prompt_type) # to passage prompts won't be applied to passages if ( diff --git a/mteb/results/benchmark_results.py b/mteb/results/benchmark_results.py index f882fc1b57..dfd8d9ea6f 100644 --- a/mteb/results/benchmark_results.py +++ b/mteb/results/benchmark_results.py @@ -1,9 +1,11 @@ +from __future__ import annotations + import json import logging import warnings from collections.abc import Callable, Iterable, Iterator, Sequence from pathlib import Path -from typing import Any, Literal, cast, override +from typing import Any, Literal, cast import pandas as pd from packaging.version import InvalidVersion, Version @@ -60,7 +62,7 @@ def _filter_tasks( task_types: list[TaskType] | None = None, # type: ignore modalities: list[Modalities] | None = None, is_public: bool | None = None, - ) -> "BenchmarkResults": + ) -> BenchmarkResults: # TODO: Same as filter_models model_results = [ res._filter_tasks( @@ -77,7 +79,7 @@ def _filter_tasks( model_results=[res for res in model_results if res.task_results] ) - def select_tasks(self, tasks: Sequence[AbsTask]) -> "BenchmarkResults": + def select_tasks(self, tasks: Sequence[AbsTask]) -> BenchmarkResults: """Select tasks from the benchmark results. 
Args: @@ -95,7 +97,7 @@ def select_models( self, names: list[str] | list[ModelMeta], revisions: list[str | None] | None = None, - ) -> "BenchmarkResults": + ) -> BenchmarkResults: """Get models by name and revision. Args: @@ -142,7 +144,7 @@ def _filter_models( n_parameters_range: tuple[int | None, int | None] = (None, None), use_instructions: bool | None = None, zero_shot_on: list[AbsTask] | None = None, - ) -> "BenchmarkResults": + ) -> BenchmarkResults: # mostly a utility function for the leaderboard app. # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter. # interface would then be the same as the get_models function @@ -165,7 +167,7 @@ def _filter_models( return type(self).model_construct(model_results=new_model_results) - def join_revisions(self) -> "BenchmarkResults": + def join_revisions(self) -> BenchmarkResults: """Join revisions of the same model. In case of conflicts, the following rules are applied: @@ -364,7 +366,6 @@ def to_dataframe( format=format, ) - @override def __iter__(self) -> Iterator[ModelResult]: # type: ignore[override] return iter(self.model_results) @@ -387,7 +388,7 @@ def to_disk(self, path: Path | str) -> None: out_file.write(self.model_dump_json(indent=2)) @classmethod - def from_validated(cls, **data) -> "BenchmarkResults": + def from_validated(cls, **data) -> BenchmarkResults: """Create BenchmarkResults from validated data. 
Args: From f9d3035e25a8c6032acbdb4e92b8bef757640125 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 9 Dec 2025 01:16:56 +0300 Subject: [PATCH 07/32] continue typechecking --- mteb/_create_dataloaders.py | 17 ++++------- mteb/abstasks/_data_filter/filters.py | 2 +- mteb/abstasks/_data_filter/task_pipelines.py | 3 ++ mteb/abstasks/_statistics_calculation.py | 8 ++++-- mteb/abstasks/abstask.py | 8 +++--- mteb/abstasks/task_metadata.py | 4 +-- mteb/deprecated_evaluator.py | 2 +- mteb/evaluate.py | 16 +++++------ mteb/filter_tasks.py | 14 +++++----- mteb/get_tasks.py | 28 +++++++++---------- .../cache_backends/_hash_utils.py | 8 +++--- .../cache_backends/faiss_cache.py | 3 +- .../cache_backends/numpy_cache.py | 13 ++++----- mteb/models/cache_wrappers/cache_wrapper.py | 2 +- .../search_indexes/faiss_search_index.py | 8 +++--- mteb/types/statistics.py | 4 +-- pyproject.toml | 2 ++ 17 files changed, 71 insertions(+), 71 deletions(-) diff --git a/mteb/_create_dataloaders.py b/mteb/_create_dataloaders.py index 35e3180736..1492209e43 100644 --- a/mteb/_create_dataloaders.py +++ b/mteb/_create_dataloaders.py @@ -1,4 +1,5 @@ import logging +import warnings from collections.abc import Callable from typing import Any, cast @@ -113,9 +114,6 @@ def _create_text_dataloader_for_queries( ) -_warned_about_user_role = False - - def _convert_conv_history_to_query( row: dict[str, str | list[str] | Conversation], ) -> dict[str, str | Conversation]: @@ -127,8 +125,6 @@ def _convert_conv_history_to_query( Returns: The updated row with the "query" and "text" fields set to the conversation string, and the "conversation" field set to the list of ConversationTurn. 
""" - global _warned_about_user_role - conversation = row["text"] # if it's a list of strings, just join them if isinstance(conversation, list) and isinstance(conversation[0], str): @@ -137,11 +133,10 @@ def _convert_conv_history_to_query( current_conversation = [ ConversationTurn(role="user", content=message) for message in conversation_ ] - if not _warned_about_user_role: - logger.warning( - "Conversations are a list of strings. Used 'user' role for all turns." - ) - _warned_about_user_role = True + warnings.warn( + "Conversations are a list of strings. Used 'user' role for all turns.", + category=UserWarning, + ) # otherwise, it's a list of dictionaries, which we need to convert to strings elif isinstance(conversation, list) and isinstance(conversation[0], dict): conv = [] @@ -178,7 +173,7 @@ def _convert_conv_history_to_query( row["text"] = conv_str row["conversation"] = current_conversation - return row + return cast(dict[str, str | list[ConversationTurn]], row) def _create_dataloader_for_queries_conversation( diff --git a/mteb/abstasks/_data_filter/filters.py b/mteb/abstasks/_data_filter/filters.py index 23f12cd820..16ed5e8d97 100644 --- a/mteb/abstasks/_data_filter/filters.py +++ b/mteb/abstasks/_data_filter/filters.py @@ -61,7 +61,7 @@ def filter_unclear_label( for text, label in zip(ds[input_column], ds[label_column]): key = text.strip().lower() normalized.setdefault(key, set()).add( - label if isinstance(label, (str, int, float)) else tuple(label) + label if isinstance(label, (str, int, float)) else tuple(label) # type: ignore[arg-type] ) bad_texts = {t for t, labels in normalized.items() if len(labels) > 1} diff --git a/mteb/abstasks/_data_filter/task_pipelines.py b/mteb/abstasks/_data_filter/task_pipelines.py index f12f10e60e..c376edc546 100644 --- a/mteb/abstasks/_data_filter/task_pipelines.py +++ b/mteb/abstasks/_data_filter/task_pipelines.py @@ -89,6 +89,9 @@ def process_classification( subset=None, ) + if task.dataset is None: + raise 
ValueError("Task dataset is None.") + new_ds = {} for subset in task.dataset: new_ds[subset] = clean_dataset( diff --git a/mteb/abstasks/_statistics_calculation.py b/mteb/abstasks/_statistics_calculation.py index f3297e0874..9b799043e7 100644 --- a/mteb/abstasks/_statistics_calculation.py +++ b/mteb/abstasks/_statistics_calculation.py @@ -1,5 +1,6 @@ import hashlib from collections import Counter +from typing import cast from PIL import Image @@ -84,10 +85,11 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics total_labels = labels elif isinstance(labels[0], list): # multilabel classification - label_len = [len(l) for l in labels] + multilabel_labels = cast(list[list[int]], labels) + label_len = [len(l) for l in multilabel_labels] total_label_len = sum(label_len) - total_labels = [] - for l in labels: + total_labels: list[int | None] = [] + for l in multilabel_labels: total_labels.extend(l if len(l) > 0 else [None]) else: raise ValueError( diff --git a/mteb/abstasks/abstask.py b/mteb/abstasks/abstask.py index 1d82f596b6..90e5c32980 100644 --- a/mteb/abstasks/abstask.py +++ b/mteb/abstasks/abstask.py @@ -443,7 +443,7 @@ def languages(self) -> list[str]: return self.metadata.languages - def filter_eval_splits(self, eval_splits: list[str] | None) -> Self: + def filter_eval_splits(self, eval_splits: Sequence[str] | None) -> Self: """Filter the evaluation splits of the task. Args: @@ -457,9 +457,9 @@ def filter_eval_splits(self, eval_splits: list[str] | None) -> Self: def filter_languages( self, - languages: list[str] | None, - script: list[str] | None = None, - hf_subsets: list[HFSubset] | None = None, + languages: Sequence[str] | None, + script: Sequence[str] | None = None, + hf_subsets: Sequence[HFSubset] | None = None, exclusive_language_filter: bool = False, ) -> Self: """Filter the languages of the task. 
diff --git a/mteb/abstasks/task_metadata.py b/mteb/abstasks/task_metadata.py index c6ac1b00b6..bb592ec82d 100644 --- a/mteb/abstasks/task_metadata.py +++ b/mteb/abstasks/task_metadata.py @@ -192,10 +192,10 @@ """The type of the annotators. Is often important for understanding the quality of a dataset.""" -PromptDict = TypedDict( +PromptDict = TypedDict( # type: ignore[misc] "PromptDict", {prompt_type.value: str for prompt_type in PromptType}, - total=False, # type: ignore[misc] + total=False, ) """A dictionary containing the prompt used for the task. diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index d2742844fb..843db3e534 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -12,13 +12,13 @@ from time import time from typing import TYPE_CHECKING, Any, cast -from mteb import CrossEncoderProtocol from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.abstasks.task_metadata import TaskCategory, TaskType from mteb.models.get_model_meta import ( _model_meta_from_cross_encoder, _model_meta_from_sentence_transformers, ) +from mteb.models.models_protocols import CrossEncoderProtocol if sys.version_info >= (3, 13): from warnings import deprecated diff --git a/mteb/evaluate.py b/mteb/evaluate.py index 0af40e0a93..225963c88a 100644 --- a/mteb/evaluate.py +++ b/mteb/evaluate.py @@ -10,11 +10,11 @@ from datasets.exceptions import DatasetNotFoundError from tqdm.auto import tqdm -from mteb import Benchmark from mteb._helpful_enum import HelpfulStrEnum from mteb.abstasks import AbsTaskRetrieval from mteb.abstasks.abstask import AbsTask from mteb.abstasks.aggregated_task import AbsTaskAggregate +from mteb.benchmarks.benchmark import Benchmark from mteb.cache import ResultCache from mteb.models.model_meta import ModelMeta from mteb.models.models_protocols import ( @@ -156,7 +156,8 @@ def _evaluate_task( prediction_folder=prediction_folder, public_only=public_only, ) - result.kg_co2_emissions = 
tracker.final_emissions + if isinstance(result, TaskResult): + result.kg_co2_emissions = tracker.final_emissions return result task_results = {} @@ -213,7 +214,7 @@ def _evaluate_task( def _check_model_modalities( model: ModelMeta, - tasks: AbsTask | Benchmark | Iterable[AbsTask | Benchmark], + tasks: AbsTask | Benchmark | Iterable[AbsTask], ) -> None: """Check that model modalities are compatible with task modalities. @@ -230,9 +231,6 @@ def _check_model_modalities( check_tasks: Iterable[AbsTask] = [] if isinstance(tasks, AbsTask): check_tasks = [tasks] - elif isinstance(tasks, list) and all(isinstance(task, Benchmark) for task in tasks): - benchmarks = cast(Iterable[Benchmark], tasks) - check_tasks = [task for benchmark in benchmarks for task in benchmark.tasks] elif isinstance(tasks, Benchmark): benchmark = cast(Benchmark, tasks) check_tasks = benchmark.tasks @@ -300,7 +298,7 @@ def _requires_merge(task: AbsTask, existing_results: TaskResult) -> bool: def evaluate( model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder, - tasks: AbsTask | Benchmark | Iterable[AbsTask | Benchmark], + tasks: AbsTask | Benchmark | Iterable[AbsTask], *, co2_tracker: bool | None = None, raise_error: bool = True, @@ -387,11 +385,11 @@ def evaluate( show_progress_bar=show_progress_bar, public_only=public_only, ) - result = aggregated_task.combine_task_results(results.task_results) + combined_results = aggregated_task.combine_task_results(results.task_results) return ModelResult( model_name=results.model_name, model_revision=results.model_revision, - task_results=[result], + task_results=[combined_results], ) if isinstance(tasks, AbsTask): diff --git a/mteb/filter_tasks.py b/mteb/filter_tasks.py index 4a051a7f69..151503f545 100644 --- a/mteb/filter_tasks.py +++ b/mteb/filter_tasks.py @@ -38,17 +38,17 @@ def _check_is_valid_language(lang: str) -> None: def filter_tasks( tasks: Sequence[T], *, - languages: list[str] | None = None, - script: list[str] | None = None, - domains: 
list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, - categories: list[TaskCategory] | None = None, - modalities: list[Modalities] | None = None, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + domains: Sequence[TaskDomain] | None = None, + task_types: Sequence[TaskType] | None = None, + categories: Sequence[TaskCategory] | None = None, + modalities: Sequence[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_superseded: bool = False, exclude_aggregate: bool = False, exclude_private: bool = False, -) -> list[T]: +) -> Sequence[T]: """Filter tasks based on the specified criteria. Args: diff --git a/mteb/get_tasks.py b/mteb/get_tasks.py index 741b4c40aa..614506d0d3 100644 --- a/mteb/get_tasks.py +++ b/mteb/get_tasks.py @@ -194,7 +194,7 @@ def to_latex( """ if include_citation_in_name and "name" in properties: df = self.to_dataframe(tuple(properties) + ("intext_citation",)) - df["name"] = df["name"] + " " + df["intext_citation"] + df["name"] = df["name"] + " " + df["intext_citation"] # type: ignore[operator] df = df.drop(columns=["intext_citation"]) else: df = self.to_dataframe(properties) @@ -219,17 +219,17 @@ def to_latex( def get_tasks( - tasks: list[str] | None = None, + tasks: Sequence[str] | None = None, *, - languages: list[str] | None = None, - script: list[str] | None = None, - domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore - categories: list[TaskCategory] | None = None, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + domains: Sequence[TaskDomain] | None = None, + task_types: Sequence[TaskType] | None = None, + categories: Sequence[TaskCategory] | None = None, exclude_superseded: bool = True, - eval_splits: list[str] | None = None, + eval_splits: Sequence[str] | None = None, exclusive_language_filter: bool = False, - modalities: list[Modalities] | None = None, + modalities: 
Sequence[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_aggregate: bool = False, exclude_private: bool = True, @@ -285,7 +285,7 @@ def get_tasks( ] return MTEBTasks(_tasks) - tasks_: list[type[AbsTask]] = filter_tasks( + tasks_: Sequence[type[AbsTask]] = filter_tasks( TASK_LIST, languages=languages, script=script, @@ -311,10 +311,10 @@ def get_tasks( def get_task( task_name: str, - languages: list[str] | None = None, - script: list[str] | None = None, - eval_splits: list[str] | None = None, - hf_subsets: list[str] | None = None, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + eval_splits: Sequence[str] | None = None, + hf_subsets: Sequence[str] | None = None, exclusive_language_filter: bool = False, ) -> AbsTask: """Get a task by name. diff --git a/mteb/models/cache_wrappers/cache_backends/_hash_utils.py b/mteb/models/cache_wrappers/cache_backends/_hash_utils.py index f05be4ab7d..d29d894cd4 100644 --- a/mteb/models/cache_wrappers/cache_backends/_hash_utils.py +++ b/mteb/models/cache_wrappers/cache_backends/_hash_utils.py @@ -1,14 +1,14 @@ import hashlib +from typing import Any from PIL import Image -from mteb.types import BatchedInput - -def _hash_item(item: BatchedInput) -> str: +def _hash_item(item: dict[str, Any]) -> str: item_hash = "" if "text" in item: - item_hash = hashlib.sha256(item["text"].encode()).hexdigest() + item_text: str = item["text"] + item_hash = hashlib.sha256(item_text.encode()).hexdigest() if "image" in item: image: Image.Image = item["image"] diff --git a/mteb/models/cache_wrappers/cache_backends/faiss_cache.py b/mteb/models/cache_wrappers/cache_backends/faiss_cache.py index 17bdbdc149..db8bc2a775 100644 --- a/mteb/models/cache_wrappers/cache_backends/faiss_cache.py +++ b/mteb/models/cache_wrappers/cache_backends/faiss_cache.py @@ -1,6 +1,7 @@ import json import logging from pathlib import Path +from typing import Any import numpy as np @@ -36,7 +37,7 @@ def __init__(self, 
directory: str | Path): logger.info(f"Initialized FAISS VectorCacheMap in {self.directory}") self.load() - def add(self, items: list[BatchedInput], vectors: np.ndarray) -> None: + def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: """Add vector to FAISS index.""" import faiss diff --git a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py index 76265d5a1b..14289ec987 100644 --- a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py +++ b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py @@ -1,11 +1,10 @@ import json import logging from pathlib import Path +from typing import Any import numpy as np -from mteb.types import BatchedInput - from ._hash_utils import _hash_item logger = logging.getLogger(__name__) @@ -14,7 +13,7 @@ class NumpyCache: """Generic vector cache for both text and images.""" - def __init__(self, directory: str | Path, initial_vectors: int = 100000): + def __init__(self, directory: str | Path, initial_vectors: int = 100_000): self.directory = Path(directory) self.directory.mkdir(parents=True, exist_ok=True) self.vectors_file = self.directory / "vectors.npy" @@ -27,7 +26,7 @@ def __init__(self, directory: str | Path, initial_vectors: int = 100000): logger.info(f"Initialized VectorCacheMap in directory: {self.directory}") self._initialize_vectors_file() - def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None: + def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: """Add a vector to the cache.""" try: if self.vector_dim is None: @@ -38,7 +37,7 @@ def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None: self._save_dimension() logger.info(f"Initialized vector dimension to {self.vector_dim}") - for item, vec in zip(item, vectors): + for item, vec in zip(items, vectors): item_hash = _hash_item(item) if item_hash in self.hash_to_index: logger.warning( @@ -163,7 +162,7 @@ def load(self) -> None: 
logger.error(f"Error loading VectorCacheMap: {str(e)}") raise - def get_vector(self, item: BatchedInput) -> np.ndarray | None: + def get_vector(self, item: dict[str, Any]) -> np.ndarray | None: """Retrieve vector from index by hash.""" try: item_hash = _hash_item(item) @@ -176,7 +175,7 @@ def get_vector(self, item: BatchedInput) -> np.ndarray | None: logger.error(f"Error retrieving vector for item: {str(e)}") raise - def __contains__(self, item: BatchedInput) -> bool: + def __contains__(self, item: dict[str, Any]) -> bool: return _hash_item(item) in self.hash_to_index def __del__(self): diff --git a/mteb/models/cache_wrappers/cache_wrapper.py b/mteb/models/cache_wrappers/cache_wrapper.py index b895b3a8b0..af80c24646 100644 --- a/mteb/models/cache_wrappers/cache_wrapper.py +++ b/mteb/models/cache_wrappers/cache_wrapper.py @@ -90,7 +90,7 @@ def encode( try: cache = self._get_or_create_cache(task_name) - uncached_items: list[BatchedInput] = [] + uncached_items: list[dict[str, Any]] = [] uncached_indices: list[int] = [] all_items = inputs.dataset cached_vectors: dict[int, np.ndarray] = {} diff --git a/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py b/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py index e254ca7087..3cc0fa5024 100644 --- a/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +++ b/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py @@ -116,8 +116,8 @@ def _reranking( self, embeddings: Array, top_k: int, - top_ranked: TopRankedDocumentsType | None = None, - query_idx_to_id: dict[int, str] | None = None, + top_ranked: TopRankedDocumentsType, + query_idx_to_id: dict[int, str], ) -> tuple[list[list[float]], list[list[int]]]: doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)} scores_all: list[list[float]] = [] @@ -133,9 +133,9 @@ def _reranking( continue candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids] - d = self.index.d + d = self.index.d # 
type: ignore[union-attr] candidate_embs = np.vstack( - [self.index.reconstruct(idx) for idx in candidate_indices] + [self.index.reconstruct(idx) for idx in candidate_indices] # type: ignore[union-attr] ) sub_reranking_index = self.index_type(d) sub_reranking_index.add(candidate_embs) diff --git a/mteb/types/statistics.py b/mteb/types/statistics.py index 6be1e50ae9..7714807e0e 100644 --- a/mteb/types/statistics.py +++ b/mteb/types/statistics.py @@ -88,9 +88,9 @@ class ScoreStatistics(TypedDict): max_score: Maximum score """ - min_score: int + min_score: int | float avg_score: float - max_score: int + max_score: int | float class TopRankedStatistics(TypedDict): diff --git a/pyproject.toml b/pyproject.toml index ba72edd0ce..bc9929f5da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -322,8 +322,10 @@ plugins = ['pydantic.mypy'] # these modules not typed and don't have stubs module = [ "datasets", + "datasets.exceptions", "sklearn", "sklearn.*", + "faiss", ] ignore_missing_imports = true From c9dedc7311f1a3b278c85d9373eaed3f3047057e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 15 Dec 2025 23:50:13 +0300 Subject: [PATCH 08/32] fix typehint --- mteb/_evaluators/evaluator.py | 4 ++- mteb/abstasks/_statistics_calculation.py | 16 ++++++---- mteb/abstasks/abstask.py | 4 +-- mteb/abstasks/multilabel_classification.py | 2 +- mteb/abstasks/sts.py | 2 +- mteb/abstasks/text/summarization.py | 4 ++- mteb/models/abs_encoder.py | 4 +-- .../cache_wrappers/cache_backend_protocol.py | 8 ++--- .../cache_backends/_hash_utils.py | 3 +- .../cache_backends/numpy_cache.py | 30 +++++++++++++++---- mteb/models/cache_wrappers/cache_wrapper.py | 2 +- mteb/models/instruct_wrapper.py | 6 ++-- mteb/models/model_meta.py | 6 ++-- 13 files changed, 59 insertions(+), 32 deletions(-) diff --git a/mteb/_evaluators/evaluator.py b/mteb/_evaluators/evaluator.py index 9800fcf819..07df3068f7 100644 --- a/mteb/_evaluators/evaluator.py +++ 
b/mteb/_evaluators/evaluator.py @@ -1,8 +1,10 @@ from abc import ABC, abstractmethod +from collections.abc import Mapping from typing import Any from mteb.abstasks.abstask import _set_seed from mteb.models import EncoderProtocol +from mteb.types import Array class Evaluator(ABC): @@ -18,7 +20,7 @@ def __init__(self, seed: int = 42, **kwargs: Any) -> None: @abstractmethod def __call__( self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any] - ) -> dict[str, float]: + ) -> Mapping[str, float] | Array: """This is called during training to evaluate the model. It returns scores. diff --git a/mteb/abstasks/_statistics_calculation.py b/mteb/abstasks/_statistics_calculation.py index 30fa26ec52..0aab65abf1 100644 --- a/mteb/abstasks/_statistics_calculation.py +++ b/mteb/abstasks/_statistics_calculation.py @@ -82,18 +82,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics LabelStatistics: A dictionary containing the descriptive statistics. """ + total_labels: list[int | None] = [] + if not isinstance(labels[0], list): - label_len = [1] * len(labels) - total_label_len = len(labels) - total_labels = labels + # single label classification + single_label = cast(list[int], labels) + label_len = [1] * len(single_label) + total_label_len = len(single_label) + total_labels.extend(single_label) elif isinstance(labels[0], list): # multilabel classification multilabel_labels = cast(list[list[int]], labels) label_len = [len(l) for l in multilabel_labels] total_label_len = sum(label_len) - total_labels: list[int | None] = [] for l in multilabel_labels: - total_labels.extend(l if len(l) > 0 else [None]) + if l and len(l) > 0: + total_labels.extend(l) + else: + total_labels.append(None) else: raise ValueError( "Labels must be a list of integers or a list of lists of integers." 
diff --git a/mteb/abstasks/abstask.py b/mteb/abstasks/abstask.py index 90e5c32980..22b0ab7611 100644 --- a/mteb/abstasks/abstask.py +++ b/mteb/abstasks/abstask.py @@ -84,7 +84,7 @@ class AbsTask(ABC): """ metadata: TaskMetadata - abstask_prompt: str | None = None + abstask_prompt: str _eval_splits: list[str] | None = None dataset: dict[HFSubset, DatasetDict] | None = None data_loaded: bool = False @@ -216,7 +216,7 @@ def _evaluate_subset( def _save_task_predictions( self, - predictions: dict[str, Any] | list[Any], + predictions: Mapping[str, Any] | list[Any], model: MTEBModels, prediction_folder: Path, hf_split: str, diff --git a/mteb/abstasks/multilabel_classification.py b/mteb/abstasks/multilabel_classification.py index 66e494b697..b6bda3179f 100644 --- a/mteb/abstasks/multilabel_classification.py +++ b/mteb/abstasks/multilabel_classification.py @@ -235,7 +235,7 @@ def _undersample_data_indices( idxs = np.arange(len(y)) self.np_rng.shuffle(idxs) idxs = idxs.tolist() - label_counter = defaultdict(int) + label_counter: dict[int, int] = defaultdict(int) for i in idxs: if any((label_counter[label] < samples_per_label) for label in y[i]): sample_indices.append(i) diff --git a/mteb/abstasks/sts.py b/mteb/abstasks/sts.py index 16432d0b50..07b8ac8d5c 100644 --- a/mteb/abstasks/sts.py +++ b/mteb/abstasks/sts.py @@ -142,7 +142,7 @@ def _calculate_scores( ) -> STSMetrics: def compute_corr(x: list[float], y: list[float]) -> tuple[float, float]: """Return (pearson, spearman) correlations between x and y.""" - return pearsonr(x, y)[0], spearmanr(x, y)[0] + return float(pearsonr(x, y)[0]), float(spearmanr(x, y)[0]) cosine_pearson, cosine_spearman = compute_corr( normalized_scores, scores["cosine_scores"] diff --git a/mteb/abstasks/text/summarization.py b/mteb/abstasks/text/summarization.py index 4f53884824..3591feb9b0 100644 --- a/mteb/abstasks/text/summarization.py +++ b/mteb/abstasks/text/summarization.py @@ -87,7 +87,9 @@ def _evaluate_subset( **kwargs, ) -> 
SummarizationMetrics: normalized_scores = [ - (np.array(x) - self.min_score) / (self.max_score - self.min_score) + ( + (np.array(x) - self.min_score) / (self.max_score - self.min_score) + ).tolist() for x in data_split[self.relevancy_column_name] ] evaluator = self.evaluator( diff --git a/mteb/models/abs_encoder.py b/mteb/models/abs_encoder.py index 485e4790f7..70e291907d 100644 --- a/mteb/models/abs_encoder.py +++ b/mteb/models/abs_encoder.py @@ -43,7 +43,7 @@ class AbsEncoder(ABC): model: Any mteb_model_meta: ModelMeta | None = None model_prompts: dict[str, str] | None = None - instruction_template: str | Callable[[str, PromptType], str] | None = None + instruction_template: str | Callable[[str, PromptType | None], str] | None = None prompts_dict: dict[str, str] | None = None def get_prompt_name( @@ -110,7 +110,7 @@ def get_prompt( if not self.model_prompts: return None prompt_name = self.get_prompt_name(task_metadata, prompt_type) - return self.model_prompts.get(prompt_name) + return self.model_prompts.get(prompt_name) if prompt_name else None @staticmethod @overload diff --git a/mteb/models/cache_wrappers/cache_backend_protocol.py b/mteb/models/cache_wrappers/cache_backend_protocol.py index 581ff5c66d..b194b044d8 100644 --- a/mteb/models/cache_wrappers/cache_backend_protocol.py +++ b/mteb/models/cache_wrappers/cache_backend_protocol.py @@ -5,8 +5,6 @@ import numpy as np -from mteb.types import BatchedInput - @runtime_checkable class CacheBackendProtocol(Protocol): @@ -26,7 +24,7 @@ def __init__(self, directory: Path | None = None, **kwargs: Any) -> None: **kwargs: Additional backend-specific arguments. """ - def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None: + def add(self, item: list[dict[str, Any]], vectors: np.ndarray) -> None: """Add a vector to the cache. Args: @@ -34,7 +32,7 @@ def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None: vectors: Embedding vector of shape (dim,) or (1, dim). 
""" - def get_vector(self, item: BatchedInput) -> np.ndarray | None: + def get_vector(self, item: dict[str, Any]) -> np.ndarray | None: """Retrieve the cached vector for the given item. Args: @@ -53,5 +51,5 @@ def load(self) -> None: def close(self) -> None: """Release resources or flush data.""" - def __contains__(self, item: BatchedInput) -> bool: + def __contains__(self, item: dict[str, Any]) -> bool: """Check whether the cache contains an item.""" diff --git a/mteb/models/cache_wrappers/cache_backends/_hash_utils.py b/mteb/models/cache_wrappers/cache_backends/_hash_utils.py index ff39686f47..f86cfb5702 100644 --- a/mteb/models/cache_wrappers/cache_backends/_hash_utils.py +++ b/mteb/models/cache_wrappers/cache_backends/_hash_utils.py @@ -1,8 +1,9 @@ import hashlib +from collections.abc import Mapping from typing import Any -def _hash_item(item: dict[str, Any]) -> str: +def _hash_item(item: Mapping[str, Any]) -> str: item_hash = "" if "text" in item: item_text: str = item["text"] diff --git a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py index 14289ec987..005f68bb99 100644 --- a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py +++ b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py @@ -28,6 +28,10 @@ def __init__(self, directory: str | Path, initial_vectors: int = 100_000): def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: """Add a vector to the cache.""" + if self.vectors is None: + raise RuntimeError( + "Vectors file not initialized. Call _initialize_vectors_file() first." 
+ ) try: if self.vector_dim is None: self.vector_dim = ( @@ -73,18 +77,26 @@ def _initialize_vectors_file(self) -> None: shape=(self.initial_vectors, self.vector_dim), ) else: - self.vectors = np.memmap(self.vectors_file, dtype="float32", mode="r+") - self.vectors = self.vectors.reshape(-1, self.vector_dim) + self.vectors = np.memmap( + self.vectors_file, + dtype="float32", + mode="r+", + shape=(-1, self.vector_dim), + ) logger.info(f"Vectors file initialized with shape: {self.vectors.shape}") def _double_vectors_file(self) -> None: + if self.vectors is None or self.vector_dim is None: + raise RuntimeError( + "Vectors file not initialized. Call _initialize_vectors_file() first." + ) current_size = len(self.vectors) new_size = current_size * 2 logger.info(f"Doubling vectors file from {current_size} to {new_size} vectors") self.vectors.flush() new_vectors = np.memmap( - self.vectors_file, - dtype="float32", + str(self.vectors_file), + dtype=np.float32, mode="r+", shape=(new_size, self.vector_dim), ) @@ -145,9 +157,11 @@ def load(self) -> None: if self.vector_dim is not None: self.vectors = np.memmap( - self.vectors_file, dtype="float32", mode="r+" + self.vectors_file, + dtype="float32", + mode="r+", + shape=(-1, self.vector_dim), ) - self.vectors = self.vectors.reshape(-1, self.vector_dim) logger.info(f"Loaded vectors file with shape: {self.vectors.shape}") else: logger.warning( @@ -164,6 +178,10 @@ def load(self) -> None: def get_vector(self, item: dict[str, Any]) -> np.ndarray | None: """Retrieve vector from index by hash.""" + if self.vectors is None: + raise RuntimeError( + "Vectors file not initialized. Call _initialize_vectors_file() first." 
+ ) try: item_hash = _hash_item(item) if item_hash not in self.hash_to_index: diff --git a/mteb/models/cache_wrappers/cache_wrapper.py b/mteb/models/cache_wrappers/cache_wrapper.py index af80c24646..4807385074 100644 --- a/mteb/models/cache_wrappers/cache_wrapper.py +++ b/mteb/models/cache_wrappers/cache_wrapper.py @@ -92,7 +92,7 @@ def encode( uncached_items: list[dict[str, Any]] = [] uncached_indices: list[int] = [] - all_items = inputs.dataset + all_items: Dataset = inputs.dataset cached_vectors: dict[int, np.ndarray] = {} for i, item in enumerate(all_items): diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 2cbbe5c101..de28ab2b16 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -17,7 +17,7 @@ def instruct_wrapper( model_name_or_path: str, mode: str, - instruction_template: str | Callable[[str, PromptType], str] | None = None, + instruction_template: str | Callable[[str, PromptType | None], str] | None = None, **kwargs, ): """Instruct wrapper for models. Uses GritLM to pass instructions to the model. 
@@ -40,7 +40,9 @@ def __init__( self, model_name_or_path: str, mode: str, - instruction_template: str | Callable[[str, PromptType], str] | None = None, + instruction_template: str + | Callable[[str, PromptType | None], str] + | None = None, **kwargs, ): if ( diff --git a/mteb/models/model_meta.py b/mteb/models/model_meta.py index 74d8cb7751..26b1fdbbc1 100644 --- a/mteb/models/model_meta.py +++ b/mteb/models/model_meta.py @@ -199,9 +199,7 @@ def load_model(self, **kwargs: Any) -> MTEBModels: _kwargs = self.loader_kwargs.copy() _kwargs.update(kwargs) - model: EncoderProtocol = self.loader( - self.name, revision=self.revision, **_kwargs - ) + model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs) model.mteb_model_meta = self # type: ignore return model @@ -260,7 +258,7 @@ def get_training_datasets(self) -> set[str] | None: logger.warning(f"Could not get source model: {e} in MTEB") return_dataset = training_datasets.copy() - visited = set() + visited: set[str] = set() for dataset in training_datasets: similar_tasks = _collect_similar_tasks(dataset, visited) From fed4ecc441b90b053a908f6a50c9e60bf245e429 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Dec 2025 01:48:25 +0300 Subject: [PATCH 09/32] typechecking --- mteb/_evaluators/any_sts_evaluator.py | 5 +--- mteb/_evaluators/evaluator.py | 5 ++-- .../imagetext_pairclassification_evaluator.py | 11 ++++---- mteb/_evaluators/sklearn_evaluator.py | 12 ++++----- mteb/abstasks/_statistics_calculation.py | 3 ++- mteb/abstasks/abstask.py | 12 +++------ mteb/abstasks/classification.py | 9 ++++--- mteb/abstasks/clustering_legacy.py | 6 ++--- .../image/image_text_pair_classification.py | 12 ++++----- mteb/abstasks/multilabel_classification.py | 27 ++++++++++--------- mteb/abstasks/pair_classification.py | 16 ++++++----- mteb/abstasks/regression.py | 6 ++--- mteb/abstasks/retrieval.py | 21 ++++++++------- mteb/abstasks/text/bitext_mining.py | 22 
+++++++-------- mteb/deprecated_evaluator.py | 6 ++--- mteb/models/model_meta.py | 15 ++++++----- mteb/types/_result.py | 3 ++- 17 files changed, 96 insertions(+), 95 deletions(-) diff --git a/mteb/_evaluators/any_sts_evaluator.py b/mteb/_evaluators/any_sts_evaluator.py index 0e58bb2814..10106be9fd 100644 --- a/mteb/_evaluators/any_sts_evaluator.py +++ b/mteb/_evaluators/any_sts_evaluator.py @@ -57,10 +57,7 @@ def __init__( self.input2_prompt_type = input2_prompt_type def __call__( - self, - model: EncoderProtocol, - *, - encode_kwargs: dict[str, Any], + self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any] ) -> STSEvaluatorScores: logger.info("Running semantic similarity - Encoding samples (1/2)") embeddings1 = model.encode( diff --git a/mteb/_evaluators/evaluator.py b/mteb/_evaluators/evaluator.py index 07df3068f7..0bd40b397f 100644 --- a/mteb/_evaluators/evaluator.py +++ b/mteb/_evaluators/evaluator.py @@ -1,10 +1,9 @@ from abc import ABC, abstractmethod -from collections.abc import Mapping +from collections.abc import Iterable, Mapping from typing import Any from mteb.abstasks.abstask import _set_seed from mteb.models import EncoderProtocol -from mteb.types import Array class Evaluator(ABC): @@ -20,7 +19,7 @@ def __init__(self, seed: int = 42, **kwargs: Any) -> None: @abstractmethod def __call__( self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any] - ) -> Mapping[str, float] | Array: + ) -> Mapping[str, float] | Iterable[Any]: """This is called during training to evaluate the model. It returns scores. 
diff --git a/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py b/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py index 55b68c7ca9..21a37ac422 100644 --- a/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +++ b/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py @@ -16,6 +16,7 @@ from mteb._requires_package import requires_image_dependencies from mteb.abstasks.task_metadata import TaskMetadata from mteb.models.models_protocols import EncoderProtocol +from mteb.types import Array if TYPE_CHECKING: from PIL.Image import Image @@ -83,11 +84,9 @@ def __init__( self.hf_split = hf_split self.hf_subset = hf_subset - def __call__( - self, - model: EncoderProtocol, - encode_kwargs: dict[str, Any], - ) -> list[torch.Tensor]: + def __call__( # type: ignore[override] + self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any] + ) -> Array: images = [] if isinstance(self.images_column_names, str): images = self.dataset[self.images_column_names] @@ -151,4 +150,4 @@ def __call__( img_emb @ txt_emb.t() ) # shape = (num_images_per_sample x num_texts_per_sample) all_scores.append(scores) - return all_scores + return torch.tensor(all_scores) diff --git a/mteb/_evaluators/sklearn_evaluator.py b/mteb/_evaluators/sklearn_evaluator.py index 1e043dc770..d0c2e71749 100644 --- a/mteb/_evaluators/sklearn_evaluator.py +++ b/mteb/_evaluators/sklearn_evaluator.py @@ -4,12 +4,12 @@ import numpy as np from datasets import Dataset from torch.utils.data import DataLoader -from typing_extensions import Self +from typing_extensions import Self, Unpack from mteb._create_dataloaders import create_dataloader from mteb.abstasks.task_metadata import TaskMetadata from mteb.models import EncoderProtocol -from mteb.types import BatchedInput +from mteb.types import Array, BatchedInput from .evaluator import Evaluator @@ -17,11 +17,11 @@ class SklearnModelProtocol(Protocol): - def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ... 
# noqa: N803 - def predict(self, X: np.ndarray) -> np.ndarray: ... # noqa: N803 + def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ... # noqa: N803 + def predict(self, X: Array) -> np.ndarray: ... # noqa: N803 def get_params(self) -> dict[str, Any]: ... - def set_params(self, **kwargs: dict[str, Any]) -> Self: ... - def score(self, X: np.ndarray, y: np.ndarray | list[int]) -> float: ... # noqa: N803 + def set_params(self, **kwargs: Unpack[dict[str, Any]]) -> Self: ... + def score(self, X: Array, y: np.ndarray | list[int]) -> float: ... # noqa: N803 class SklearnEvaluator(Evaluator): diff --git a/mteb/abstasks/_statistics_calculation.py b/mteb/abstasks/_statistics_calculation.py index 0aab65abf1..eb6960059e 100644 --- a/mteb/abstasks/_statistics_calculation.py +++ b/mteb/abstasks/_statistics_calculation.py @@ -2,6 +2,7 @@ import hashlib from collections import Counter +from collections.abc import Mapping from typing import TYPE_CHECKING, cast from mteb.types import TopRankedDocumentsType @@ -166,7 +167,7 @@ def calculate_top_ranked_statistics( def calculate_relevant_docs_statistics( - relevant_docs: dict[str, dict[str, float]], + relevant_docs: Mapping[str, Mapping[str, int]], ) -> RelevantDocsStatistics: qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs] unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]}) diff --git a/mteb/abstasks/abstask.py b/mteb/abstasks/abstask.py index 22b0ab7611..5b4d63f1d5 100644 --- a/mteb/abstasks/abstask.py +++ b/mteb/abstasks/abstask.py @@ -4,7 +4,7 @@ from collections.abc import Mapping, Sequence from copy import copy from pathlib import Path -from typing import Any, TypedDict, cast +from typing import Any, cast import numpy as np from datasets import ClassLabel, Dataset, DatasetDict, load_dataset @@ -60,12 +60,6 @@ def _multilabel_subsampling( return dataset_dict -class AbsMetrics(TypedDict): - """The abstract class for the metrics returned by the tasks""" - - ... 
- - class AbsTask(ABC): """The abstract class for the tasks @@ -209,7 +203,7 @@ def _evaluate_subset( encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, **kwargs: Any, - ) -> AbsMetrics: + ) -> ScoresDict: raise NotImplementedError( "If you are using the default evaluate method, you must implement _evaluate_subset method." ) @@ -505,7 +499,7 @@ def filter_languages( self.hf_subsets = subsets_to_keep return self - def _add_main_score(self, scores: dict[HFSubset, ScoresDict | AbsMetrics]) -> None: + def _add_main_score(self, scores: ScoresDict) -> None: scores["main_score"] = scores[self.metadata.main_score] def _upload_dataset_to_hub( diff --git a/mteb/abstasks/classification.py b/mteb/abstasks/classification.py index fe26d2623c..b79364158a 100644 --- a/mteb/abstasks/classification.py +++ b/mteb/abstasks/classification.py @@ -143,6 +143,9 @@ def evaluate( if not self.data_loaded: self.load_data() + if self.dataset is None: + raise RuntimeError("Dataset not loaded.") + if "random_state" in self.evaluator_model.get_params(): self.evaluator_model = self.evaluator_model.set_params( random_state=self.seed @@ -175,7 +178,7 @@ def evaluate( ) self._add_main_score(scores[hf_subset]) - return scores + return scores # type: ignore[return-value] def _evaluate_subset( self, @@ -237,7 +240,7 @@ def _evaluate_subset( # ap will be none for non binary classification tasks k: ( float(np.mean(values)) - if (values := [s[k] for s in scores if s[k] is not None]) + if (values := [s[k] for s in scores if s[k] is not None]) # type: ignore[literal-required] else np.nan ) for k in scores[0].keys() @@ -245,7 +248,7 @@ def _evaluate_subset( logger.info(f"Running {self.metadata.name} - Finished.") return FullClassificationMetrics( scores_per_experiment=scores, - **avg_scores, + **avg_scores, # type: ignore[typeddict-item] ) def _calculate_scores( diff --git a/mteb/abstasks/clustering_legacy.py b/mteb/abstasks/clustering_legacy.py index eaf78c09f5..cd571416bd 100644 --- 
a/mteb/abstasks/clustering_legacy.py +++ b/mteb/abstasks/clustering_legacy.py @@ -148,10 +148,10 @@ def _evaluate_subset( hf_subset=hf_subset, **kwargs, ) - clusters = evaluator(model, encode_kwargs=encode_kwargs) + evaluate_clusters = evaluator(model, encode_kwargs=encode_kwargs) if prediction_folder: self._save_task_predictions( - clusters, + evaluate_clusters, model, prediction_folder, hf_subset=hf_subset, @@ -160,7 +160,7 @@ def _evaluate_subset( return self._compute_metrics( data_split[self.label_column_name], - clusters, + evaluate_clusters, ) def _compute_metrics( diff --git a/mteb/abstasks/image/image_text_pair_classification.py b/mteb/abstasks/image/image_text_pair_classification.py index 0cae5dde64..079aeb7fcc 100644 --- a/mteb/abstasks/image/image_text_pair_classification.py +++ b/mteb/abstasks/image/image_text_pair_classification.py @@ -1,7 +1,7 @@ import logging from collections.abc import Sequence from pathlib import Path -from typing import Any +from typing import Any, TypedDict import torch from datasets import Dataset, concatenate_datasets @@ -11,7 +11,7 @@ calculate_image_statistics, calculate_text_statistics, ) -from mteb.abstasks.abstask import AbsMetrics, AbsTask +from mteb.abstasks.abstask import AbsTask from mteb.models.models_protocols import EncoderProtocol from mteb.types.statistics import ( ImageStatistics, @@ -36,7 +36,7 @@ class ImageTextPairClassificationDescriptiveStatistics(SplitDescriptiveStatistic image_statistics: ImageStatistics -class ImageTextPairClassificationMetrics(AbsMetrics): +class ImageTextPairClassificationMetrics(TypedDict): """ImageTextPairClassification metrics. 
Attributes: @@ -154,10 +154,10 @@ def _evaluate_subset( hf_subset=hf_subset, **kwargs, ) - scores = evaluator(model, encode_kwargs=encode_kwargs) + scores: torch.Tensor = evaluator(model, encode_kwargs=encode_kwargs) # type: ignore[assignment] if prediction_folder: self._save_task_predictions( - [score.tolist() for score in scores], + scores.tolist(), model, prediction_folder, hf_subset=hf_subset, @@ -172,7 +172,7 @@ def _evaluate_subset( def _compute_metrics( self, - scores: list[torch.Tensor], + scores: torch.Tensor, num_images_per_sample: int, num_texts_per_sample: int, ) -> ImageTextPairClassificationMetrics: diff --git a/mteb/abstasks/multilabel_classification.py b/mteb/abstasks/multilabel_classification.py index b6bda3179f..07c6ac5c4c 100644 --- a/mteb/abstasks/multilabel_classification.py +++ b/mteb/abstasks/multilabel_classification.py @@ -17,6 +17,7 @@ from mteb._evaluators.classification_metrics import hamming_score from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol from mteb.models import EncoderProtocol +from mteb.types import Array from .classification import AbsTaskClassification @@ -24,14 +25,14 @@ def _evaluate_classifier( - embeddings_train: np.ndarray, + embeddings_train: Array, y_train: np.ndarray, - embeddings_test: np.ndarray, + embeddings_test: Array, classifier: SklearnModelProtocol, ) -> tuple[np.ndarray, SklearnModelProtocol]: - classifier: SklearnModelProtocol = clone(classifier) - classifier.fit(embeddings_train, y_train) - return classifier.predict(embeddings_test), classifier + classifier_copy: SklearnModelProtocol = clone(classifier) + classifier_copy.fit(embeddings_train, y_train) + return classifier_copy.predict(embeddings_test), classifier_copy class MultilabelClassificationMetrics(TypedDict): @@ -72,12 +73,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification): evaluator: Classifier to use for evaluation. Must implement the SklearnModelProtocol. 
""" - evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5) + evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5) # type: ignore[assignment] input_column_name: str = "text" label_column_name: str = "label" @override - def _evaluate_subset( + def _evaluate_subset( # type: ignore[override] self, model: EncoderProtocol, data_split: DatasetDict, @@ -185,19 +186,20 @@ def _evaluate_subset( ) avg_scores: dict[str, Any] = { - k: np.mean([s[k] for s in scores]) for k in scores[0].keys() + k: np.mean([s[k] for s in scores]) + for k in scores[0].keys() # type: ignore[literal-required] } logger.info("Running multilabel classification - Finished.") return FullMultilabelClassificationMetrics( scores_per_experiment=scores, - **avg_scores, + **avg_scores, # type: ignore[typeddict-item] ) - def _calculate_scores( + def _calculate_scores( # type: ignore[override] self, y_test: np.ndarray, y_pred: np.ndarray, - x_test_embedding: np.ndarray, + x_test_embedding: Array, current_classifier: SklearnModelProtocol, ) -> MultilabelClassificationMetrics: accuracy = current_classifier.score(x_test_embedding, y_test) @@ -232,9 +234,8 @@ def _undersample_data_indices( """ sample_indices = [] if idxs is None: - idxs = np.arange(len(y)) + idxs = list(range(len(y))) self.np_rng.shuffle(idxs) - idxs = idxs.tolist() label_counter: dict[int, int] = defaultdict(int) for i in idxs: if any((label_counter[label] < samples_per_label) for label in y[i]): diff --git a/mteb/abstasks/pair_classification.py b/mteb/abstasks/pair_classification.py index df134bbccc..b336759c5d 100644 --- a/mteb/abstasks/pair_classification.py +++ b/mteb/abstasks/pair_classification.py @@ -44,8 +44,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics): """ num_samples: int - number_of_characters: int - unique_pairs: int + number_of_characters: int | None + unique_pairs: int | None text1_statistics: TextStatistics | None image1_statistics: ImageStatistics | None @@ -120,7 
+120,7 @@ def _compute_metrics( self, similarity_scores: PairClassificationDistances, labels: list[int] ) -> dict[str, float]: logger.info("Computing metrics...") - labels = np.asarray(labels) + np_labels = np.asarray(labels) output_scores = {} max_scores = defaultdict(list) for short_name, scores, reverse in [ @@ -142,7 +142,7 @@ def _compute_metrics( ], [ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True], ]: - metrics = self._compute_metrics_values(scores, labels, reverse) + metrics = self._compute_metrics_values(scores, np_labels, reverse) # type: ignore[arg-type] for metric_name, metric_value in metrics.items(): output_scores[f"{short_name}_{metric_name}"] = metric_value max_scores[metric_name].append(metric_value) @@ -237,6 +237,8 @@ def _compute_image_hash(inputs: list) -> list[str]: def _push_dataset_to_hub(self, repo_name: str) -> None: # previously pair classification datasets were stored in a single row + if self.dataset is None: + raise RuntimeError("Dataset not loaded") if self.metadata.is_multilingual: for subset in self.dataset: for split in self.dataset[subset]: @@ -290,13 +292,13 @@ def _compute_metrics_values( ) def _find_best_acc_and_threshold( - self, scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool + self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool ) -> tuple[float, float]: rows = list(zip(scores, labels)) rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar) max_acc = 0 - best_threshold = -1 + best_threshold = -1.0 positive_so_far = 0 remaining_negatives = sum(np.array(labels) == 0) @@ -323,7 +325,7 @@ def _find_best_f1_and_threshold( rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar) - best_f1 = best_precision = best_recall = 0 + best_f1 = best_precision = best_recall = 0.0 threshold = 0 nextract = 0 ncorrect = 0 diff --git a/mteb/abstasks/regression.py b/mteb/abstasks/regression.py index 024afcc91e..322a221e10 100644 --- 
a/mteb/abstasks/regression.py +++ b/mteb/abstasks/regression.py @@ -87,7 +87,7 @@ class AbsTaskRegression(AbsTaskClassification): Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol]. """ - evaluator: type[SklearnModelProtocol] = SklearnEvaluator + evaluator: type[SklearnEvaluator] = SklearnEvaluator evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1) train_split: str = "train" @@ -113,7 +113,7 @@ def _undersample_data( )["train"] return train_split_sampled, [] - def _calculate_scores( + def _calculate_scores( # type: ignore[override] self, y_test: np.ndarray | list[int], y_pred: np.ndarray, @@ -183,7 +183,7 @@ def stratified_subsampling( return dataset_dict - def _calculate_descriptive_statistics_from_split( + def _calculate_descriptive_statistics_from_split( # type: ignore[override] self, split: str, hf_subset: str | None = None, compute_overall: bool = False ) -> RegressionDescriptiveStatistics: train_text = [] diff --git a/mteb/abstasks/retrieval.py b/mteb/abstasks/retrieval.py index c44be10811..96dea9978c 100644 --- a/mteb/abstasks/retrieval.py +++ b/mteb/abstasks/retrieval.py @@ -1,7 +1,7 @@ import json import logging from collections import defaultdict -from collections.abc import Callable, Sequence +from collections.abc import Callable, Mapping, Sequence from pathlib import Path from time import time from typing import Any, Literal @@ -286,7 +286,7 @@ def evaluate( encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, **kwargs, - ) -> dict[HFSubset, ScoresDict]: + ) -> Mapping[HFSubset, ScoresDict]: """Evaluate the model on the retrieval task. 
Args: @@ -357,6 +357,8 @@ def _evaluate_subset( **kwargs, ) + search_model: SearchProtocol + if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol): search_model = SearchEncoderWrapper(model) elif isinstance(model, CrossEncoderProtocol): @@ -578,11 +580,12 @@ def _push_section( if isinstance(data[split][subset_item], Dataset): sections[split] = data[split][subset_item] elif converter is not None: + subset_data = data[split][subset_item] + if subset_data is None: + continue + sections[split] = Dataset.from_list( - [ - converter(idx, item) - for idx, item in data[split][subset_item].items() - ] + [converter(idx, item) for idx, item in subset_data.items()] ) else: raise ValueError( @@ -680,7 +683,7 @@ def convert_to_reranking( top_k_sorted = defaultdict(list) for query_id, values in top_ranked.items(): - sorted_keys = sorted(values, key=values.get, reverse=True) + sorted_keys = sorted(values, key=lambda k: values[k], reverse=True) top_k_sorted[query_id] = sorted_keys[: self._top_k] self.dataset[subset][split]["top_ranked"] = top_k_sorted @@ -688,10 +691,10 @@ def convert_to_reranking( def _process_relevant_docs( - collection: dict[str, dict[str, float]], + collection: Mapping[str, Mapping[str, int]], hf_subset: str, split: str, -) -> dict[str, dict[str, float]]: +) -> dict[str, dict[str, int]]: """Collections can contain overlapping ids in different splits. 
Prepend split and subset to avoid this Returns: diff --git a/mteb/abstasks/text/bitext_mining.py b/mteb/abstasks/text/bitext_mining.py index 6112f391a1..ecbdb05b5f 100644 --- a/mteb/abstasks/text/bitext_mining.py +++ b/mteb/abstasks/text/bitext_mining.py @@ -1,14 +1,14 @@ import logging from collections import defaultdict from pathlib import Path -from typing import Any, ClassVar, cast +from typing import Any, ClassVar, TypedDict, cast from datasets import Dataset, DatasetDict from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score from mteb._evaluators import BitextMiningEvaluator from mteb.abstasks._statistics_calculation import calculate_text_statistics -from mteb.abstasks.abstask import AbsMetrics, AbsTask +from mteb.abstasks.abstask import AbsTask from mteb.models import EncoderProtocol, MTEBModels from mteb.models.models_protocols import CrossEncoderProtocol, SearchProtocol from mteb.types import HFSubset, ScoresDict @@ -37,7 +37,7 @@ class BitextDescriptiveStatistics(SplitDescriptiveStatistics): sentence2_statistics: TextStatistics -class BitextMiningMetrics(AbsMetrics): +class BitextMiningMetrics(TypedDict): """Metrics for BitextMining tasks Attributes: @@ -142,7 +142,7 @@ def evaluate( **kwargs, ) - return scores + return cast(dict[HFSubset, ScoresDict], scores) def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]: pairs = self._DEFAULT_PAIR @@ -191,16 +191,16 @@ def _evaluate_subset( # type: ignore[override] ) if parallel: - metrics = {} + parallel_metrics = {} for keys, nearest_neighbors in neighbours.items(): - metrics[keys] = self._compute_metrics(nearest_neighbors, gold) + parallel_metrics[keys] = self._compute_metrics(nearest_neighbors, gold) - for v in metrics.values(): + for v in parallel_metrics.values(): self._add_main_score(v) - else: - def_pair_str = "-".join(self._DEFAULT_PAIR[0]) - metrics = self._compute_metrics(neighbours[def_pair_str], gold) - self._add_main_score(metrics) + return parallel_metrics + 
def_pair_str = "-".join(self._DEFAULT_PAIR[0]) + metrics = self._compute_metrics(neighbours[def_pair_str], gold) + self._add_main_score(metrics) return metrics def _compute_metrics( diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index cf34a94adb..a734d2fa4e 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -5,7 +5,7 @@ import os import sys import traceback -from collections.abc import Iterable, Mapping +from collections.abc import Iterable from copy import deepcopy from datetime import datetime from pathlib import Path @@ -455,7 +455,7 @@ def run( task.check_if_dataset_is_superseded() task.load_data() - task_results: dict[str, Mapping[str, dict[str, Any]]] = {} + task_results: dict[str, dict[str, dict[str, Any]]] = {} evaluation_time = 0 kg_co2_emissions: int | None = 0 if co2_tracker else None @@ -532,7 +532,7 @@ def run( # Create new TaskResult new_results = TaskResult.from_task_results( task, - task_results, + task_results, # type: ignore[arg-type] evaluation_time=evaluation_time, kg_co2_emissions=kg_co2_emissions, ) diff --git a/mteb/models/model_meta.py b/mteb/models/model_meta.py index df03e09545..fdedf92a08 100644 --- a/mteb/models/model_meta.py +++ b/mteb/models/model_meta.py @@ -79,7 +79,7 @@ def _get_loader_name( return loader.__name__ -_SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers" +_SENTENCE_TRANSFORMER_LIB_NAME: FRAMEWORKS = "Sentence Transformers" class ModelMeta(BaseModel): @@ -274,9 +274,8 @@ def _from_hub( model_config = None logger.warning(f"Can't get configuration for {model_name}. 
Error: {e}") - if ( - card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME - or _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags + if card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME or ( + card_data.tags and _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags ): frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME) else: @@ -391,7 +390,7 @@ def from_hub( and config_sbert.get("similarity_fn_name") is not None ): meta.similarity_fn_name = ScoringFunction.from_str( - config_sbert.get("similarity_fn_name") + config_sbert["similarity_fn_name"] ) else: meta.similarity_fn_name = ScoringFunction.COSINE @@ -503,6 +502,8 @@ def zero_shot_percentage( @staticmethod def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None: + if not model_name: + return None try: safetensors_metadata = get_safetensors_metadata(model_name) if len(safetensors_metadata.parameter_count) >= 0: @@ -516,7 +517,7 @@ def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | N logger.warning( f"Can't calculate number of parameters for {model_name}. Got error {e}" ) - return None + return None def calculate_num_parameters_from_hub(self) -> int | None: """Calculates the number of parameters in the model. @@ -579,7 +580,7 @@ def calculate_memory_usage_mb(self) -> int | None: if "API" in self.framework or self.name is None: return None - return self._calculate_memory_usage_mb(self.model_name, self.n_parameters) + return self._calculate_memory_usage_mb(self.name, self.n_parameters) @staticmethod def fetch_release_date(model_name: str) -> StrDate | None: diff --git a/mteb/types/_result.py b/mteb/types/_result.py index 848bb8e713..edb0f57bfb 100644 --- a/mteb/types/_result.py +++ b/mteb/types/_result.py @@ -1,3 +1,4 @@ +from collections.abc import Mapping from typing import Any, NamedTuple HFSubset = str @@ -8,7 +9,7 @@ Score = Any """A score value, could e.g. be accuracy. Normally it is a float or int, but it can take on any value. 
Should be json serializable.""" -ScoresDict = dict[str, Score] +ScoresDict = Mapping[str, Score] """A dictionary of scores, typically also include metadata, e.g {'main_score': 0.5, 'accuracy': 0.5, 'f1': 0.6, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}""" From 5cfc64b9c78d780ffe8543296de150353b4c2589 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Dec 2025 02:10:53 +0300 Subject: [PATCH 10/32] fix tests --- .../image/imagetext_pairclassification_evaluator.py | 5 ++--- mteb/abstasks/image/image_text_pair_classification.py | 6 +++--- mteb/models/cache_wrappers/cache_backends/numpy_cache.py | 9 ++------- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py b/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py index 21a37ac422..5b8c4ab6dd 100644 --- a/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +++ b/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py @@ -16,7 +16,6 @@ from mteb._requires_package import requires_image_dependencies from mteb.abstasks.task_metadata import TaskMetadata from mteb.models.models_protocols import EncoderProtocol -from mteb.types import Array if TYPE_CHECKING: from PIL.Image import Image @@ -86,7 +85,7 @@ def __init__( def __call__( # type: ignore[override] self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any] - ) -> Array: + ) -> list[torch.Tensor]: images = [] if isinstance(self.images_column_names, str): images = self.dataset[self.images_column_names] @@ -150,4 +149,4 @@ def __call__( # type: ignore[override] img_emb @ txt_emb.t() ) # shape = (num_images_per_sample x num_texts_per_sample) all_scores.append(scores) - return torch.tensor(all_scores) + return all_scores diff --git a/mteb/abstasks/image/image_text_pair_classification.py b/mteb/abstasks/image/image_text_pair_classification.py index 079aeb7fcc..e5a1e6debb 100644 --- 
a/mteb/abstasks/image/image_text_pair_classification.py +++ b/mteb/abstasks/image/image_text_pair_classification.py @@ -154,10 +154,10 @@ def _evaluate_subset( hf_subset=hf_subset, **kwargs, ) - scores: torch.Tensor = evaluator(model, encode_kwargs=encode_kwargs) # type: ignore[assignment] + scores: list[torch.Tensor] = evaluator(model, encode_kwargs=encode_kwargs) # type: ignore[assignment] if prediction_folder: self._save_task_predictions( - scores.tolist(), + [score.tolist() for score in scores], model, prediction_folder, hf_subset=hf_subset, @@ -172,7 +172,7 @@ def _evaluate_subset( def _compute_metrics( self, - scores: torch.Tensor, + scores: list[torch.Tensor], num_images_per_sample: int, num_texts_per_sample: int, ) -> ImageTextPairClassificationMetrics: diff --git a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py index 005f68bb99..5fa67c2a5c 100644 --- a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py +++ b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py @@ -28,10 +28,6 @@ def __init__(self, directory: str | Path, initial_vectors: int = 100_000): def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: """Add a vector to the cache.""" - if self.vectors is None: - raise RuntimeError( - "Vectors file not initialized. Call _initialize_vectors_file() first." - ) try: if self.vector_dim is None: self.vector_dim = ( @@ -179,9 +175,8 @@ def load(self) -> None: def get_vector(self, item: dict[str, Any]) -> np.ndarray | None: """Retrieve vector from index by hash.""" if self.vectors is None: - raise RuntimeError( - "Vectors file not initialized. Call _initialize_vectors_file() first." 
- ) + return None + try: item_hash = _hash_item(item) if item_hash not in self.hash_to_index: From 0c45374790c056c907beb8694d9a3ae60033f57c Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Dec 2025 02:12:46 +0300 Subject: [PATCH 11/32] fix type errors again --- mteb/abstasks/multilabel_classification.py | 4 ++-- mteb/models/cache_wrappers/cache_backends/numpy_cache.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/mteb/abstasks/multilabel_classification.py b/mteb/abstasks/multilabel_classification.py index 07c6ac5c4c..b970ee9f9a 100644 --- a/mteb/abstasks/multilabel_classification.py +++ b/mteb/abstasks/multilabel_classification.py @@ -186,8 +186,8 @@ def _evaluate_subset( # type: ignore[override] ) avg_scores: dict[str, Any] = { - k: np.mean([s[k] for s in scores]) - for k in scores[0].keys() # type: ignore[literal-required] + k: np.mean([s[k] for s in scores]) # type: ignore[literal-required] + for k in scores[0].keys() } logger.info("Running multilabel classification - Finished.") return FullMultilabelClassificationMetrics( diff --git a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py index 5fa67c2a5c..b13fe25583 100644 --- a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py +++ b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py @@ -46,10 +46,14 @@ def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: index = self.hash_to_index[item_hash] else: index = len(self.hash_to_index) - if index >= len(self.vectors): + if self.vectors and index >= len(self.vectors): self._double_vectors_file() self.hash_to_index[item_hash] = index + if self.vectors is None: + raise RuntimeError( + "Vectors file not initialized. Call _initialize_vectors_file() first." + ) self.vectors[index] = vec logger.debug( f"Added new item-vector pair. 
Total pairs: {len(self.hash_to_index)}" From 234fdac39450bde6650706025ac83abbe886fac2 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 16 Dec 2025 11:10:51 +0300 Subject: [PATCH 12/32] fix cache --- .../cache_wrappers/cache_backends/numpy_cache.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py index b13fe25583..c724634305 100644 --- a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py +++ b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py @@ -37,6 +37,11 @@ def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: self._save_dimension() logger.info(f"Initialized vector dimension to {self.vector_dim}") + if self.vectors is None: + raise RuntimeError( + "Vectors file not initialized. Call _initialize_vectors_file() first." + ) + for item, vec in zip(items, vectors): item_hash = _hash_item(item) if item_hash in self.hash_to_index: @@ -46,14 +51,10 @@ def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: index = self.hash_to_index[item_hash] else: index = len(self.hash_to_index) - if self.vectors and index >= len(self.vectors): + if index >= len(self.vectors): self._double_vectors_file() self.hash_to_index[item_hash] = index - if self.vectors is None: - raise RuntimeError( - "Vectors file not initialized. Call _initialize_vectors_file() first." - ) self.vectors[index] = vec logger.debug( f"Added new item-vector pair. 
Total pairs: {len(self.hash_to_index)}" From 39e09dd61e45481b4fdf32b813fcb61930ccd0bd Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 22 Dec 2025 11:30:17 +0300 Subject: [PATCH 13/32] add more types --- mteb/_evaluators/retrieval_metrics.py | 33 +++++------ mteb/abstasks/_stratification.py | 21 ++++--- mteb/abstasks/task_metadata.py | 20 ++++--- .../search_indexes/faiss_search_index.py | 2 +- mteb/models/search_wrappers.py | 13 +++-- mteb/models/sentence_transformer_wrapper.py | 8 ++- mteb/similarity_functions.py | 18 +++--- pyproject.toml | 57 ++++++++++--------- 8 files changed, 91 insertions(+), 81 deletions(-) diff --git a/mteb/_evaluators/retrieval_metrics.py b/mteb/_evaluators/retrieval_metrics.py index e998883a13..2e8ac2110b 100644 --- a/mteb/_evaluators/retrieval_metrics.py +++ b/mteb/_evaluators/retrieval_metrics.py @@ -1,5 +1,6 @@ import logging from collections import defaultdict +from collections.abc import Mapping from typing import Any import numpy as np @@ -15,7 +16,7 @@ def mrr( qrels: RelevantDocumentsType, - results: dict[str, dict[str, float]], + results: Mapping[str, Mapping[str, float]], k_values: list[int], ) -> dict[str, list[float]]: mrr_metrics = defaultdict(list) @@ -32,7 +33,7 @@ def mrr( doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0 } for k in k_values: - rr = 0 + rr = 0.0 for rank, hit in enumerate(top_hits[query_id][0:k]): if hit[0] in query_relevant_docs: rr = 1.0 / (rank + 1) @@ -45,8 +46,8 @@ def recall_cap( qrels: RelevantDocumentsType, results: dict[str, dict[str, float]], k_values: list[int], -) -> dict[str, list[float]]: - capped_recall = defaultdict(list) +) -> dict[str, list[float | None]]: + capped_recall: dict[str, list[float | None]] = defaultdict(list) k_max = max(k_values) @@ -188,7 +189,7 @@ def evaluate_p_mrr_change( Returns: A dictionary with the scores, including "p-MRR", "og" and "changed" keys. 
""" - followir_scores = defaultdict(dict) + followir_scores: dict[str, float | dict[str, float]] = defaultdict(dict) qrels_sep = { "og": {k: v for k, v in qrels.items() if k.endswith("-og")}, @@ -227,7 +228,7 @@ def evaluate_p_mrr_change( ndcg, _map, recall, precision, naucs, avg_mrr, naucs_mrr, cv_recall, {} ) for key, value in scores_dict.items(): - followir_scores[name][key] = value + followir_scores[name][key] = value # type: ignore[index] return followir_scores @@ -254,8 +255,8 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]: sim_scores_sorted = sorted(sim_scores)[::-1] cs_max = sim_scores_sorted[0] - cs_std = np.std(sim_scores) - cs_diff1 = None + cs_std = float(np.std(sim_scores)) + cs_diff1 = 0.0 if len(sim_scores) > 1: cs_diff1 = sim_scores_sorted[0] - sim_scores_sorted[1] elif len(sim_scores) == 1: @@ -410,7 +411,7 @@ def make_score_dict( cv_recall: dict[str, float], task_scores: dict[str, float], previous_results_model_meta: dict[str, Any] | None = None, -) -> dict[str, float]: +) -> dict[str, Any]: return { **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, @@ -528,7 +529,7 @@ def max_over_subqueries( def calculate_retrieval_scores( - results: dict[str, dict[str, float]], + results: Mapping[str, Mapping[str, float]], qrels: RelevantDocumentsType, k_values: list[int], skip_first_result: bool = False, @@ -576,7 +577,7 @@ def calculate_retrieval_scores( def evaluate_abstention( - results: dict[str, dict[str, float]], + results: Mapping[str, Mapping[str, float]], metric_scores: dict[str, list[float]], ) -> dict[str, float]: """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997 @@ -591,21 +592,21 @@ def evaluate_abstention( all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())] all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores] 
conf_fcts = list(all_conf_scores[0].keys()) - all_conf_scores = { + all_conf_scores_ = { fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts } - metric_scores = {k: np.array(v) for k, v in metric_scores.items()} + metric_scores_ = {k: np.array(v) for k, v in metric_scores.items()} naucs = {} - for metric_name, scores in metric_scores.items(): - for fct, conf_scores in all_conf_scores.items(): + for metric_name, scores in metric_scores_.items(): + for fct, conf_scores in all_conf_scores_.items(): naucs[f"nAUC_{metric_name}_{fct}"] = nauc(conf_scores, scores) return naucs def calculate_cv_recall( - results: dict[str, dict[str, float]], + results: Mapping[str, Mapping[str, float]], qrels: RelevantDocumentsType, k_values: list[int], skip_first_result: bool = False, diff --git a/mteb/abstasks/_stratification.py b/mteb/abstasks/_stratification.py index 954afdfab0..a010a3a3a7 100644 --- a/mteb/abstasks/_stratification.py +++ b/mteb/abstasks/_stratification.py @@ -39,6 +39,7 @@ """ import itertools +from typing import Any import numpy as np import scipy.sparse as sp @@ -182,9 +183,9 @@ def _prepare_stratification( list[list[int]], dict[int, bool], list[list[int]], - list[list[list[int]]], - dict[tuple[int, ...], list[int]], - list[list[int]], + list[list[Any]], + dict[str, list[Any]], + list[list[Any]], ]: """Prepares variables for performing stratification @@ -211,8 +212,8 @@ def _prepare_stratification( rows = sp.lil_matrix(y).rows rows_used = dict.fromkeys(range(self.n_samples), False) all_combinations = [] - per_row_combinations: list[list[int]] = [[] for i in range(self.n_samples)] - samples_with_combination: dict[str, int] = {} + per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)] + samples_with_combination: dict[str, list[Any]] = {} folds = [[] for _ in range(self.n_splits)] # type: ignore # for every row @@ -229,21 +230,19 @@ def _prepare_stratification( all_combinations.append(combination) 
per_row_combinations[sample_index].append(combination) - all_combinations: list[list[int]] = [list(x) for x in set(all_combinations)] - self.desired_samples_per_combination_per_fold = { combination: np.array( [ len(evidence_for_combination) * self.percentage_per_fold[j] - for j in range(self.n_splits) # type: ignore + for j in range(self.n_splits) ] ) for combination, evidence_for_combination in samples_with_combination.items() } return ( - rows, + rows.tolist(), rows_used, - all_combinations, + [list(x) for x in set(all_combinations)], per_row_combinations, samples_with_combination, folds, @@ -328,7 +327,7 @@ def _iter_test_indices(self, X, y=None, groups=None): # noqa: N803 per_row_combinations, samples_with_combination, folds, - ) = self._prepare_stratification(y) # type: ignore + ) = self._prepare_stratification(y) self._distribute_positive_evidence( rows_used, folds, samples_with_combination, per_row_combinations diff --git a/mteb/abstasks/task_metadata.py b/mteb/abstasks/task_metadata.py index bb592ec82d..59c7c4493e 100644 --- a/mteb/abstasks/task_metadata.py +++ b/mteb/abstasks/task_metadata.py @@ -2,9 +2,10 @@ import logging from collections.abc import Sequence from pathlib import Path -from typing import Any, Literal +from typing import Any, Literal, cast from huggingface_hub import ( + CardData, DatasetCard, DatasetCardData, constants, @@ -469,7 +470,7 @@ def get_modalities(self, prompt_type: PromptType | None = None) -> list[Modaliti def _create_dataset_card_data( self, - existing_dataset_card_data: DatasetCardData | None = None, + existing_dataset_card_data: CardData | None = None, ) -> tuple[DatasetCardData, dict[str, Any]]: """Create a DatasetCardData object from the task metadata. 
@@ -504,12 +505,13 @@ def _create_dataset_card_data( tags = ["mteb"] + self.modalities - descriptive_stats = self.descriptive_stats - if descriptive_stats is not None: - for split, split_stat in descriptive_stats.items(): + descriptive_stats = "" + if self.descriptive_stats is not None: + descriptive_stats_ = self.descriptive_stats + for split, split_stat in descriptive_stats_.items(): if len(split_stat.get("hf_subset_descriptive_stats", {})) > 10: split_stat.pop("hf_subset_descriptive_stats", {}) - descriptive_stats = json.dumps(descriptive_stats, indent=4) + descriptive_stats = json.dumps(descriptive_stats_, indent=4) dataset_card_data_params = existing_dataset_card_data.to_dict() # override the existing values @@ -697,11 +699,11 @@ def _hf_task_category(self) -> list[str]: def _hf_languages(self) -> list[str]: languages: list[str] = [] - if self.is_multilingual: - for val in list(self.eval_langs.values()): + if self.is_multilingual and isinstance(self.eval_langs, dict): + for val in self.eval_langs.values(): languages.extend(val) else: - languages = self.eval_langs + languages = cast(list[str], self.eval_langs) # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters), # or a special value like "code", "multilingual". 
readme_langs = [] diff --git a/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py b/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py index 3cc0fa5024..a383bd6819 100644 --- a/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +++ b/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py @@ -108,7 +108,7 @@ def search( ids = ids.tolist() if issubclass(self.index_type, faiss.IndexFlatL2): - similarities = -np.sqrt(np.maximum(similarities, 0)) + similarities = -np.sqrt(np.maximum(similarities, 0)).tolist() return similarities, ids diff --git a/mteb/models/search_wrappers.py b/mteb/models/search_wrappers.py index 08270a99a7..8a441446b8 100644 --- a/mteb/models/search_wrappers.py +++ b/mteb/models/search_wrappers.py @@ -200,7 +200,7 @@ def search( # Reset the task corpus dataloader to None to free up memory self.task_corpus = None - results = {qid: {} for qid in query_idx_to_id.values()} + results: RetrievalOutputType = {qid: {} for qid in query_idx_to_id.values()} for qid in result_heaps: for score, corpus_id in result_heaps[qid]: results[qid][corpus_id] = score @@ -218,13 +218,14 @@ def _full_corpus_search( encode_kwargs: dict[str, Any], ) -> dict[str, list[tuple[float, str]]]: logger.info("Encoding Corpus in batches (this might take a while)...") - itr = range(0, len(self.task_corpus), self.corpus_chunk_size) + itr = range(0, len(self.task_corpus), self.corpus_chunk_size) # type: ignore[arg-type] result_heaps = {qid: [] for qid in query_idx_to_id.values()} for batch_num, corpus_start_idx in enumerate(itr): logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...") corpus_end_idx = min( - corpus_start_idx + self.corpus_chunk_size, len(self.task_corpus) + corpus_start_idx + self.corpus_chunk_size, + len(self.task_corpus), # type: ignore[arg-type] ) sub_corpus = self.task_corpus.select( range(corpus_start_idx, corpus_end_idx) @@ -249,7 +250,7 @@ def _full_corpus_search( scores = 
self.model.similarity(query_embeddings, sub_corpus_embeddings) # get top-k values - cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( + cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = torch.topk( torch.as_tensor(scores), min( top_k + 1, @@ -258,8 +259,8 @@ def _full_corpus_search( dim=1, largest=True, ) - cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() - cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() + cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist() + cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist() sub_corpus_ids = list(sub_corpus_ids) result_heaps = self._sort_full_corpus_results( diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 2330d97037..6ac1a6d7d5 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -150,7 +150,7 @@ def encode( prompt_name = None if self.model_prompts is not None: prompt_name = self.get_prompt_name(task_metadata, prompt_type) - prompt = self.model_prompts.get(prompt_name, None) + prompt = self.model_prompts.get(prompt_name, None) # type: ignore[arg-type] if prompt_name: prompt_log = f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}" else: @@ -221,7 +221,7 @@ def encode( prompt_name = None if self.model_prompts is not None: prompt_name = self.get_prompt_name(task_metadata, prompt_type) - prompt = self.model_prompts.get(prompt_name, None) + prompt = self.model_prompts.get(prompt_name, None) # type: ignore[arg-type] if prompt_name: logger.info( f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}" @@ -234,7 +234,9 @@ def encode( all_embeddings = [] for batch in inputs: batch_column = next(iter(batch.keys())) - batched_input = [dict() for _ in range(len(batch[batch_column]))] + batched_input: list[dict[str, Any]] = [ + dict() for _ in range(len(batch[batch_column])) + ] # 
transform from {"text": [text1, text2], "image": [image1, image2]} to # [{"text": text1, "image": image1}, {"text": text2, "image": image2}] diff --git a/mteb/similarity_functions.py b/mteb/similarity_functions.py index 1624a034d1..cd5f32abb6 100644 --- a/mteb/similarity_functions.py +++ b/mteb/similarity_functions.py @@ -186,7 +186,7 @@ def max_sim(a: Array, b: Array) -> torch.Tensor: b, ) - return scores.max(axis=-1).values.sum(axis=-1) + return scores.max(axis=-1).values.sum(axis=-1) # type: ignore[call-overload] # https://github.com/lightonai/pylate/blob/2d094a724866d6e15701781528368438081c0157/pylate/scores/scores.py#L67C1-L122C38 @@ -217,7 +217,7 @@ def pairwise_max_sim( document_embedding, ) - scores.append(query_document_score.max(axis=-1).values.sum()) + scores.append(query_document_score.max(axis=-1).values.sum()) # type: ignore[call-overload] return torch.stack(scores, dim=0) @@ -317,11 +317,15 @@ def similarity(text_embeddings: Array, input_embeddings: Array) -> Array: Returns: Matrix with similarities """ - text_embeddings = _convert_to_tensor(text_embeddings) - input_embeddings = _convert_to_tensor(input_embeddings) + text_embeddings_tensor = _convert_to_tensor(text_embeddings) + input_embeddings_tensor = _convert_to_tensor(input_embeddings) - text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True) - input_embeddings = input_embeddings / input_embeddings.norm(dim=-1, keepdim=True) - logits = torch.matmul(input_embeddings, text_embeddings.T) + text_embeddings_tensor = text_embeddings_tensor / text_embeddings_tensor.norm( + dim=-1, keepdim=True + ) + input_embeddings_tensor = input_embeddings_tensor / input_embeddings_tensor.norm( + dim=-1, keepdim=True + ) + logits = torch.matmul(input_embeddings_tensor, text_embeddings_tensor.T) probs = (logits * 100).softmax(dim=-1) return probs diff --git a/pyproject.toml b/pyproject.toml index 2a61fc1c68..668ff3f37e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,34 +70,34 @@ 
leaderboard = [ ] # model specific optional dependencies: -peft = ["peft>=0.11.0"] -flagembedding = ["FlagEmbedding==1.3.4"] -jina = ["einops>=0.8.0"] -jina-v4 = ["peft>=0.15.2", "transformers>=4.52.0", "torchvision>=0.22.1"] -flash_attention = ["flash-attn>=2.6.3"] -openai = ["openai>=1.41.0", "tiktoken>=0.8.0"] -model2vec = ["model2vec>=0.3.0"] -pylate = ["pylate>=1.3.1; python_version < '3.13'"] # Required for sentence-transformers 5.0.0 compatibility (otherwise 1.1.6), pylate requires voyager, which in turn requires <=3.12 -bm25s = ["bm25s>=0.2.6", "PyStemmer>=2.2.0.3"] -gritlm = ["gritlm>=1.0.2"] -xformers = ["xformers>=0.0.29"] -blip2 = ["salesforce-lavis>=1.0.2"] -voyageai = ["voyageai>0.3.0,<2.0.0"] -voyage_v = ["voyageai>0.3.0,<2.0.0", "tenacity>9.0.0"] -cohere = ["cohere==5.14.0"] -vertexai = ["vertexai==1.71.1"] -llm2vec = ["llm2vec>=0.2.3,<0.3.0"] -timm = ["timm>=1.0.15,<1.1.0"] -open_clip_torch = ["open_clip_torch==2.31.0"] -nomic = ["einops>=0.8.1"] -ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] -colpali_engine = ["colpali_engine>=0.3.12"] -colqwen3 = ["transformers>=4.57", "torchvision>=0.22.1"] -xet = ["huggingface_hub>=0.32.0"] -youtu = ["tencentcloud-sdk-python-common>=3.0.1454", "tencentcloud-sdk-python-lkeap>=3.0.1451"] -llama-embed-nemotron = ["transformers==4.51.0"] -faiss-cpu = ["faiss-cpu>=1.12.0"] -eager_embed = ["qwen_vl_utils>=0.0.14"] +#peft = ["peft>=0.11.0"] +#flagembedding = ["FlagEmbedding==1.3.4"] +#jina = ["einops>=0.8.0"] +#jina-v4 = ["peft>=0.15.2", "transformers>=4.52.0", "torchvision>=0.22.1"] +#flash_attention = ["flash-attn>=2.6.3"] +#openai = ["openai>=1.41.0", "tiktoken>=0.8.0"] +#model2vec = ["model2vec>=0.3.0"] +#pylate = ["pylate>=1.3.1; python_version < '3.13'"] # Required for sentence-transformers 5.0.0 compatibility (otherwise 1.1.6), pylate requires voyager, which in turn requires <=3.12 +#bm25s = ["bm25s>=0.2.6", "PyStemmer>=2.2.0.3"] +#gritlm = ["gritlm>=1.0.2"] +#xformers = ["xformers>=0.0.29"] 
+#blip2 = ["salesforce-lavis>=1.0.2"] +#voyageai = ["voyageai>0.3.0,<2.0.0"] +#voyage_v = ["voyageai>0.3.0,<2.0.0", "tenacity>9.0.0"] +#cohere = ["cohere==5.14.0"] +#vertexai = ["vertexai==1.71.1"] +#llm2vec = ["llm2vec>=0.2.3,<0.3.0"] +#timm = ["timm>=1.0.15,<1.1.0"] +#open_clip_torch = ["open_clip_torch==2.31.0"] +#nomic = ["einops>=0.8.1"] +#ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] +#colpali_engine = ["colpali_engine>=0.3.12"] +#colqwen3 = ["transformers>=4.57", "torchvision>=0.22.1"] +#xet = ["huggingface_hub>=0.32.0"] +#youtu = ["tencentcloud-sdk-python-common>=3.0.1454", "tencentcloud-sdk-python-lkeap>=3.0.1451"] +#llama-embed-nemotron = ["transformers==4.51.0"] +#faiss-cpu = ["faiss-cpu>=1.12.0"] +#eager_embed = ["qwen_vl_utils>=0.0.14"] [dependency-groups] lint = [ @@ -135,6 +135,7 @@ typing = [ "pandas-stubs>=2.3.2.250926", "scipy-stubs>=1.15.3.0", "types-defusedxml>=0.7.0.20250822", + "pillow>=12.0.0", ] dev = [ {include-group = "lint"}, From 20fa646228fa8f0a45c4a2e98827794b8f26fbea Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 22 Dec 2025 12:58:46 +0300 Subject: [PATCH 14/32] fix method --- mteb/models/sentence_transformer_wrapper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 6ac1a6d7d5..74402c131d 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -103,8 +103,11 @@ def __init__( f" 'document' prompts to ensure optimal performance. 
Received {self.model_prompts}" ) + def similarity(self, embeddings1: Array, embeddings2: Array) -> Array: + """Compute the similarity between two collections of embeddings.""" if hasattr(self.model, "similarity") and callable(self.model.similarity): - self.similarity = self.model.similarity + return self.model.similarity(embeddings1, embeddings2) + return super().similarity(embeddings1, embeddings2) def encode( self, From d396270c615a847c244b1c60ae3139f834c516de Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 22 Dec 2025 21:30:44 +0300 Subject: [PATCH 15/32] roll back pyproject --- pyproject.toml | 56 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 668ff3f37e..5b94a91826 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,34 +70,34 @@ leaderboard = [ ] # model specific optional dependencies: -#peft = ["peft>=0.11.0"] -#flagembedding = ["FlagEmbedding==1.3.4"] -#jina = ["einops>=0.8.0"] -#jina-v4 = ["peft>=0.15.2", "transformers>=4.52.0", "torchvision>=0.22.1"] -#flash_attention = ["flash-attn>=2.6.3"] -#openai = ["openai>=1.41.0", "tiktoken>=0.8.0"] -#model2vec = ["model2vec>=0.3.0"] -#pylate = ["pylate>=1.3.1; python_version < '3.13'"] # Required for sentence-transformers 5.0.0 compatibility (otherwise 1.1.6), pylate requires voyager, which in turn requires <=3.12 -#bm25s = ["bm25s>=0.2.6", "PyStemmer>=2.2.0.3"] -#gritlm = ["gritlm>=1.0.2"] -#xformers = ["xformers>=0.0.29"] -#blip2 = ["salesforce-lavis>=1.0.2"] -#voyageai = ["voyageai>0.3.0,<2.0.0"] -#voyage_v = ["voyageai>0.3.0,<2.0.0", "tenacity>9.0.0"] -#cohere = ["cohere==5.14.0"] -#vertexai = ["vertexai==1.71.1"] -#llm2vec = ["llm2vec>=0.2.3,<0.3.0"] -#timm = ["timm>=1.0.15,<1.1.0"] -#open_clip_torch = ["open_clip_torch==2.31.0"] -#nomic = ["einops>=0.8.1"] -#ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] -#colpali_engine = 
["colpali_engine>=0.3.12"] -#colqwen3 = ["transformers>=4.57", "torchvision>=0.22.1"] -#xet = ["huggingface_hub>=0.32.0"] -#youtu = ["tencentcloud-sdk-python-common>=3.0.1454", "tencentcloud-sdk-python-lkeap>=3.0.1451"] -#llama-embed-nemotron = ["transformers==4.51.0"] -#faiss-cpu = ["faiss-cpu>=1.12.0"] -#eager_embed = ["qwen_vl_utils>=0.0.14"] +peft = ["peft>=0.11.0"] +flagembedding = ["FlagEmbedding==1.3.4"] +jina = ["einops>=0.8.0"] +jina-v4 = ["peft>=0.15.2", "transformers>=4.52.0", "torchvision>=0.22.1"] +flash_attention = ["flash-attn>=2.6.3"] +openai = ["openai>=1.41.0", "tiktoken>=0.8.0"] +model2vec = ["model2vec>=0.3.0"] +pylate = ["pylate>=1.3.1; python_version < '3.13'"] # Required for sentence-transformers 5.0.0 compatibility (otherwise 1.1.6), pylate requires voyager, which in turn requires <=3.12 +bm25s = ["bm25s>=0.2.6", "PyStemmer>=2.2.0.3"] +gritlm = ["gritlm>=1.0.2"] +xformers = ["xformers>=0.0.29"] +blip2 = ["salesforce-lavis>=1.0.2"] +voyageai = ["voyageai>0.3.0,<2.0.0"] +voyage_v = ["voyageai>0.3.0,<2.0.0", "tenacity>9.0.0"] +cohere = ["cohere==5.14.0"] +vertexai = ["vertexai==1.71.1"] +llm2vec = ["llm2vec>=0.2.3,<0.3.0"] +timm = ["timm>=1.0.15,<1.1.0"] +open_clip_torch = ["open_clip_torch==2.31.0"] +nomic = ["einops>=0.8.1"] +ark = ["volcengine-python-sdk[ark]==3.0.2", "tiktoken>=0.8.0"] +colpali_engine = ["colpali_engine>=0.3.12"] +colqwen3 = ["transformers>=4.57", "torchvision>=0.22.1"] +xet = ["huggingface_hub>=0.32.0"] +youtu = ["tencentcloud-sdk-python-common>=3.0.1454", "tencentcloud-sdk-python-lkeap>=3.0.1451"] +llama-embed-nemotron = ["transformers==4.51.0"] +faiss-cpu = ["faiss-cpu>=1.12.0"] +eager_embed = ["qwen_vl_utils>=0.0.14"] [dependency-groups] lint = [ From 794a32f1cf32c4621d57f5f0ce10018cb7494bf0 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 22 Dec 2025 21:48:59 +0300 Subject: [PATCH 16/32] activate PGH --- docs/mmteb/validate_points.py | 19 ++-- 
.../text/bitext_mining_evaluator.py | 2 +- .../text/summarization_evaluator.py | 2 +- mteb/abstasks/_statistics_calculation.py | 2 +- mteb/abstasks/_stratification.py | 13 ++- mteb/abstasks/abstask.py | 2 +- mteb/abstasks/aggregate_task_metadata.py | 4 +- mteb/abstasks/clustering.py | 4 +- mteb/abstasks/task_metadata.py | 4 +- mteb/abstasks/text/bitext_mining.py | 2 +- mteb/abstasks/text/reranking.py | 4 +- mteb/deprecated_evaluator.py | 4 +- mteb/leaderboard/app.py | 2 +- mteb/models/get_model_meta.py | 2 +- .../model_implementations/andersborges.py | 4 +- .../model_implementations/blip_models.py | 16 ++-- mteb/models/model_implementations/bm25.py | 2 +- .../model_implementations/clip_models.py | 6 +- .../model_implementations/cohere_models.py | 2 +- mteb/models/model_implementations/cohere_v.py | 4 +- .../model_implementations/dino_models.py | 46 +++++----- .../emillykkejensen_models.py | 6 +- .../models/model_implementations/jina_clip.py | 2 +- .../model_implementations/jina_models.py | 2 +- .../kennethenevoldsen_models.py | 4 +- .../model_implementations/llm2clip_models.py | 6 +- .../model_implementations/moco_models.py | 4 +- .../model_implementations/model2vec_models.py | 2 +- .../model_implementations/nomic_models.py | 16 ++-- .../model_implementations/openclip_models.py | 14 +-- .../model_implementations/random_baseline.py | 6 +- .../model_implementations/rasgaard_models.py | 2 +- .../model_implementations/repllama_models.py | 4 +- .../model_implementations/rerankers_custom.py | 6 +- .../rerankers_monot5_based.py | 6 +- .../model_implementations/siglip_models.py | 20 ++-- .../model_implementations/vlm2vec_models.py | 2 +- mteb/models/model_implementations/voyage_v.py | 8 +- mteb/models/model_meta.py | 2 +- mteb/results/benchmark_results.py | 8 +- mteb/results/model_result.py | 14 +-- mteb/results/task_result.py | 14 +-- .../dan/dk_hate_classification.py | 2 +- .../classification/est/estonian_valence.py | 2 +- .../multilingual/scala_classification.py | 2 +- 
.../eng/sugar_crepe.py | 2 +- mteb/tasks/retrieval/code/code_rag.py | 24 ++--- .../retrieval/dan/dan_fever_retrieval.py | 2 +- mteb/tasks/retrieval/dan/tv2_nordretrieval.py | 4 +- .../retrieval/dan/twitter_hjerne_retrieval.py | 4 +- mteb/tasks/retrieval/nob/norquad.py | 4 +- mteb/tasks/retrieval/nob/snl_retrieval.py | 4 +- mteb/tasks/retrieval/tur/tur_hist_quad.py | 2 +- pyproject.toml | 1 + tests/mock_models.py | 2 +- tests/mock_tasks.py | 92 +++++++++---------- tests/test_abstasks/test_task_metadata.py | 2 +- tests/test_evaluate.py | 2 +- tests/test_filter_tasks.py | 4 +- tests/test_get_tasks.py | 2 +- tests/test_tasks/test_task_quality.py | 2 +- 61 files changed, 226 insertions(+), 227 deletions(-) diff --git a/docs/mmteb/validate_points.py b/docs/mmteb/validate_points.py index 21b6fd3877..13bee8c047 100644 --- a/docs/mmteb/validate_points.py +++ b/docs/mmteb/validate_points.py @@ -1,6 +1,5 @@ import logging from pathlib import Path -from typing import Optional from jsonlines import Reader from pydantic import BaseModel, ConfigDict, Field, ValidationError, conint, constr @@ -21,17 +20,17 @@ class JsonObject(BaseModel): model_config = ConfigDict(extra="forbid") GitHub: constr(min_length=1) - new_dataset: Optional[conint(ge=1)] = Field(alias="New dataset", default=None) # noqa - new_task: Optional[conint(ge=2)] = Field(alias="New task", default=None) # noqa - dataset_annotations: Optional[conint(ge=1)] = Field( # noqa + new_dataset: conint(ge=1) | None = Field(alias="New dataset", default=None) + new_task: conint(ge=2) | None = Field(alias="New task", default=None) + dataset_annotations: conint(ge=1) | None = Field( alias="Dataset annotations", default=None ) - bug_fixes: Optional[conint(ge=1)] = Field(alias="Bug fixes", default=None) # noqa - running_models: Optional[conint(ge=1)] = Field(alias="Running Models", default=None) # noqa - review_pr: Optional[conint(ge=2)] = Field(alias="Review PR", default=None) # noqa - paper_writing: Optional[int] = 
Field(alias="Paper writing", default=None) # noqa - Ideation: Optional[int] = None # noqa - Coordination: Optional[int] = None # noqa + bug_fixes: conint(ge=1) | None = Field(alias="Bug fixes", default=None) + running_models: conint(ge=1) | None = Field(alias="Running Models", default=None) + review_pr: conint(ge=2) | None = Field(alias="Review PR", default=None) + paper_writing: int | None = Field(alias="Paper writing", default=None) + Ideation: int | None = None + Coordination: int | None = None def check_max_points(obj: JsonObject, commit_n: str): diff --git a/mteb/_evaluators/text/bitext_mining_evaluator.py b/mteb/_evaluators/text/bitext_mining_evaluator.py index 796d516ea1..2c5a2ee169 100644 --- a/mteb/_evaluators/text/bitext_mining_evaluator.py +++ b/mteb/_evaluators/text/bitext_mining_evaluator.py @@ -110,7 +110,7 @@ def _similarity_search( # Iterate over chunks of the corpus for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size): # Compute cosine similarities - similarity_scores = model.similarity( # type: ignore + similarity_scores = model.similarity( query_embeddings[ query_start_idx : query_start_idx + query_chunk_size ], diff --git a/mteb/_evaluators/text/summarization_evaluator.py b/mteb/_evaluators/text/summarization_evaluator.py index 5c6068d6eb..43c6cda2b4 100644 --- a/mteb/_evaluators/text/summarization_evaluator.py +++ b/mteb/_evaluators/text/summarization_evaluator.py @@ -164,7 +164,7 @@ def __call__( dot_scores = dot_score(emb_machine_summary, embs_human_summaries) _sim_score = [ - float(model.similarity(emb_machine_summary, emb_human_summary)) # type: ignore + float(model.similarity(emb_machine_summary, emb_human_summary)) for emb_human_summary in embs_human_summaries ] sim_score = torch.tensor(_sim_score) diff --git a/mteb/abstasks/_statistics_calculation.py b/mteb/abstasks/_statistics_calculation.py index eb6960059e..598d50af71 100644 --- a/mteb/abstasks/_statistics_calculation.py +++ 
b/mteb/abstasks/_statistics_calculation.py @@ -53,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics: seen_hashes: set[str] = set() for img in images: - width, height = img.size # type: ignore + width, height = img.size img_heights.append(height) img_widths.append(width) diff --git a/mteb/abstasks/_stratification.py b/mteb/abstasks/_stratification.py index a010a3a3a7..2f54e20af4 100644 --- a/mteb/abstasks/_stratification.py +++ b/mteb/abstasks/_stratification.py @@ -120,8 +120,8 @@ def _get_most_desired_combination(samples_with_combination: dict): if support_size == 0: continue if currently_chosen is None or ( - best_number_of_combinations < number_of_combinations # type: ignore - and best_support_size > support_size # type: ignore + best_number_of_combinations < number_of_combinations + and best_support_size > support_size ): currently_chosen = combination best_number_of_combinations, best_support_size = ( @@ -163,7 +163,7 @@ def __init__( self._rng_state = check_random_state(random_state) need_shuffle = shuffle or random_state is not None self.order = order - super().__init__( # type: ignore + super().__init__( n_splits, shuffle=need_shuffle, random_state=self._rng_state if need_shuffle else None, @@ -173,8 +173,7 @@ def __init__( self.percentage_per_fold = sample_distribution_per_fold else: self.percentage_per_fold = [ - 1 / float(self.n_splits) - for _ in range(self.n_splits) # type: ignore + 1 / float(self.n_splits) for _ in range(self.n_splits) ] def _prepare_stratification( @@ -207,14 +206,14 @@ def _prepare_stratification( """ self.n_samples, self.n_labels = y.shape self.desired_samples_per_fold = np.array( - [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)] # type: ignore + [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)] ) rows = sp.lil_matrix(y).rows rows_used = dict.fromkeys(range(self.n_samples), False) all_combinations = [] per_row_combinations: list[list[Any]] 
= [[] for i in range(self.n_samples)] samples_with_combination: dict[str, list[Any]] = {} - folds = [[] for _ in range(self.n_splits)] # type: ignore + folds = [[] for _ in range(self.n_splits)] # for every row for sample_index, label_assignment in enumerate(rows): diff --git a/mteb/abstasks/abstask.py b/mteb/abstasks/abstask.py index 5b4d63f1d5..0bf4beb884 100644 --- a/mteb/abstasks/abstask.py +++ b/mteb/abstasks/abstask.py @@ -326,7 +326,7 @@ def load_data(self) -> None: ) else: # some of monolingual datasets explicitly adding the split name to the dataset name - self.dataset = load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True diff --git a/mteb/abstasks/aggregate_task_metadata.py b/mteb/abstasks/aggregate_task_metadata.py index 97a38c8268..2d90ba8ac8 100644 --- a/mteb/abstasks/aggregate_task_metadata.py +++ b/mteb/abstasks/aggregate_task_metadata.py @@ -65,9 +65,9 @@ def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]: """Return a dictionary mapping huggingface subsets to languages.""" if isinstance(self.eval_langs, dict): return self.eval_langs - return {"default": self.eval_langs} # type: ignore + return {"default": self.eval_langs} - @model_validator(mode="after") # type: ignore + @model_validator(mode="after") def _compute_unfilled_cases(self) -> Self: if not self.eval_langs: self.eval_langs = self._compute_eval_langs() diff --git a/mteb/abstasks/clustering.py b/mteb/abstasks/clustering.py index ca603a3be4..0dbf122489 100644 --- a/mteb/abstasks/clustering.py +++ b/mteb/abstasks/clustering.py @@ -184,11 +184,11 @@ def _evaluate_subset( else: max_documents_to_embed = self.max_document_to_embed - max_documents_to_embed = min(len(data_split), max_documents_to_embed) # type: ignore + max_documents_to_embed = min(len(data_split), max_documents_to_embed) example_indices = self.rng_state.sample( range(len(data_split)), 
k=max_documents_to_embed ) - downsampled_dataset = data_split.select(example_indices) # type: ignore + downsampled_dataset = data_split.select(example_indices) downsampled_dataset = downsampled_dataset.select_columns( [self.input_column_name, self.label_column_name] diff --git a/mteb/abstasks/task_metadata.py b/mteb/abstasks/task_metadata.py index 59c7c4493e..b4e4a3c59c 100644 --- a/mteb/abstasks/task_metadata.py +++ b/mteb/abstasks/task_metadata.py @@ -368,7 +368,7 @@ def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]: """Return a dictionary mapping huggingface subsets to languages.""" if isinstance(self.eval_langs, dict): return self.eval_langs - return {"default": self.eval_langs} # type: ignore + return {"default": self.eval_langs} @property def intext_citation(self, include_cite: bool = True) -> str: @@ -417,7 +417,7 @@ def n_samples(self) -> dict[str, int] | None: for subset, subset_value in stats.items(): if subset == "hf_subset_descriptive_stats": continue - n_samples[subset] = subset_value["num_samples"] # type: ignore + n_samples[subset] = subset_value["num_samples"] return n_samples @property diff --git a/mteb/abstasks/text/bitext_mining.py b/mteb/abstasks/text/bitext_mining.py index ecbdb05b5f..1e3f302013 100644 --- a/mteb/abstasks/text/bitext_mining.py +++ b/mteb/abstasks/text/bitext_mining.py @@ -167,7 +167,7 @@ def _evaluate_subset( # type: ignore[override] evaluator = BitextMiningEvaluator( data_split, task_metadata=self.metadata, - pair_columns=pairs, # type: ignore + pair_columns=pairs, hf_split=hf_split, hf_subset=hf_subset, **kwargs, diff --git a/mteb/abstasks/text/reranking.py b/mteb/abstasks/text/reranking.py index e675a11084..f142b8a63e 100644 --- a/mteb/abstasks/text/reranking.py +++ b/mteb/abstasks/text/reranking.py @@ -117,14 +117,14 @@ def transform_old_dataset_format(self, given_dataset: Dataset | None = None): if hf_subset in cur_dataset: cur_dataset = cur_dataset[hf_subset] elif "name" in 
self.metadata.dataset: - cur_dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + cur_dataset = datasets.load_dataset(**self.metadata.dataset) assert hf_subset == "default", ( f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata." ) else: cur_dataset = datasets.load_dataset( **self.metadata.dataset, name=hf_subset - ) # type: ignore + ) for split in cur_dataset: corpus = [] diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index a734d2fa4e..fe8d160bbb 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -572,7 +572,7 @@ def run( def create_model_meta(model: MTEBModels) -> ModelMeta: """Create a ModelMeta object for the given model.""" if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None: - meta = model.mteb_model_meta # type: ignore + meta = model.mteb_model_meta else: meta = MTEB._get_model_meta(model) @@ -598,7 +598,7 @@ def _create_output_folder( if output_folder is None: return None - model_revision: str = model_meta.revision # type: ignore + model_revision: str = model_meta.revision model_path_name = model_meta.model_name_as_path() output_path = Path(output_folder) / model_path_name / model_revision diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 7d717b2ae5..7ada5d9e8f 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -157,7 +157,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame: df = df.drop(columns="reference") return gr.DataFrame( df, - datatype=["markdown"] + ["str"] * (len(df.columns) - 1), # type: ignore + datatype=["markdown"] + ["str"] * (len(df.columns) - 1), buttons=["copy", "fullscreen"], show_search="filter", ) diff --git a/mteb/models/get_model_meta.py b/mteb/models/get_model_meta.py index 37a52b56b4..67dcdd056b 100644 --- a/mteb/models/get_model_meta.py +++ b/mteb/models/get_model_meta.py @@ -93,7 +93,7 @@ def get_model( meta = get_model_meta(model_name, 
revision) model = meta.load_model(**kwargs) - model.mteb_model_meta = meta # type: ignore + model.mteb_model_meta = meta return model diff --git a/mteb/models/model_implementations/andersborges.py b/mteb/models/model_implementations/andersborges.py index b3ea645fb7..029b59e3d4 100644 --- a/mteb/models/model_implementations/andersborges.py +++ b/mteb/models/model_implementations/andersborges.py @@ -4,7 +4,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction model2vecdk = ModelMeta( - loader=Model2VecModel, # type: ignore + loader=Model2VecModel, name="andersborges/model2vecdk", languages=["dan-Latn"], open_weights=True, @@ -34,7 +34,7 @@ model2vecdk_stem = ModelMeta( - loader=Model2VecModel, # type: ignore + loader=Model2VecModel, name="andersborges/model2vecdk-stem", languages=["dan-Latn"], open_weights=True, diff --git a/mteb/models/model_implementations/blip_models.py b/mteb/models/model_implementations/blip_models.py index ce68b80062..e0a59d8b33 100644 --- a/mteb/models/model_implementations/blip_models.py +++ b/mteb/models/model_implementations/blip_models.py @@ -128,7 +128,7 @@ def encode( # in descending order of usage (downloads from huggingface) blip_image_captioning_large = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-image-captioning-large", languages=["eng-Latn"], revision="2227ac38c9f16105cb0412e7cab4759978a8fd90", @@ -155,7 +155,7 @@ def encode( ) blip_image_captioning_base = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-image-captioning-base", languages=["eng-Latn"], revision="89b09ea1789f7addf2f6d6f0dfc4ce10ab58ef84", @@ -183,7 +183,7 @@ def encode( blip_vqa_base = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-vqa-base", languages=["eng-Latn"], revision="c7df8e7cd7aa2ee9af18f56e2b29e59a92651b64", @@ -209,7 +209,7 @@ def encode( ) blip_vqa_capfilt_large = ModelMeta( - loader=BLIPModel, # type: ignore + 
loader=BLIPModel, name="Salesforce/blip-vqa-capfilt-large", languages=["eng-Latn"], revision="e53f95265aeab69013fabb5380500ab984adbbb4", @@ -235,7 +235,7 @@ def encode( ) blip_itm_base_coco = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-itm-base-coco", languages=["eng-Latn"], revision="7eaa90c11850c0b17fc38c6a11e7d88bd6ac231f", @@ -261,7 +261,7 @@ def encode( ) blip_itm_large_coco = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-itm-large-coco", languages=["eng-Latn"], revision="fef05cafc05298067cbbca00b125749394a77a6f", @@ -288,7 +288,7 @@ def encode( ) blip_itm_base_flickr = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-itm-base-flickr", languages=["eng-Latn"], revision="1de29e660d91ae1786c1876212ea805a22eab251", @@ -315,7 +315,7 @@ def encode( ) blip_itm_large_flickr = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-itm-large-flickr", languages=["eng-Latn"], revision="bda12e6506758f54261b5ab174b2c55a3ba143fb", diff --git a/mteb/models/model_implementations/bm25.py b/mteb/models/model_implementations/bm25.py index 32ae883955..8a825275e8 100644 --- a/mteb/models/model_implementations/bm25.py +++ b/mteb/models/model_implementations/bm25.py @@ -113,7 +113,7 @@ def search( def encode(self, texts: list[str]): """Encode input text as term vectors""" - return bm25s.tokenize(texts, stopwords=self.stopwords, stemmer=self.stemmer) # type: ignore + return bm25s.tokenize(texts, stopwords=self.stopwords, stemmer=self.stemmer) return BM25Search(**kwargs) diff --git a/mteb/models/model_implementations/clip_models.py b/mteb/models/model_implementations/clip_models.py index 026f16a868..50ece8b0ea 100644 --- a/mteb/models/model_implementations/clip_models.py +++ b/mteb/models/model_implementations/clip_models.py @@ -115,7 +115,7 @@ def encode( clip_vit_large_patch14 = ModelMeta( - loader=CLIPModel, # type: ignore + 
loader=CLIPModel, name="openai/clip-vit-large-patch14", languages=["eng-Latn"], revision="32bd64288804d66eefd0ccbe215aa642df71cc41", @@ -138,7 +138,7 @@ def encode( ) clip_vit_base_patch32 = ModelMeta( - loader=CLIPModel, # type: ignore + loader=CLIPModel, name="openai/clip-vit-base-patch32", languages=["eng-Latn"], revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268", @@ -161,7 +161,7 @@ def encode( ) clip_vit_base_patch16 = ModelMeta( - loader=CLIPModel, # type: ignore + loader=CLIPModel, name="openai/clip-vit-base-patch16", languages=["eng-Latn"], revision="57c216476eefef5ab752ec549e440a49ae4ae5f3", diff --git a/mteb/models/model_implementations/cohere_models.py b/mteb/models/model_implementations/cohere_models.py index 59441aefe9..415199e72f 100644 --- a/mteb/models/model_implementations/cohere_models.py +++ b/mteb/models/model_implementations/cohere_models.py @@ -222,7 +222,7 @@ def __init__( ) -> None: requires_package(self, "cohere", model_name, "pip install 'mteb[cohere]'") - import cohere # type: ignore + import cohere self.model_name = model_name.removeprefix("Cohere/Cohere-") self.sep = sep diff --git a/mteb/models/model_implementations/cohere_v.py b/mteb/models/model_implementations/cohere_v.py index e6b5e3aea3..66368b7119 100644 --- a/mteb/models/model_implementations/cohere_v.py +++ b/mteb/models/model_implementations/cohere_v.py @@ -378,7 +378,7 @@ def encode( cohere_mult_3 = ModelMeta( - loader=cohere_v_loader, # type: ignore + loader=cohere_v_loader, loader_kwargs={"model_name": "embed-multilingual-v3.0"}, name="cohere/embed-multilingual-v3.0", languages=[], # Unknown, but support >100 languages @@ -401,7 +401,7 @@ def encode( ) cohere_eng_3 = ModelMeta( - loader=cohere_v_loader, # type: ignore + loader=cohere_v_loader, loader_kwargs={"model_name": "embed-english-v3.0"}, name="cohere/embed-english-v3.0", languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/dino_models.py b/mteb/models/model_implementations/dino_models.py index 
bafce62fd7..01a9500343 100644 --- a/mteb/models/model_implementations/dino_models.py +++ b/mteb/models/model_implementations/dino_models.py @@ -104,7 +104,7 @@ def encode( dinov2_small = ModelMeta( - loader=DINOModel, # type: ignore + loader=DINOModel, name="facebook/dinov2-small", languages=["eng-Latn"], revision="ed25f3a31f01632728cabb09d1542f84ab7b0056", @@ -124,7 +124,7 @@ def encode( use_instructions=False, training_datasets=dinov2_training_datasets, citation="""@misc{oquab2023dinov2, - title={DINOv2: Learning Robust Visual Features without Supervision}, + title={DINOv2: Learning Robust Visual Features without Supervision}, author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski}, year={2023}, eprint={2304.07193}, @@ -134,7 +134,7 @@ def encode( ) dinov2_base = ModelMeta( - loader=DINOModel, # type: ignore + loader=DINOModel, name="facebook/dinov2-base", languages=["eng-Latn"], revision="f9e44c814b77203eaa57a6bdbbd535f21ede1415", @@ -154,7 +154,7 @@ def encode( use_instructions=False, training_datasets=dinov2_training_datasets, citation="""@misc{oquab2023dinov2, - title={DINOv2: Learning Robust Visual Features without Supervision}, + title={DINOv2: Learning Robust Visual Features without Supervision}, author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu 
Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski}, year={2023}, eprint={2304.07193}, @@ -164,7 +164,7 @@ def encode( ) dinov2_large = ModelMeta( - loader=DINOModel, # type: ignore + loader=DINOModel, name="facebook/dinov2-large", languages=["eng-Latn"], revision="47b73eefe95e8d44ec3623f8890bd894b6ea2d6c", @@ -184,7 +184,7 @@ def encode( use_instructions=False, training_datasets=dinov2_training_datasets, citation="""@misc{oquab2023dinov2, - title={DINOv2: Learning Robust Visual Features without Supervision}, + title={DINOv2: Learning Robust Visual Features without Supervision}, author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski}, year={2023}, eprint={2304.07193}, @@ -194,7 +194,7 @@ def encode( ) dinov2_giant = ModelMeta( - loader=DINOModel, # type: ignore + loader=DINOModel, name="facebook/dinov2-giant", languages=["eng-Latn"], revision="611a9d42f2335e0f921f1e313ad3c1b7178d206d", @@ -214,7 +214,7 @@ def encode( use_instructions=False, training_datasets=dinov2_training_datasets, citation="""@misc{oquab2023dinov2, - title={DINOv2: Learning Robust Visual Features without Supervision}, + title={DINOv2: Learning Robust Visual Features without Supervision}, author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra 
and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski}, year={2023}, eprint={2304.07193}, @@ -248,7 +248,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -278,7 +278,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -308,7 +308,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -338,7 +338,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv 
Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -368,7 +368,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -398,7 +398,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -428,7 +428,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -458,7 +458,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen 
and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -489,7 +489,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -519,7 +519,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -549,7 +549,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -579,7 +579,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas 
and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -609,7 +609,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -639,7 +639,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -669,7 +669,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, diff --git a/mteb/models/model_implementations/emillykkejensen_models.py b/mteb/models/model_implementations/emillykkejensen_models.py index 7b82bb637f..a6ee4b93c2 100644 --- a/mteb/models/model_implementations/emillykkejensen_models.py +++ b/mteb/models/model_implementations/emillykkejensen_models.py @@ -2,7 +2,7 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader embedding_gemma_300m_scandi = ModelMeta( - 
loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, name="emillykkejensen/EmbeddingGemma-Scandi-300m", languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], open_weights=True, @@ -34,7 +34,7 @@ qwen_scandi = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, name="emillykkejensen/Qwen3-Embedding-Scandi-0.6B", languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], open_weights=True, @@ -57,7 +57,7 @@ mmbert_scandi = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, name="emillykkejensen/mmBERTscandi-base-embedding", languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], open_weights=True, diff --git a/mteb/models/model_implementations/jina_clip.py b/mteb/models/model_implementations/jina_clip.py index 3c306fb64c..8f16be6983 100644 --- a/mteb/models/model_implementations/jina_clip.py +++ b/mteb/models/model_implementations/jina_clip.py @@ -121,7 +121,7 @@ def encode( jina_clip_v1 = ModelMeta( - loader=JinaCLIPModel, # type: ignore + loader=JinaCLIPModel, name="jinaai/jina-clip-v1", languages=["eng-Latn"], revision="06150c7c382d7a4faedc7d5a0d8cdb59308968f4", diff --git a/mteb/models/model_implementations/jina_models.py b/mteb/models/model_implementations/jina_models.py index 1e87b78d47..b40ce5ae18 100644 --- a/mteb/models/model_implementations/jina_models.py +++ b/mteb/models/model_implementations/jina_models.py @@ -794,7 +794,7 @@ def get_programming_task_override( jina_embeddings_v3 = ModelMeta( - loader=JinaWrapper, # type: ignore + loader=JinaWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts={ diff --git a/mteb/models/model_implementations/kennethenevoldsen_models.py b/mteb/models/model_implementations/kennethenevoldsen_models.py index 9f811fd174..4f4c6a7cea 100644 --- a/mteb/models/model_implementations/kennethenevoldsen_models.py +++ 
b/mteb/models/model_implementations/kennethenevoldsen_models.py @@ -4,7 +4,7 @@ ) dfm_enc_large = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, name="KennethEnevoldsen/dfm-sentence-encoder-large", languages=["dan-Latn"], open_weights=True, @@ -38,7 +38,7 @@ ) dfm_enc_med = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, name="KennethEnevoldsen/dfm-sentence-encoder-medium", languages=["dan-Latn"], open_weights=True, diff --git a/mteb/models/model_implementations/llm2clip_models.py b/mteb/models/model_implementations/llm2clip_models.py index f2123c600e..0f2f821e99 100644 --- a/mteb/models/model_implementations/llm2clip_models.py +++ b/mteb/models/model_implementations/llm2clip_models.py @@ -181,7 +181,7 @@ def encode( ) llm2clip_openai_l_14_336 = ModelMeta( - loader=llm2clip_loader, # type: ignore + loader=llm2clip_loader, name="microsoft/LLM2CLIP-Openai-L-14-336", languages=["eng-Latn"], revision="92512331f393a003c3d98404677f991c188162c9", @@ -205,7 +205,7 @@ def encode( # NOTE: https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-224/discussions/1 llm2clip_openai_l_14_224 = ModelMeta( - loader=llm2clip_loader, # type: ignore + loader=llm2clip_loader, name="microsoft/LLM2CLIP-Openai-L-14-224", languages=["eng-Latn"], revision="6b8a11a94ff380fa220dfefe73ac9293d2677575", @@ -228,7 +228,7 @@ def encode( ) llm2clip_openai_b_16 = ModelMeta( - loader=llm2clip_loader, # type: ignore + loader=llm2clip_loader, name="microsoft/LLM2CLIP-Openai-B-16", languages=["eng-Latn"], revision="ecfb347eb3dcfeb2fbc2a2eae7de6ac5a001aaf8", diff --git a/mteb/models/model_implementations/moco_models.py b/mteb/models/model_implementations/moco_models.py index 761df8bf95..082b669fdc 100644 --- a/mteb/models/model_implementations/moco_models.py +++ b/mteb/models/model_implementations/moco_models.py @@ -117,7 +117,7 @@ def encode( ) mocov3_vit_base = ModelMeta( - 
loader=mocov3_loader, # type: ignore + loader=mocov3_loader, name="nyu-visionx/moco-v3-vit-b", languages=["eng-Latn"], revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d", @@ -140,7 +140,7 @@ def encode( ) mocov3_vit_large = ModelMeta( - loader=mocov3_loader, # type: ignore + loader=mocov3_loader, name="nyu-visionx/moco-v3-vit-l", languages=["eng-Latn"], revision="7bf75358d616f39b9716148bf4e3425f3bd35b47", diff --git a/mteb/models/model_implementations/model2vec_models.py b/mteb/models/model_implementations/model2vec_models.py index f0ce608aa3..b6aa5cf21b 100644 --- a/mteb/models/model_implementations/model2vec_models.py +++ b/mteb/models/model_implementations/model2vec_models.py @@ -139,7 +139,7 @@ def __init__( **kwargs: Additional arguments to pass to the wrapper. """ requires_package(self, "model2vec", model_name, "pip install 'mteb[model2vec]'") - from model2vec import StaticModel # type: ignore + from model2vec import StaticModel self.model_name = model_name self.model = StaticModel.from_pretrained(self.model_name) diff --git a/mteb/models/model_implementations/nomic_models.py b/mteb/models/model_implementations/nomic_models.py index 282df8ebd8..19bd4792d4 100644 --- a/mteb/models/model_implementations/nomic_models.py +++ b/mteb/models/model_implementations/nomic_models.py @@ -193,7 +193,7 @@ def encode( """ nomic_embed_v1_5 = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -221,7 +221,7 @@ def encode( ) nomic_embed_v1 = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -249,7 +249,7 @@ def encode( ) nomic_embed_v1_ablated = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -276,7 +276,7 @@ def encode( ) nomic_embed_v1_unsupervised = ModelMeta( - loader=NomicWrapper, # 
type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -329,7 +329,7 @@ def encode( training_datasets=nomic_training_data, public_training_data=None, citation="""@misc{nussbaum2024nomic, - title={Nomic Embed: Training a Reproducible Long Context Text Embedder}, + title={Nomic Embed: Training a Reproducible Long Context Text Embedder}, author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar}, year={2024}, eprint={2402.01613}, @@ -441,7 +441,7 @@ def encode( ] nomic_embed_text_v2_moe = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -466,12 +466,12 @@ def encode( training_datasets=None, # did not look into this further superseded_by=None, citation="""@misc{nussbaum2025trainingsparsemixtureexperts, - title={Training Sparse Mixture Of Experts Text Embedding Models}, + title={Training Sparse Mixture Of Experts Text Embedding Models}, author={Zach Nussbaum and Brandon Duderstadt}, year={2025}, eprint={2502.07972}, archivePrefix={arXiv}, primaryClass={cs.CL}, - url={https://arxiv.org/abs/2502.07972}, + url={https://arxiv.org/abs/2502.07972}, }""", ) diff --git a/mteb/models/model_implementations/openclip_models.py b/mteb/models/model_implementations/openclip_models.py index 1fa695b3cc..fc629b30fb 100644 --- a/mteb/models/model_implementations/openclip_models.py +++ b/mteb/models/model_implementations/openclip_models.py @@ -120,7 +120,7 @@ def encode( CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K", languages=["eng-Latn"], revision="84c9828e63dc9a9351d1fe637c346d4c1c4db341", @@ -145,7 +145,7 @@ def encode( ) CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K", 
languages=["eng-Latn"], revision="f0e2ffa09cbadab3db6a261ec1ec56407ce42912", @@ -170,7 +170,7 @@ def encode( ) CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K", languages=["eng-Latn"], revision="d110532e8d4ff91c574ee60a342323f28468b287", @@ -195,7 +195,7 @@ def encode( ) CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", languages=["eng-Latn"], revision="bc7788f151930d91b58474715fdce5524ad9a189", @@ -220,7 +220,7 @@ def encode( ) CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K", languages=["eng-Latn"], revision="15efd0f6ac0c40c0f9da7becca03c974d7012604", @@ -245,7 +245,7 @@ def encode( ) CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K", languages=["eng-Latn"], revision="de081ac0a0ca8dc9d1533eed1ae884bb8ae1404b", @@ -270,7 +270,7 @@ def encode( ) CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K", languages=["eng-Latn"], revision="1627032197142fbe2a7cfec626f4ced3ae60d07a", diff --git a/mteb/models/model_implementations/random_baseline.py b/mteb/models/model_implementations/random_baseline.py index 46247c4a2e..4adb670300 100644 --- a/mteb/models/model_implementations/random_baseline.py +++ b/mteb/models/model_implementations/random_baseline.py @@ -68,7 +68,7 @@ def _image_to_vector(image: Image.Image, size: int) -> np.ndarray: license="mit", max_tokens=np.inf, reference=None, - similarity_fn_name="cosine", # type: ignore + similarity_fn_name="cosine", framework=[], use_instructions=False, public_training_code=None, # No 
training code, as this is a random baseline @@ -187,7 +187,7 @@ def similarity_pairwise( random_encoder_baseline = ModelMeta( - loader=RandomEncoderBaseline, # type: ignore + loader=RandomEncoderBaseline, name="baseline/random-encoder-baseline", modalities=["text", "image"], **_common_mock_metadata, @@ -231,7 +231,7 @@ def predict( random_cross_encoder_baseline = ModelMeta( - loader=RandomCrossEncoderBaseline, # type: ignore + loader=RandomCrossEncoderBaseline, name="baseline/random-cross-encoder-baseline", modalities=["text", "image"], is_cross_encoder=True, diff --git a/mteb/models/model_implementations/rasgaard_models.py b/mteb/models/model_implementations/rasgaard_models.py index 00e84130c4..1090a1f34d 100644 --- a/mteb/models/model_implementations/rasgaard_models.py +++ b/mteb/models/model_implementations/rasgaard_models.py @@ -4,7 +4,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction potion_base_8m = ModelMeta( - loader=Model2VecModel, # type: ignore + loader=Model2VecModel, name="rasgaard/m2v-dfm-large", languages=["dan-Latn"], open_weights=True, diff --git a/mteb/models/model_implementations/repllama_models.py b/mteb/models/model_implementations/repllama_models.py index 6695265756..8a5ba04ef4 100644 --- a/mteb/models/model_implementations/repllama_models.py +++ b/mteb/models/model_implementations/repllama_models.py @@ -154,7 +154,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: """ repllama_llama2_original = ModelMeta( - loader=RepLLaMAModel, # type: ignore + loader=RepLLaMAModel, loader_kwargs=dict( base_model_name_or_path="meta-llama/Llama-2-7b-hf", device_map="auto", @@ -186,7 +186,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: repllama_llama2_reproduced = ModelMeta( - loader=RepLLaMAModel, # type: ignore + loader=RepLLaMAModel, loader_kwargs=dict( base_model_name_or_path="meta-llama/Llama-2-7b-hf", device_map="auto", diff --git a/mteb/models/model_implementations/rerankers_custom.py 
b/mteb/models/model_implementations/rerankers_custom.py index 1272e35de5..51bc018129 100644 --- a/mteb/models/model_implementations/rerankers_custom.py +++ b/mteb/models/model_implementations/rerankers_custom.py @@ -214,7 +214,7 @@ def predict( monobert_large = ModelMeta( - loader=MonoBERTReranker, # type: ignore + loader=MonoBERTReranker, loader_kwargs=dict( fp_options="float16", ), @@ -239,7 +239,7 @@ def predict( # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28 jina_reranker_multilingual = ModelMeta( - loader=JinaReranker, # type: ignore + loader=JinaReranker, loader_kwargs=dict( fp_options="float16", ), @@ -263,7 +263,7 @@ def predict( ) bge_reranker_v2_m3 = ModelMeta( - loader=BGEReranker, # type: ignore + loader=BGEReranker, loader_kwargs=dict( fp_options="float16", ), diff --git a/mteb/models/model_implementations/rerankers_monot5_based.py b/mteb/models/model_implementations/rerankers_monot5_based.py index f51b544714..41b7e5b487 100644 --- a/mteb/models/model_implementations/rerankers_monot5_based.py +++ b/mteb/models/model_implementations/rerankers_monot5_based.py @@ -343,7 +343,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) monot5_base = ModelMeta( - loader=MonoT5Reranker, # type: ignore + loader=MonoT5Reranker, loader_kwargs=dict( fp_options="float16", ), @@ -442,7 +442,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) flant5_base = ModelMeta( - loader=FLANT5Reranker, # type: ignore + loader=FLANT5Reranker, loader_kwargs=dict( fp_options="float16", ), @@ -902,7 +902,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) mt5_13b_mmarco_100k = ModelMeta( - loader=MonoT5Reranker, # type: ignore + loader=MonoT5Reranker, loader_kwargs=dict( fp_options="float16", ), diff --git a/mteb/models/model_implementations/siglip_models.py b/mteb/models/model_implementations/siglip_models.py index 82b716ce9e..512d7bbc1e 100644 --- a/mteb/models/model_implementations/siglip_models.py +++ 
b/mteb/models/model_implementations/siglip_models.py @@ -123,7 +123,7 @@ def encode( ) siglip_so400m_patch14_224 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-so400m-patch14-224", languages=["eng-Latn"], revision="d04cf29fca7b6374f74d8bea1969314492266b5e", @@ -146,7 +146,7 @@ def encode( ) siglip_so400m_patch14_384 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-so400m-patch14-384", languages=["eng-Latn"], revision="9fdffc58afc957d1a03a25b10dba0329ab15c2a3", @@ -169,7 +169,7 @@ def encode( ) siglip_so400m_patch16_256_i18n = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-so400m-patch16-256-i18n", languages=["eng-Latn"], revision="365d321c0cfdea96bc28e3a29787a11a062681a1", @@ -192,7 +192,7 @@ def encode( ) siglip_base_patch16_256_multilingual = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-256-multilingual", languages=["eng-Latn"], revision="8952a4eafcde3cb7ab46b1dd629b33f8784ca9c6", @@ -215,7 +215,7 @@ def encode( ) siglip_base_patch16_256 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-256", languages=["eng-Latn"], revision="b078df89e446d623010d890864d4207fe6399f61", @@ -238,7 +238,7 @@ def encode( ) siglip_base_patch16_512 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-512", languages=["eng-Latn"], revision="753a949581523b60257d93e18391e8c27f72eb22", @@ -261,7 +261,7 @@ def encode( ) siglip_base_patch16_384 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-384", languages=["eng-Latn"], revision="41aec1c83b32e0a6fca20ad88ba058aa5b5ea394", @@ -284,7 +284,7 @@ def encode( ) siglip_base_patch16_224 = ModelMeta( - 
loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-224", languages=["eng-Latn"], revision="7fd15f0689c79d79e38b1c2e2e2370a7bf2761ed", @@ -307,7 +307,7 @@ def encode( ) siglip_large_patch16_256 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-large-patch16-256", languages=["eng-Latn"], revision="d0da9f876e7d66b4e250cd2450c3ba2ce735e447", @@ -330,7 +330,7 @@ def encode( ) siglip_large_patch16_384 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-large-patch16-384", languages=["eng-Latn"], revision="ce005573a40965dfd21fd937fbdeeebf2439fc35", diff --git a/mteb/models/model_implementations/vlm2vec_models.py b/mteb/models/model_implementations/vlm2vec_models.py index 55273fa2ce..fb0b4d274d 100644 --- a/mteb/models/model_implementations/vlm2vec_models.py +++ b/mteb/models/model_implementations/vlm2vec_models.py @@ -41,7 +41,7 @@ def __init__( model_name, "pip install flash-attn --no-build-isolation", ): - import flash_attn # noqa + pass requires_package(self, "peft", model_name, "pip install 'mteb[peft]'") from peft import LoraConfig, PeftModel diff --git a/mteb/models/model_implementations/voyage_v.py b/mteb/models/model_implementations/voyage_v.py index 6386bc2d06..56aa5aa136 100644 --- a/mteb/models/model_implementations/voyage_v.py +++ b/mteb/models/model_implementations/voyage_v.py @@ -40,15 +40,15 @@ def _downsample_image( logging.info( f"Downsampling image from {width}x{height} to {new_width}x{new_height}" ) - return image.resize(new_size, Image.LANCZOS) # type: ignore + return image.resize(new_size, Image.LANCZOS) if width > height: if width > 10000: logging.error("Processing extremely wide images.") - return image.resize((10000, height), Image.LANCZOS) # type: ignore + return image.resize((10000, height), Image.LANCZOS) else: if height > 10000: logging.error("Processing extremely high images.") - 
return image.resize((width, 10000), Image.LANCZOS) # type: ignore + return image.resize((width, 10000), Image.LANCZOS) return image @@ -202,7 +202,7 @@ def encode( voyage_v = ModelMeta( - loader=voyage_v_loader, # type: ignore + loader=voyage_v_loader, name="voyageai/voyage-multimodal-3", languages=[], # Unknown revision="1", diff --git a/mteb/models/model_meta.py b/mteb/models/model_meta.py index fdedf92a08..3cecccf7ad 100644 --- a/mteb/models/model_meta.py +++ b/mteb/models/model_meta.py @@ -222,7 +222,7 @@ def load_model(self, **kwargs: Any) -> MTEBModels: _kwargs.update(kwargs) model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs) - model.mteb_model_meta = self # type: ignore + model.mteb_model_meta = self return model def model_name_as_path(self) -> str: diff --git a/mteb/results/benchmark_results.py b/mteb/results/benchmark_results.py index aecd6abc81..5291b64788 100644 --- a/mteb/results/benchmark_results.py +++ b/mteb/results/benchmark_results.py @@ -59,7 +59,7 @@ def _filter_tasks( task_names: list[str] | None = None, languages: list[str] | None = None, domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore + task_types: list[TaskType] | None = None, modalities: list[Modalities] | None = None, is_public: bool | None = None, ) -> BenchmarkResults: @@ -231,8 +231,8 @@ def keep_best(group: pd.DataFrame) -> pd.DataFrame: model_to_main_revision = { meta.name: meta.revision for meta in get_model_metas() } - task_df["main_revision"] = task_df["model"].map(model_to_main_revision) # type: ignore - task_df["mteb_version"] = task_df["mteb_version"].map(parse_version) # type: ignore + task_df["main_revision"] = task_df["model"].map(model_to_main_revision) + task_df["mteb_version"] = task_df["mteb_version"].map(parse_version) task_df = ( task_df.groupby(["model", "task_name"]) .apply(keep_best) @@ -273,7 +273,7 @@ def _get_scores( { "model": model_res.model_name, "revision": model_res.model_revision, 
- **model_scores, # type: ignore + **model_scores, } ) except Exception as e: diff --git a/mteb/results/model_result.py b/mteb/results/model_result.py index 63432dc450..5223703a98 100644 --- a/mteb/results/model_result.py +++ b/mteb/results/model_result.py @@ -183,15 +183,15 @@ def _get_scores( try: if use_fast: scores[res.task_name] = res._get_score_fast( - splits=splits, # type: ignore - languages=languages, # type: ignore + splits=splits, + languages=languages, ) else: scores[res.task_name] = res.get_score( splits=splits, languages=languages, - aggregation=aggregation, # type: ignore - getter=getter, # type: ignore + aggregation=aggregation, + getter=getter, scripts=scripts, ) except Exception as e: @@ -206,14 +206,14 @@ def _get_scores( if use_fast: score = task_res._get_score_fast( splits=splits, - languages=languages, # type: ignore + languages=languages, ) else: score = task_res.get_score( splits=splits, languages=languages, - aggregation=aggregation, # type: ignore - getter=getter, # type: ignore + aggregation=aggregation, + getter=getter, scripts=scripts, ) entry = dict( diff --git a/mteb/results/task_result.py b/mteb/results/task_result.py index 8c8ba44b01..966af0c86c 100644 --- a/mteb/results/task_result.py +++ b/mteb/results/task_result.py @@ -42,7 +42,7 @@ class Criteria(HelpfulStrEnum): class ScalaNbClassificationDummy: """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( # type: ignore + metadata = Namespace( name="ScalaNbClassification", main_score="accuracy", type="Classification", @@ -57,7 +57,7 @@ class ScalaNbClassificationDummy: class ScalaNnClassificationDummy: """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( # type: ignore + metadata = Namespace( name="ScalaNnClassification", main_score="accuracy", type="Classification", @@ -72,7 +72,7 @@ class ScalaNnClassificationDummy: class ScalaDaClassificationDummy: """A dummy task for loading historic results from before 
v1.11.0""" - metadata = Namespace( # type: ignore + metadata = Namespace( name="ScalaDaClassification", main_score="accuracy", type="Classification", @@ -87,7 +87,7 @@ class ScalaDaClassificationDummy: class ScalaSvClassificationDummy: """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( # type: ignore + metadata = Namespace( name="ScalaSvClassification", main_score="accuracy", type="Classification", @@ -251,7 +251,7 @@ def domains(self) -> list[str]: doms = self.task.metadata.domains if doms is None: doms = [] - return doms # type: ignore + return doms @property def task_type(self) -> str: @@ -325,7 +325,7 @@ def to_disk(self, path: Path) -> None: json.dump(json_obj, f, indent=2) @classmethod - def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self: # type: ignore + def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self: """Load TaskResult from disk. Args: @@ -481,7 +481,7 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> Self: scores["test"]["fra-fra"] = scores["test"].pop("fr") result: TaskResult = TaskResult.from_task_results( - task, # type: ignore + task, scores, evaluation_time, kg_co2_emissions=None, diff --git a/mteb/tasks/classification/dan/dk_hate_classification.py b/mteb/tasks/classification/dan/dk_hate_classification.py index 4d027e3576..ef0b6b2783 100644 --- a/mteb/tasks/classification/dan/dk_hate_classification.py +++ b/mteb/tasks/classification/dan/dk_hate_classification.py @@ -62,7 +62,7 @@ class DKHateClassification(AbsTaskClassification): def dataset_transform(self): # convert label to a 0/1 label - labels = self.dataset["train"]["label"] # type: ignore + labels = self.dataset["train"]["label"] lab2idx = {lab: idx for idx, lab in enumerate(set(labels))} self.dataset = self.dataset.map( lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"] diff --git a/mteb/tasks/classification/est/estonian_valence.py b/mteb/tasks/classification/est/estonian_valence.py 
index b55ef82e0e..e1c12db718 100644 --- a/mteb/tasks/classification/est/estonian_valence.py +++ b/mteb/tasks/classification/est/estonian_valence.py @@ -45,7 +45,7 @@ def dataset_transform(self): "valence", "label" ) # convert label to a numbers - labels = self.dataset["train"]["label"] # type: ignore + labels = self.dataset["train"]["label"] lab2idx = {lab: idx for idx, lab in enumerate(set(labels))} self.dataset = self.dataset.map( lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"] diff --git a/mteb/tasks/classification/multilingual/scala_classification.py b/mteb/tasks/classification/multilingual/scala_classification.py index bd13823046..0ab16f7127 100644 --- a/mteb/tasks/classification/multilingual/scala_classification.py +++ b/mteb/tasks/classification/multilingual/scala_classification.py @@ -57,7 +57,7 @@ class ScalaClassification(AbsTaskClassification): def dataset_transform(self): for lang in self.dataset.keys(): # convert label to a 0/1 label - labels = self.dataset[lang]["train"]["label"] # type: ignore + labels = self.dataset[lang]["train"]["label"] lab2idx = {lab: idx for idx, lab in enumerate(set(labels))} self.dataset[lang] = self.dataset[lang].map( lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"] diff --git a/mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py b/mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py index 62a34df671..b3a6b7d90b 100644 --- a/mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +++ b/mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py @@ -49,7 +49,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset = datasets.DatasetDict({"test": self.dataset["train"]}) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/retrieval/code/code_rag.py 
b/mteb/tasks/retrieval/code/code_rag.py index a9e291346c..b62ea8bccb 100644 --- a/mteb/tasks/retrieval/code/code_rag.py +++ b/mteb/tasks/retrieval/code/code_rag.py @@ -48,14 +48,14 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval): "path": "code-rag-bench/programming-solutions", "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6", }, - **common_args, # type: ignore + **common_args, ) def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -71,7 +71,7 @@ def dataset_transform(self) -> None: self.queries = {} split = self.metadata.eval_splits[0] - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} @@ -105,14 +105,14 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval): "path": "code-rag-bench/online-tutorials", "revision": "095bb77130082e4690d6c3a031997b03487bf6e2", }, - **common_args, # type: ignore + **common_args, ) def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -128,7 +128,7 @@ def dataset_transform(self) -> None: self.queries = {} split = self.metadata.eval_splits[0] - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} @@ -165,14 +165,14 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval): "path": "code-rag-bench/library-documentation", "revision": "b530d3b5a25087d2074e731b76232db85b9e9107", }, - **common_args, # type: ignore + **common_args, ) 
def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -188,7 +188,7 @@ def dataset_transform(self) -> None: self.queries = {} split = self.metadata.eval_splits[0] - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} @@ -222,14 +222,14 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval): "path": "code-rag-bench/stackoverflow-posts", "revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4", }, - **common_args, # type: ignore + **common_args, ) def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -245,7 +245,7 @@ def dataset_transform(self) -> None: self.queries = {} split = self.metadata.eval_splits[0] - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} diff --git a/mteb/tasks/retrieval/dan/dan_fever_retrieval.py b/mteb/tasks/retrieval/dan/dan_fever_retrieval.py index c651e60f77..bbc6edd380 100644 --- a/mteb/tasks/retrieval/dan/dan_fever_retrieval.py +++ b/mteb/tasks/retrieval/dan/dan_fever_retrieval.py @@ -51,7 +51,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/retrieval/dan/tv2_nordretrieval.py 
b/mteb/tasks/retrieval/dan/tv2_nordretrieval.py index 98273e109d..12447cb07b 100644 --- a/mteb/tasks/retrieval/dan/tv2_nordretrieval.py +++ b/mteb/tasks/retrieval/dan/tv2_nordretrieval.py @@ -64,7 +64,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -81,7 +81,7 @@ def dataset_transform(self) -> None: text2id = {} for split in self.dataset: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) ds = ds.select( range(2048) diff --git a/mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py b/mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py index 651e46840f..92fe5feed0 100644 --- a/mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +++ b/mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py @@ -40,7 +40,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -57,7 +57,7 @@ def dataset_transform(self) -> None: text2id = {} for split in self.dataset: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.map(answers_to_list) self.queries[split] = {} diff --git a/mteb/tasks/retrieval/nob/norquad.py b/mteb/tasks/retrieval/nob/norquad.py index 54d41e8c57..43b6b35c15 100644 --- a/mteb/tasks/retrieval/nob/norquad.py +++ b/mteb/tasks/retrieval/nob/norquad.py @@ -54,7 +54,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset 
= datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -71,7 +71,7 @@ def dataset_transform(self) -> None: text2id = {} for split in self.dataset: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) max_samples = min(1024, len(ds)) ds = ds.select( diff --git a/mteb/tasks/retrieval/nob/snl_retrieval.py b/mteb/tasks/retrieval/nob/snl_retrieval.py index 41322ac5b5..4cfdcc7503 100644 --- a/mteb/tasks/retrieval/nob/snl_retrieval.py +++ b/mteb/tasks/retrieval/nob/snl_retrieval.py @@ -41,7 +41,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -58,7 +58,7 @@ def dataset_transform(self) -> None: text2id = {} for split in self.dataset: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} diff --git a/mteb/tasks/retrieval/tur/tur_hist_quad.py b/mteb/tasks/retrieval/tur/tur_hist_quad.py index cd56138132..a189379ce3 100644 --- a/mteb/tasks/retrieval/tur/tur_hist_quad.py +++ b/mteb/tasks/retrieval/tur/tur_hist_quad.py @@ -59,7 +59,7 @@ def load_data(self, **kwargs) -> None: text2id = {} for split in self.metadata.eval_splits: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) max_samples = min(1024, len(ds)) ds = ds.select( diff --git a/pyproject.toml b/pyproject.toml index 5b94a91826..36f9877c4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -220,6 +220,7 @@ select = [ "PTH", # use pathlib "TID", # tidy-imports "D", # pydocstyle + "PGH", # pygrep-hooks Use specific rule codes when ignoring type issues ] ignore = [ diff --git 
a/tests/mock_models.py b/tests/mock_models.py index 5d1d54bb9b..53ac737381 100644 --- a/tests/mock_models.py +++ b/tests/mock_models.py @@ -102,7 +102,7 @@ def encode( normalize_embeddings: bool = False, **kwargs, ) -> list[Tensor] | np.ndarray | Tensor: - return torch.randn(len(sentences), 10, dtype=torch.bfloat16) # type: ignore + return torch.randn(len(sentences), 10, dtype=torch.bfloat16) class MockSentenceTransformerWrapper(SentenceTransformerEncoderWrapper): diff --git a/tests/mock_tasks.py b/tests/mock_tasks.py index 3cdeb98881..8cbce53ae1 100644 --- a/tests/mock_tasks.py +++ b/tests/mock_tasks.py @@ -112,7 +112,7 @@ def instruction_retrieval_datasplit() -> RetrievalSplitData: class MockClassificationTask(AbsTaskClassification): - classifier = LogisticRegression(n_jobs=1, max_iter=10) # type: ignore + classifier = LogisticRegression(n_jobs=1, max_iter=10) expected_stats = { "test": { @@ -159,7 +159,7 @@ class MockClassificationTask(AbsTaskClassification): type="Classification", name="MockClassificationTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -315,7 +315,7 @@ class MockMultilingualClassificationTask(AbsTaskClassification): type="Classification", name="MockMultilingualClassificationTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -374,7 +374,7 @@ class MockBitextMiningTask(AbsTaskBitextMining): type="BitextMining", name="MockBitextMiningTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -464,7 +464,7 @@ class MockMultilingualBitextMiningTask(AbsTaskBitextMining): type="BitextMining", name="MockMultilingualBitextMiningTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -559,7 +559,7 @@ class MockMultilingualParallelBitextMiningTask(AbsTaskBitextMining): 
type="BitextMining", name="MockMultilingualParallelBitextMiningTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = { "eng_Latn-fra_Latn": ["eng-Latn", "fra-Latn"], @@ -612,7 +612,7 @@ class MockClusteringTask(AbsTaskClusteringLegacy): type="Clustering", name="MockClusteringTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -710,7 +710,7 @@ class MockMultilingualClusteringTask(AbsTaskClusteringLegacy): type="Clustering", name="MockMultilingualClusteringTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -771,7 +771,7 @@ class MockClusteringFastTask(AbsTaskClustering): type="Clustering", name="MockClusteringFastTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -870,7 +870,7 @@ class MockMultilingualClusteringFastTask(AbsTaskClustering): type="Clustering", name="MockMultilingualClusteringFastTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -935,7 +935,7 @@ class MockPairClassificationTask(AbsTaskPairClassification): type="PairClassification", name="MockPairClassificationTask", main_score="similarity_ap", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -1054,7 +1054,7 @@ class MockMultilingualPairClassificationTask(AbsTaskPairClassification): type="PairClassification", name="MockMultilingualPairClassificationTask", main_score="similarity_ap", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -1125,7 +1125,7 @@ class MockPairImageClassificationTask(AbsTaskPairClassification): type="PairClassification", name="MockPairImageClassificationTask", main_score="similarity_ap", - **general_args, # type: ignore + **general_args, ) metadata.modalities 
= ["image"] @@ -1191,7 +1191,7 @@ class MockSTSTask(AbsTaskSTS): type="STS", name="MockSTSTask", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -1303,7 +1303,7 @@ class MockMultilingualSTSTask(AbsTaskSTS): type="STS", name="MockMultilingualSTSTask", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -1368,7 +1368,7 @@ class MockSummarizationTask(AbsTaskSummarization): type="Summarization", name="MockSummarizationTask", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -1497,7 +1497,7 @@ class MockMultilingualSummarizationTask(AbsTaskSummarization): type="Summarization", name="MockMultilingualSummarizationTask", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -1575,7 +1575,7 @@ class MockRerankingTask(AbsTaskRetrieval): type="Reranking", name="MockRerankingTask", main_score="map_at_1000", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -1694,7 +1694,7 @@ class MockMultilingualRerankingTask(AbsTaskRetrieval): type="Reranking", name="MockMultilingualRerankingTask", main_score="map_at_10", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -1772,7 +1772,7 @@ class MockRetrievalTask(AbsTaskRetrieval): type="Retrieval", name="MockRetrievalTask", main_score="ndcg_at_10", - **dict(general_args | {"eval_splits": ["val", "test"]}), # type: ignore + **dict(general_args | {"eval_splits": ["val", "test"]}), ) def load_data(self) -> None: @@ -1848,7 +1848,7 @@ class MockRetrievalDialogTask(AbsTaskRetrieval): type="Retrieval", name="MockRetrievalDialogTask", main_score="ndcg_at_10", - **dict(general_args | {"eval_splits": ["val", "test"]}), # type: ignore + **dict(general_args | {"eval_splits": 
["val", "test"]}), ) def load_data(self) -> None: @@ -2057,7 +2057,7 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval): type="Retrieval", name="MockMultilingualRetrievalTask", main_score="ndcg_at_10", - **dict(general_args | {"eval_splits": ["val", "test"]}), # type: ignore + **dict(general_args | {"eval_splits": ["val", "test"]}), ) metadata.eval_langs = multilingual_eval_langs @@ -2118,7 +2118,7 @@ class MockMultilabelClassification(AbsTaskMultilabelClassification): type="MultilabelClassification", name="MockMultilabelClassification", main_score="lrap", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -2271,7 +2271,7 @@ class MockMultilingualMultilabelClassification(AbsTaskMultilabelClassification): type="MultilabelClassification", name="MockMultilingualMultilabelClassification", main_score="lrap", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -2340,7 +2340,7 @@ class MockInstructionRetrieval(AbsTaskRetrieval): type="InstructionRetrieval", name="MockInstructionRetrieval", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -2392,7 +2392,7 @@ class MockInstructionReranking(AbsTaskRetrieval): type="InstructionReranking", name="MockInstructionReranking", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -2495,7 +2495,7 @@ class MockMultilingualInstructionRetrieval(AbsTaskRetrieval): type="InstructionRetrieval", name="MockMultilingualInstructionRetrieval", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -2618,7 +2618,7 @@ class MockMultilingualInstructionReranking(AbsTaskRetrieval): type="InstructionReranking", name="MockMultilingualInstructionReranking", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs 
@@ -2640,7 +2640,7 @@ class MockAggregatedTask(AbsTaskAggregate): MockRetrievalTask(), MockRerankingTask(), ], - **general_args, # type: ignore + **general_args, ) @@ -2695,7 +2695,7 @@ class MockMultiChoiceTask(AbsTaskRetrieval): type="Any2AnyMultiChoice", name="MockMultiChoice", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "it2i" @@ -2878,7 +2878,7 @@ class MockMultilingualMultiChoiceTask(AbsTaskRetrieval): type="Any2AnyMultiChoice", name="MockMultilingualMultiChoice", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs metadata.modalities = ["image", "text"] @@ -2970,7 +2970,7 @@ class MockAny2AnyRetrievalI2TTask(AbsTaskRetrieval): type="Any2AnyRetrieval", name="MockAny2AnyRetrievalI2T", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "i2t" @@ -3049,7 +3049,7 @@ class MockAny2AnyRetrievalT2ITask(AbsTaskRetrieval): type="Any2AnyRetrieval", name="MockAny2AnyRetrievalT2I", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "t2i" @@ -3140,7 +3140,7 @@ class MockImageClassificationTask(AbsTaskClassification): type="ImageClassification", name="MockImageClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2c" @@ -3316,7 +3316,7 @@ class MockMultilingualImageClassificationTask(AbsTaskClassification): type="ImageClassification", name="MockMultilingualImageClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2c" @@ -3383,7 +3383,7 @@ class MockImageClusteringTask(AbsTaskClusteringLegacy): type="ImageClustering", name="MockImageClustering", 
main_score="nmi", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] input_column_name = "image" @@ -3439,7 +3439,7 @@ class MockImageClusteringFastTask(AbsTaskClustering): type="ImageClustering", name="MockImageClusteringFastTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] input_column_name = "image" @@ -3529,7 +3529,7 @@ class MockImageMultilabelClassificationTask(AbsTaskMultilabelClassification): type="ImageMultilabelClassification", name="MockImageMultilabelClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2c" @@ -3735,7 +3735,7 @@ class MockMultilingualImageMultilabelClassificationTask( type="ImageMultilabelClassification", name="MockMultilingualImageMultilabelClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.eval_langs = multilingual_eval_langs @@ -3802,7 +3802,7 @@ class MockImageTextPairClassificationTask(AbsTaskImageTextPairClassification): type="Compositionality", name="MockImageTextPairClassification", main_score="text_acc", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "i2t" @@ -3898,7 +3898,7 @@ class MockMultilingualImageTextPairClassificationTask( type="Compositionality", name="MockMultilingualImageTextPairClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "i2t" @@ -3965,7 +3965,7 @@ class MockVisualSTSTask(AbsTaskSTS): type="VisualSTS(eng)", name="MockVisualSTS", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2i" @@ -4031,7 +4031,7 @@ class MockZeroShotClassificationTask(AbsTaskZeroShotClassification): 
type="ZeroShotClassification", name="MockZeroShotClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "i2t" @@ -4095,7 +4095,7 @@ class MockTextZeroShotClassificationTask(AbsTaskZeroShotClassification): type="ZeroShotClassification", name="MockTextZeroShotClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["text"] metadata.category = "t2t" @@ -4155,7 +4155,7 @@ class MockRegressionTask(AbsTaskRegression): type="Regression", name="MockRegressionTask", main_score="kendalltau", - **general_args, # type: ignore + **general_args, ) def load_data(self, **kwargs): @@ -4221,7 +4221,7 @@ class MockImageRegressionTask(AbsTaskRegression): type="Regression", name="MockRegressionTask", main_score="kendalltau", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2c" diff --git a/tests/test_abstasks/test_task_metadata.py b/tests/test_abstasks/test_task_metadata.py index 4af8e7d443..a05d24eb2b 100644 --- a/tests/test_abstasks/test_task_metadata.py +++ b/tests/test_abstasks/test_task_metadata.py @@ -51,7 +51,7 @@ def test_given_dataset_config_then_it_is_valid(): def test_given_missing_dataset_path_then_it_throws(): with pytest.raises(ValueError): - TaskMetadata( # type: ignore + TaskMetadata( name="MyTask", description="testing", reference=None, diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index fd1653a395..2a60e0df74 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -66,7 +66,7 @@ def test_evaluate_with_cache( path = cache.get_task_result_path( task.metadata.name, results.model_name.replace("/", "__"), - results.model_revision, # type: ignore + results.model_revision, ) model_meta_path = path.parent / "model_meta.json" assert path.exists() and path.is_file(), "cache file should exist" diff --git a/tests/test_filter_tasks.py 
b/tests/test_filter_tasks.py index 8a3860cd31..19b2ac6ce3 100644 --- a/tests/test_filter_tasks.py +++ b/tests/test_filter_tasks.py @@ -32,7 +32,7 @@ def test_filter_tasks( languages: list[str], script: list[str], domains: list[TaskDomain], - task_types: list[TaskType] | None, # type: ignore + task_types: list[TaskType] | None, ): """Tests that get_tasks filters tasks correctly. This could in principle be combined with the following tests, but they have been kept separate to reduce the grid size. @@ -67,7 +67,7 @@ def test_filter_tasks_superseded( all_tasks: list[AbsTask], languages: list[str], domains: list[TaskDomain], - task_types: list[TaskType] | None, # type: ignore + task_types: list[TaskType] | None, exclude_superseded_datasets: bool, ): tasks = filter_tasks( diff --git a/tests/test_get_tasks.py b/tests/test_get_tasks.py index cf7ed0ad17..a1583415f9 100644 --- a/tests/test_get_tasks.py +++ b/tests/test_get_tasks.py @@ -58,7 +58,7 @@ def test_get_tasks_filtering(): @pytest.mark.parametrize("modalities", [["text"], ["image"], None]) def test_mteb_mteb_tasks( script: list[str], - task_types: list[TaskType] | None, # type: ignore + task_types: list[TaskType] | None, modalities: list[Modalities] | None, ): tasks = mteb.get_tasks(script=script, task_types=task_types, modalities=modalities) diff --git a/tests/test_tasks/test_task_quality.py b/tests/test_tasks/test_task_quality.py index ba9f086432..90180e8b28 100644 --- a/tests/test_tasks/test_task_quality.py +++ b/tests/test_tasks/test_task_quality.py @@ -281,7 +281,7 @@ def _split_quality( ) -> list[str]: errors = [] - num_samples = split_stats["num_samples"] # type: ignore + num_samples = split_stats["num_samples"] text_stats = split_stats.get("text_statistics", None) if text_stats: text_stats = cast(TextStatistics, text_stats) From aed114d3aa55df6f11ddb7380dba91d33524ee50 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 22 Dec 2025 23:38:53 +0300 Subject: 
[PATCH 17/32] install more types --- mteb/deprecated_evaluator.py | 2 +- pyproject.toml | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index fe8d160bbb..eab9285014 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -480,7 +480,7 @@ def run( if co2_tracker: try: - from codecarbon import ( # type: ignore[import-not-found] + from codecarbon import ( # type: ignore[import-untyped] EmissionsTracker, ) except ImportError: diff --git a/pyproject.toml b/pyproject.toml index 36f9877c4f..06590de4f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,6 +132,18 @@ typing = [ "types-simplejson>=3.20.0.20250822", "types-tqdm>=4.67.0.20250809", "types-tensorflow>=2.18.0.20250809", + "types-pygments>=2.19.0.20251121", + "types-colorama>=0.4.15.20250801", + "types-gevent>=25.9.0.20251102", + "types-networkx>=3.6.1.20251220", + "types-openpyxl>=3.1.5.20250919", + "types-psutil>=7.1.3.20251211", + "types-python-dateutil>=2.9.0.20251115", + "types-pywin32>=311.0.0.20251008", + "types-regex>=2025.11.3.20251106", + "types-setuptools>=80.9.0.20251221", + "types-tabulate>=0.9.0.20241207", + "types-xlrd>=2.0.0.20251020", "pandas-stubs>=2.3.2.250926", "scipy-stubs>=1.15.3.0", "types-defusedxml>=0.7.0.20250822", From fac5c5846870fb594c80a28ae7905896be0a8f57 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 23 Dec 2025 01:24:24 +0300 Subject: [PATCH 18/32] almost finish --- .../pair_classification_evaluator.py | 4 +- mteb/_evaluators/sklearn_evaluator.py | 11 ++- .../text/bitext_mining_evaluator.py | 37 ++++--- .../text/summarization_evaluator.py | 52 +++++----- mteb/abstasks/_stratification.py | 6 +- mteb/abstasks/abstask.py | 26 +++-- mteb/abstasks/aggregate_task_metadata.py | 8 -- mteb/abstasks/aggregated_task.py | 19 +--- mteb/abstasks/classification.py | 5 +- mteb/abstasks/clustering.py | 28 +++--- 
mteb/abstasks/clustering_legacy.py | 7 +- .../image/image_text_pair_classification.py | 6 +- mteb/abstasks/pair_classification.py | 7 +- mteb/abstasks/sts.py | 7 +- mteb/abstasks/task_metadata.py | 2 +- mteb/abstasks/text/summarization.py | 7 +- mteb/abstasks/zeroshot_classification.py | 7 +- mteb/deprecated_evaluator.py | 14 ++- mteb/languages/language_scripts.py | 6 +- mteb/models/get_model_meta.py | 2 +- mteb/models/model_meta.py | 2 +- mteb/results/model_result.py | 54 +++++++--- mteb/results/task_result.py | 99 +++++++++---------- mteb/types/statistics.py | 8 +- 24 files changed, 243 insertions(+), 181 deletions(-) diff --git a/mteb/_evaluators/pair_classification_evaluator.py b/mteb/_evaluators/pair_classification_evaluator.py index da346cce59..c54697e376 100644 --- a/mteb/_evaluators/pair_classification_evaluator.py +++ b/mteb/_evaluators/pair_classification_evaluator.py @@ -148,7 +148,9 @@ def _encode_unique_texts( hf_subset: str, **encode_kwargs: Any, ) -> np.ndarray: - index_map, all_unique_texts, all_texts_indexes = {}, [], [] + index_map = {} + all_unique_texts: list[str] = [] + all_texts_indexes = [] for text in all_texts: text_hash = hash(text) if text_hash not in index_map: diff --git a/mteb/_evaluators/sklearn_evaluator.py b/mteb/_evaluators/sklearn_evaluator.py index d0c2e71749..ae7e420fa6 100644 --- a/mteb/_evaluators/sklearn_evaluator.py +++ b/mteb/_evaluators/sklearn_evaluator.py @@ -1,10 +1,10 @@ import logging -from typing import Any, Protocol +from typing import Any, Protocol, cast import numpy as np from datasets import Dataset from torch.utils.data import DataLoader -from typing_extensions import Self, Unpack +from typing_extensions import Self from mteb._create_dataloaders import create_dataloader from mteb.abstasks.task_metadata import TaskMetadata @@ -20,7 +20,7 @@ class SklearnModelProtocol(Protocol): def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ... # noqa: N803 def predict(self, X: Array) -> np.ndarray: ... 
# noqa: N803 def get_params(self) -> dict[str, Any]: ... - def set_params(self, **kwargs: Unpack[dict[str, Any]]) -> Self: ... + def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ... def score(self, X: Array, y: np.ndarray | list[int]) -> float: ... # noqa: N803 @@ -71,8 +71,8 @@ def __call__( # type: ignore[override] model: EncoderProtocol, *, encode_kwargs: dict[str, Any], - test_cache: np.ndarray | None = None, - ) -> tuple[np.ndarray, np.ndarray]: + test_cache: Array | None = None, + ) -> tuple[np.ndarray, Array]: """Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set. Args: @@ -104,6 +104,7 @@ def __call__( # type: ignore[override] hf_subset=self.hf_subset, **encode_kwargs, ) + test_cache = cast(Array, test_cache) logger.info("Running - Fitting classifier...") y_train = self.train_dataset[self.label_column_name] diff --git a/mteb/_evaluators/text/bitext_mining_evaluator.py b/mteb/_evaluators/text/bitext_mining_evaluator.py index 2c5a2ee169..eff53e3e3a 100644 --- a/mteb/_evaluators/text/bitext_mining_evaluator.py +++ b/mteb/_evaluators/text/bitext_mining_evaluator.py @@ -1,7 +1,6 @@ import logging from typing import Any -import numpy as np import torch from datasets import Dataset from tqdm.auto import tqdm @@ -10,6 +9,7 @@ from mteb._evaluators.evaluator import Evaluator from mteb.abstasks.task_metadata import TaskMetadata from mteb.models import EncoderProtocol +from mteb.types import Array logger = logging.getLogger(__name__) @@ -69,11 +69,11 @@ def __call__( def _similarity_search( self, - query_embeddings: np.ndarray, - corpus_embeddings: np.ndarray, + query_embeddings: Array, + corpus_embeddings: Array, model: EncoderProtocol, query_chunk_size: int = 100, - corpus_chunk_size: int = 500000, + corpus_chunk_size: int = 500_000, ) -> list[dict[str, float]]: """This function performs a cosine similarity search between a list of query 
embeddings and a list of corpus embeddings. @@ -104,7 +104,9 @@ def _similarity_search( ): query_embeddings = query_embeddings.to(corpus_embeddings.device) - queries_result_list = [[] for _ in range(len(query_embeddings))] + queries_result_list: list[list[dict[str, float]]] = [ + [] for _ in range(len(query_embeddings)) + ] for query_start_idx in range(0, len(query_embeddings), query_chunk_size): # Iterate over chunks of the corpus @@ -120,15 +122,17 @@ def _similarity_search( ) # Get top-k scores - cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( - torch.tensor(similarity_scores), - 1, - dim=1, - largest=True, - sorted=False, + cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = ( + torch.topk( + torch.tensor(similarity_scores), + 1, + dim=1, + largest=True, + sorted=False, + ) ) - cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() - cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() + cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist() + cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist() for query_itr in range(len(similarity_scores)): for sub_corpus_id, score in zip( @@ -141,11 +145,14 @@ def _similarity_search( {"corpus_id": corpus_id, "score": score} ) + result_queries_list: list[dict[str, float]] = [ + {} for _ in range(len(query_embeddings)) + ] # Sort and strip to top_k results for idx in range(len(queries_result_list)): queries_result_list[idx] = sorted( queries_result_list[idx], key=lambda x: x["score"], reverse=True ) - queries_result_list[idx] = queries_result_list[idx][0] + result_queries_list[idx] = queries_result_list[idx][0] - return queries_result_list + return result_queries_list diff --git a/mteb/_evaluators/text/summarization_evaluator.py b/mteb/_evaluators/text/summarization_evaluator.py index 43c6cda2b4..980e7ca095 100644 --- a/mteb/_evaluators/text/summarization_evaluator.py +++ b/mteb/_evaluators/text/summarization_evaluator.py @@ -1,6 +1,6 @@ import logging import 
sys -from typing import Any, TypedDict +from typing import Any, TypedDict, cast import numpy as np import torch @@ -135,10 +135,10 @@ def __call__( ) # Split the embeddings into the original human & machine summaries - embs_human_summaries_all = np.split( + embs_human_summaries_all_split = np.split( embs_human_summaries_all, np.cumsum(human_lens)[:-1] ) - embs_machine_summaries_all = np.split( + embs_machine_summaries_all_split = np.split( embs_machine_summaries_all, np.cumsum(machine_lens)[:-1] ) @@ -148,7 +148,9 @@ def __call__( all_human_scores = [] for i, (embs_human_summaries, embs_machine_summaries) in tqdm( - enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)), + enumerate( + zip(embs_human_summaries_all_split, embs_machine_summaries_all_split) + ), desc="Scoring", total=len(self.human_summaries), ): @@ -201,12 +203,12 @@ def _calculate_metrics( self, distances: SummarizationDistances, ) -> SummarizationMetrics: - cosine_spearman_scores = [] - cosine_pearson_scores = [] - dot_spearman_scores = [] - dot_pearson_scores = [] - pearson_scores = [] - spearman_scores = [] + cosine_spearman_scores: list[float] = [] + cosine_pearson_scores: list[float] = [] + dot_spearman_scores: list[float] = [] + dot_pearson_scores: list[float] = [] + pearson_scores: list[float] = [] + spearman_scores: list[float] = [] for human_scores, cosine_pred_scores, dot_pred_scores, sim_scores in zip( distances["human_scores"], @@ -216,17 +218,23 @@ def _calculate_metrics( strict=True, ): cosine_spearman_scores.append( - spearmanr(human_scores, cosine_pred_scores).statistic + cast(float, spearmanr(human_scores, cosine_pred_scores).statistic) ) cosine_pearson_scores.append( - pearsonr(human_scores, cosine_pred_scores).statistic + cast(float, pearsonr(human_scores, cosine_pred_scores).statistic) ) dot_spearman_scores.append( - spearmanr(human_scores, dot_pred_scores).statistic + cast(float, spearmanr(human_scores, dot_pred_scores).statistic) + ) + dot_pearson_scores.append( + 
cast(float, pearsonr(human_scores, dot_pred_scores).statistic) + ) + spearman_scores.append( + cast(float, spearmanr(human_scores, sim_scores).statistic) + ) + pearson_scores.append( + cast(float, pearsonr(human_scores, sim_scores).statistic) ) - dot_pearson_scores.append(pearsonr(human_scores, dot_pred_scores).statistic) - spearman_scores.append(spearmanr(human_scores, sim_scores).statistic) - pearson_scores.append(pearsonr(human_scores, sim_scores).statistic) return SummarizationMetrics( pearson=float(np.mean(pearson_scores)), @@ -273,10 +281,10 @@ def _calculate_metrics( pearson_scores.append(pearsonr(human_scores, sim_scores)) return SummarizationMetrics( - pearson=float(np.mean(pearson_scores)), - spearman=float(np.mean(spearman_scores)), - cosine_spearman=float(np.mean(cosine_spearman_scores)), - cosine_pearson=float(np.mean(cosine_pearson_scores)), - dot_pearson=float(np.mean(dot_pearson_scores)), - dot_spearman=float(np.mean(dot_spearman_scores)), + pearson=float(np.mean(pearson_scores)), # type: ignore[arg-type] + spearman=float(np.mean(spearman_scores)), # type: ignore[arg-type] + cosine_spearman=float(np.mean(cosine_spearman_scores)), # type: ignore[arg-type] + cosine_pearson=float(np.mean(cosine_pearson_scores)), # type: ignore[arg-type] + dot_pearson=float(np.mean(dot_pearson_scores)), # type: ignore[arg-type] + dot_spearman=float(np.mean(dot_spearman_scores)), # type: ignore[arg-type] ) diff --git a/mteb/abstasks/_stratification.py b/mteb/abstasks/_stratification.py index 2f54e20af4..b30c591486 100644 --- a/mteb/abstasks/_stratification.py +++ b/mteb/abstasks/_stratification.py @@ -120,7 +120,9 @@ def _get_most_desired_combination(samples_with_combination: dict): if support_size == 0: continue if currently_chosen is None or ( - best_number_of_combinations < number_of_combinations + best_number_of_combinations is not None + and best_support_size is not None + and best_number_of_combinations < number_of_combinations and best_support_size > support_size 
): currently_chosen = combination @@ -213,7 +215,7 @@ def _prepare_stratification( all_combinations = [] per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)] samples_with_combination: dict[str, list[Any]] = {} - folds = [[] for _ in range(self.n_splits)] + folds: list[list[int]] = [[] for _ in range(self.n_splits)] # for every row for sample_index, label_assignment in enumerate(rows): diff --git a/mteb/abstasks/abstask.py b/mteb/abstasks/abstask.py index 0bf4beb884..b3e232bf94 100644 --- a/mteb/abstasks/abstask.py +++ b/mteb/abstasks/abstask.py @@ -4,7 +4,7 @@ from collections.abc import Mapping, Sequence from copy import copy from pathlib import Path -from typing import Any, cast +from typing import Any, Literal, cast import numpy as np from datasets import ClassLabel, Dataset, DatasetDict, load_dataset @@ -79,7 +79,7 @@ class AbsTask(ABC): metadata: TaskMetadata abstask_prompt: str - _eval_splits: list[str] | None = None + _eval_splits: Sequence[str] | None = None dataset: dict[HFSubset, DatasetDict] | None = None data_loaded: bool = False hf_subsets: list[HFSubset] @@ -195,7 +195,7 @@ def evaluate( @abstractmethod def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, hf_split: str, @@ -226,7 +226,7 @@ def _save_task_predictions( hf_subset: The subset of the dataset (e.g. "en"). 
""" predictions_path = self._predictions_path(prediction_folder) - existing_results = { + existing_results: dict[str, Any] = { "mteb_model_meta": { "model_name": model.mteb_model_meta.name, "revision": model.mteb_model_meta.revision, @@ -362,15 +362,19 @@ def calculate_descriptive_statistics( """ from mteb.abstasks import AbsTaskClassification - if self.metadata.descriptive_stat_path.exists() and not overwrite_results: + existing_stats = self.metadata.descriptive_stats + + if existing_stats is not None and not overwrite_results: logger.info("Loading metadata descriptive statistics from cache.") - return self.metadata.descriptive_stats + return existing_stats if not self.data_loaded: self.load_data() descriptive_stats: dict[str, DescriptiveStatistics] = {} - hf_subset_stat = "hf_subset_descriptive_stats" + hf_subset_stat: Literal["hf_subset_descriptive_stats"] = ( + "hf_subset_descriptive_stats" + ) eval_splits = self.metadata.eval_splits if isinstance(self, AbsTaskClassification): eval_splits.append(self.train_split) @@ -381,7 +385,7 @@ def calculate_descriptive_statistics( logger.info(f"Processing metadata for split {split}") if self.metadata.is_multilingual: descriptive_stats[split] = ( - self._calculate_descriptive_statistics_from_split( + self._calculate_descriptive_statistics_from_split( # type: ignore[assignment] split, compute_overall=True ) ) @@ -400,7 +404,7 @@ def calculate_descriptive_statistics( descriptive_stats[split][hf_subset_stat][hf_subset] = split_details else: split_details = self._calculate_descriptive_statistics_from_split(split) - descriptive_stats[split] = split_details + descriptive_stats[split] = split_details # type: ignore[assignment] with self.metadata.descriptive_stat_path.open("w") as f: json.dump(descriptive_stats, f, indent=4) @@ -505,6 +509,8 @@ def _add_main_score(self, scores: ScoresDict) -> None: def _upload_dataset_to_hub( self, repo_name: str, fields: list[str] | dict[str, str] ) -> None: + if self.dataset is None: + raise 
ValueError("Dataset not loaded") if self.metadata.is_multilingual: for config in self.metadata.eval_langs: logger.info(f"Converting {config} of {self.metadata.name}") @@ -574,7 +580,7 @@ def is_aggregate(self) -> bool: return False @property - def eval_splits(self) -> list[str]: + def eval_splits(self) -> Sequence[str]: """Returns the evaluation splits of the task.""" if self._eval_splits: return self._eval_splits diff --git a/mteb/abstasks/aggregate_task_metadata.py b/mteb/abstasks/aggregate_task_metadata.py index 2d90ba8ac8..560fb7c60f 100644 --- a/mteb/abstasks/aggregate_task_metadata.py +++ b/mteb/abstasks/aggregate_task_metadata.py @@ -5,7 +5,6 @@ from typing_extensions import Self from mteb.types import ( - HFSubset, ISOLanguageScript, Languages, Licenses, @@ -60,13 +59,6 @@ class AggregateTaskMetadata(TaskMetadata): reference: str | None = None bibtex_citation: str | None = None - @property - def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]: - """Return a dictionary mapping huggingface subsets to languages.""" - if isinstance(self.eval_langs, dict): - return self.eval_langs - return {"default": self.eval_langs} - @model_validator(mode="after") def _compute_unfilled_cases(self) -> Self: if not self.eval_langs: diff --git a/mteb/abstasks/aggregated_task.py b/mteb/abstasks/aggregated_task.py index c95764de06..a276b8b37e 100644 --- a/mteb/abstasks/aggregated_task.py +++ b/mteb/abstasks/aggregated_task.py @@ -1,10 +1,10 @@ import logging +from collections.abc import Mapping from pathlib import Path from typing import Any import numpy as np from datasets import Dataset, DatasetDict -from typing_extensions import Self from mteb.models.models_protocols import MTEBModels from mteb.results.task_result import TaskResult @@ -32,7 +32,7 @@ def __init__(self, **kwargs: Any): def task_results_to_scores( self, task_results: list[TaskResult] - ) -> dict[str, dict[HFSubset, ScoresDict]]: + ) -> dict[str, Mapping[HFSubset, ScoresDict]]: """The 
function that aggregated scores. Can be redefined to allow for custom aggregations. Args: @@ -41,7 +41,7 @@ def task_results_to_scores( Returns: A dictionary with the aggregated scores. """ - scores = {} + scores: dict[str, Mapping[HFSubset, ScoresDict]] = {} subsets = ( self.metadata.eval_langs.keys() if isinstance(self.metadata.eval_langs, dict) @@ -127,19 +127,6 @@ def check_if_dataset_is_superseded(self) -> None: f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset." ) - def filter_eval_splits(self, eval_splits: list[str] | None) -> Self: - """Filter the evaluation splits of the task. - - Args: - eval_splits: List of splits to evaluate on. If None, all splits in metadata - are used. - - Returns: - The task with filtered evaluation splits. - """ - self._eval_splits = eval_splits - return self - def evaluate( self, model: MTEBModels, diff --git a/mteb/abstasks/classification.py b/mteb/abstasks/classification.py index b79364158a..a4f33f18fc 100644 --- a/mteb/abstasks/classification.py +++ b/mteb/abstasks/classification.py @@ -182,7 +182,7 @@ def evaluate( def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: DatasetDict, *, encode_kwargs: dict[str, Any], @@ -191,6 +191,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> FullClassificationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + train_split = data_split[self.train_split] eval_split = data_split[hf_split] diff --git a/mteb/abstasks/clustering.py b/mteb/abstasks/clustering.py index 0dbf122489..1aae041023 100644 --- a/mteb/abstasks/clustering.py +++ b/mteb/abstasks/clustering.py @@ -3,7 +3,7 @@ import random from collections import defaultdict from pathlib import Path -from typing import Any +from typing import Any, cast import numpy as np from datasets import Dataset, DatasetDict @@ 
-11,8 +11,8 @@ from sklearn.metrics.cluster import v_measure_score from mteb._create_dataloaders import create_dataloader -from mteb.models import EncoderProtocol -from mteb.types import HFSubset, ScoresDict +from mteb.models import EncoderProtocol, MTEBModels +from mteb.types import Array, HFSubset, ScoresDict from mteb.types.statistics import ( ImageStatistics, LabelStatistics, @@ -34,7 +34,7 @@ def _evaluate_clustering_bootstrapped( - embeddings: np.ndarray, + embeddings: Array, labels: list[list[str]], n_clusters: int, cluster_size: int, @@ -61,21 +61,21 @@ def _evaluate_clustering_bootstrapped( max_depth = max(map(len, labels)) # Evaluate on each level til max depth for i_level in range(max_depth): - level_labels = [] + level_labels: list[str | int] = [] # Assign -1 to gold label if the level is not there for label in labels: if len(label) > i_level: level_labels.append(label[i_level]) else: level_labels.append(-1) - level_labels = np.array(level_labels) + np_level_labels = np.array(level_labels) valid_idx = np.array( - [level_label != -1 for level_label in level_labels] + [level_label != -1 for level_label in np_level_labels] ) # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed - level_labels = level_labels[valid_idx] + np_level_labels = np_level_labels[valid_idx] level_embeddings = embeddings[valid_idx] clustering_model = MiniBatchKMeans( - n_clusters=np.unique(level_labels).size, + n_clusters=np.unique(np_level_labels).size, batch_size=kmean_batch_size, init="k-means++", n_init=1, # default when kmeans++ is used @@ -87,7 +87,7 @@ def _evaluate_clustering_bootstrapped( cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size) _embeddings = level_embeddings[cluster_indices] - _labels = level_labels[cluster_indices] + _labels = np_level_labels[cluster_indices] cluster_assignment = clustering_model.fit_predict(_embeddings) v_measure = v_measure_score(_labels, cluster_assignment) v_measures[f"Level 
{i_level}"].append(v_measure) @@ -153,7 +153,7 @@ class AbsTaskClustering(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, encode_kwargs: dict[str, Any], @@ -162,6 +162,10 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> ScoresDict: + if not isinstance(model, EncoderProtocol): + raise TypeError( + "Expected encoder model to be an instance of EncoderProtocol." + ) if ( self.max_document_to_embed is not None and self.max_fraction_of_documents_to_embed is not None @@ -182,7 +186,7 @@ def _evaluate_subset( self.max_fraction_of_documents_to_embed * len(data_split) ) else: - max_documents_to_embed = self.max_document_to_embed + max_documents_to_embed = cast(int, self.max_document_to_embed) max_documents_to_embed = min(len(data_split), max_documents_to_embed) example_indices = self.rng_state.sample( diff --git a/mteb/abstasks/clustering_legacy.py b/mteb/abstasks/clustering_legacy.py index cd571416bd..c92f0c8bed 100644 --- a/mteb/abstasks/clustering_legacy.py +++ b/mteb/abstasks/clustering_legacy.py @@ -8,7 +8,7 @@ from sklearn import metrics from mteb._evaluators import ClusteringEvaluator -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types import ScoresDict from mteb.types.statistics import ( ImageStatistics, @@ -80,7 +80,7 @@ class AbsTaskClusteringLegacy(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, encode_kwargs: dict[str, Any], @@ -89,6 +89,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> ScoresDict: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + # MTEB text clustering requires renaming and eval per subset. 
if self.metadata.modalities == ["text"]: all_metrics = [] diff --git a/mteb/abstasks/image/image_text_pair_classification.py b/mteb/abstasks/image/image_text_pair_classification.py index e5a1e6debb..1c390cca80 100644 --- a/mteb/abstasks/image/image_text_pair_classification.py +++ b/mteb/abstasks/image/image_text_pair_classification.py @@ -12,7 +12,7 @@ calculate_text_statistics, ) from mteb.abstasks.abstask import AbsTask -from mteb.models.models_protocols import EncoderProtocol +from mteb.models.models_protocols import EncoderProtocol, MTEBModels from mteb.types.statistics import ( ImageStatistics, SplitDescriptiveStatistics, @@ -116,7 +116,7 @@ def _calculate_descriptive_statistics_from_split( def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, encode_kwargs: dict[str, Any], @@ -125,6 +125,8 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> ImageTextPairClassificationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") select_columns = [] for columns in (self.images_column_names, self.texts_column_names): if isinstance(columns, str): diff --git a/mteb/abstasks/pair_classification.py b/mteb/abstasks/pair_classification.py index b336759c5d..b6ad2f59ec 100644 --- a/mteb/abstasks/pair_classification.py +++ b/mteb/abstasks/pair_classification.py @@ -18,7 +18,7 @@ ) from mteb.abstasks.abstask import AbsTask from mteb.models.model_meta import ScoringFunction -from mteb.models.models_protocols import EncoderProtocol +from mteb.models.models_protocols import EncoderProtocol, MTEBModels from mteb.types import PromptType from mteb.types.statistics import ( ImageStatistics, @@ -79,7 +79,7 @@ class AbsTaskPairClassification(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, hf_split: str, @@ -88,6 +88,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, 
**kwargs, ) -> dict[str, float]: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + if self.metadata.modalities == ["text"]: # for compatibility with v1 version where datasets were stored in a single row data_split = data_split[0] if len(data_split) == 1 else data_split diff --git a/mteb/abstasks/sts.py b/mteb/abstasks/sts.py index 07b8ac8d5c..9a7150f4b4 100644 --- a/mteb/abstasks/sts.py +++ b/mteb/abstasks/sts.py @@ -7,7 +7,7 @@ from mteb._evaluators import AnySTSEvaluator from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types import PromptType from mteb.types.statistics import ( ImageStatistics, @@ -103,7 +103,7 @@ class AbsTaskSTS(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, encode_kwargs: dict[str, Any], hf_split: str, @@ -111,6 +111,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> STSMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + normalized_scores = list(map(self._normalize, data_split["score"])) data_split = data_split.select_columns(list(self.column_names)) diff --git a/mteb/abstasks/task_metadata.py b/mteb/abstasks/task_metadata.py index b4e4a3c59c..b65b721994 100644 --- a/mteb/abstasks/task_metadata.py +++ b/mteb/abstasks/task_metadata.py @@ -368,7 +368,7 @@ def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]: """Return a dictionary mapping huggingface subsets to languages.""" if isinstance(self.eval_langs, dict): return self.eval_langs - return {"default": self.eval_langs} + return {"default": cast(list[str], self.eval_langs)} @property def intext_citation(self, include_cite: bool = True) -> str: diff --git a/mteb/abstasks/text/summarization.py 
b/mteb/abstasks/text/summarization.py index 3591feb9b0..1879af9e19 100644 --- a/mteb/abstasks/text/summarization.py +++ b/mteb/abstasks/text/summarization.py @@ -12,7 +12,7 @@ calculate_text_statistics, ) from mteb.abstasks.abstask import AbsTask -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types.statistics import ( ScoreStatistics, SplitDescriptiveStatistics, @@ -77,7 +77,7 @@ class AbsTaskSummarization(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, hf_split: str, @@ -86,6 +86,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs, ) -> SummarizationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + normalized_scores = [ ( (np.array(x) - self.min_score) / (self.max_score - self.min_score) diff --git a/mteb/abstasks/zeroshot_classification.py b/mteb/abstasks/zeroshot_classification.py index 15045309e4..206e6b3ed9 100644 --- a/mteb/abstasks/zeroshot_classification.py +++ b/mteb/abstasks/zeroshot_classification.py @@ -7,7 +7,7 @@ from sklearn import metrics from mteb._evaluators import ZeroShotClassificationEvaluator -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types.statistics import ( ImageStatistics, LabelStatistics, @@ -111,7 +111,7 @@ def _calculate_descriptive_statistics_from_split( def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, hf_split: str, @@ -120,6 +120,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs, ) -> ZeroShotClassificationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + candidate_labels = self.get_candidate_labels() data_split = data_split.select_columns( [self.input_column_name, self.label_column_name] diff --git 
a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index eab9285014..54844764bc 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -5,7 +5,7 @@ import os import sys import traceback -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from copy import deepcopy from datetime import datetime from pathlib import Path @@ -598,7 +598,11 @@ def _create_output_folder( if output_folder is None: return None - model_revision: str = model_meta.revision + model_revision: str = ( + model_meta.revision + if model_meta.revision is not None + else "no_revision_available" + ) model_path_name = model_meta.model_name_as_path() output_path = Path(output_folder) / model_path_name / model_revision @@ -626,9 +630,9 @@ def _get_last_evaluated_splits(self) -> dict[str, list[str]]: @staticmethod def _get_missing_evaluations( existing_results: TaskResult | None, - task_eval_splits: list[str], - task_eval_langs: list[str], - eval_subsets: list[str] | None, + task_eval_splits: Sequence[str], + task_eval_langs: Sequence[str], + eval_subsets: Sequence[str] | None, ) -> dict[str, dict[str, Any]]: """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing.""" missing_evaluations = { diff --git a/mteb/languages/language_scripts.py b/mteb/languages/language_scripts.py index f0a6f2f9cc..3cf48b9aa8 100644 --- a/mteb/languages/language_scripts.py +++ b/mteb/languages/language_scripts.py @@ -1,4 +1,4 @@ -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass from typing_extensions import Self @@ -25,7 +25,9 @@ class LanguageScripts: @classmethod def from_languages_and_scripts( - cls, languages: list[str] | None = None, scripts: list[str] | None = None + cls, + languages: Sequence[str] | None = None, + scripts: Sequence[str] | None = None, ) -> Self: """Create a LanguageScripts object from lists of languages and 
scripts. diff --git a/mteb/models/get_model_meta.py b/mteb/models/get_model_meta.py index 67dcdd056b..c4e69e50e8 100644 --- a/mteb/models/get_model_meta.py +++ b/mteb/models/get_model_meta.py @@ -93,7 +93,7 @@ def get_model( meta = get_model_meta(model_name, revision) model = meta.load_model(**kwargs) - model.mteb_model_meta = meta + model.mteb_model_meta = meta # type: ignore[misc] return model diff --git a/mteb/models/model_meta.py b/mteb/models/model_meta.py index 3cecccf7ad..043a658198 100644 --- a/mteb/models/model_meta.py +++ b/mteb/models/model_meta.py @@ -222,7 +222,7 @@ def load_model(self, **kwargs: Any) -> MTEBModels: _kwargs.update(kwargs) model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs) - model.mteb_model_meta = self + model.mteb_model_meta = self # type: ignore[misc] return model def model_name_as_path(self) -> str: diff --git a/mteb/results/model_result.py b/mteb/results/model_result.py index 5223703a98..f8a621e6c5 100644 --- a/mteb/results/model_result.py +++ b/mteb/results/model_result.py @@ -1,12 +1,14 @@ +from __future__ import annotations + import logging import warnings from collections.abc import Callable, Iterable, Sequence -from typing import Any, Literal +from typing import Any, Literal, cast import numpy as np import pandas as pd from pydantic import BaseModel, ConfigDict, Field -from typing_extensions import Self +from typing_extensions import overload from mteb.abstasks.abstask import AbsTask from mteb.abstasks.task_metadata import ( @@ -58,7 +60,7 @@ def _aggregate_and_pivot( index=index_columns, columns=columns, values="score", - aggfunc=aggregation_fn, + aggfunc=aggregation_fn, # type: ignore[arg-type] ).reset_index() elif format == "long": return ( @@ -81,7 +83,7 @@ class ModelResult(BaseModel): model_revision: str | None task_results: list[TaskResult] default_modalities: list[Modalities] = Field( - default_factory=lambda: ["text"], alias="modalities" + default_factory=lambda: [cast(Modalities, "text")], 
alias="modalities" ) model_config = ( ConfigDict( # to free up the name model_* which is otherwise protected @@ -95,16 +97,17 @@ def __repr__(self) -> str: return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))" @classmethod - def from_validated(cls, **data: dict[str, Any]) -> Self: + def from_validated(cls, **data: dict[str, Any]) -> ModelResult: """Create a ModelResult from validated data. Args: data: The validated data. """ - data["task_results"] = [ - TaskResult.from_validated(**res) for res in data["task_results"] + data["task_results"] = [ # type: ignore[assignment] + TaskResult.from_validated(**res) # type: ignore[arg-type] + for res in data["task_results"] ] - return cls.model_construct(**data) + return cls.model_construct(**data) # type: ignore[arg-type] def _filter_tasks( self, @@ -114,7 +117,7 @@ def _filter_tasks( task_types: list[TaskType] | None = None, modalities: list[Modalities] | None = None, is_public: bool | None = None, - ) -> Self: + ) -> ModelResult: new_task_results = [] for task_result in self.task_results: if (task_names is not None) and (task_result.task_name not in task_names): @@ -142,7 +145,7 @@ def _filter_tasks( task_results=new_task_results, ) - def select_tasks(self, tasks: Sequence[AbsTask]) -> Self: + def select_tasks(self, tasks: Sequence[AbsTask]) -> ModelResult: """Select tasks from the ModelResult based on a list of AbsTask objects. Args: @@ -160,6 +163,28 @@ def select_tasks(self, tasks: Sequence[AbsTask]) -> Self: task_results=new_task_results, ) + @overload + def _get_scores( + self, + splits: list[SplitName] | None = None, + languages: list[ISOLanguage | ISOLanguageScript] | None = None, + scripts: list[ISOLanguageScript] | None = None, + getter: Callable[[ScoresDict], Score] | None = None, + aggregation: Callable[[list[Score]], Any] | None = None, + format: Literal["wide"] = "wide", + ) -> dict: ... 
+ + @overload + def _get_scores( + self, + splits: list[SplitName] | None = None, + languages: list[ISOLanguage | ISOLanguageScript] | None = None, + scripts: list[ISOLanguageScript] | None = None, + getter: Callable[[ScoresDict], Score] | None = None, + aggregation: Callable[[list[Score]], Any] | None = None, + format: Literal["long"] = "long", + ) -> list: ... + def _get_scores( self, splits: list[SplitName] | None = None, @@ -177,6 +202,9 @@ def _get_scores( aggregation = aggregation if aggregation is not None else np.mean else: use_fast = True + aggregation = cast(Callable[[list[Score]], Any], aggregation) + getter = cast(Callable[[ScoresDict], Score], getter) + if format == "wide": scores = {} for res in self.task_results: @@ -315,7 +343,7 @@ def to_dataframe( def __hash__(self) -> int: return id(self) - def __iter__(self) -> Iterable[TaskResult]: + def __iter__(self) -> Iterable[TaskResult]: # type: ignore[override] return iter(self.task_results) def __getitem__(self, index) -> TaskResult: @@ -368,13 +396,13 @@ def task_names(self) -> list[str]: return [task_res.task_name for task_res in self.task_results] @property - def modalities(self) -> list[str]: + def modalities(self) -> list[Modalities]: """Get all modalities in the task results. Returns: A list of modalities in the task results. 
""" - mods = [] + mods: list[Modalities] = [] for task_res in self.task_results: task_modalities = getattr(task_res, "modalities", []) mods.extend(task_modalities) diff --git a/mteb/results/task_result.py b/mteb/results/task_result.py index 966af0c86c..72ba2f15a8 100644 --- a/mteb/results/task_result.py +++ b/mteb/results/task_result.py @@ -2,7 +2,6 @@ import json import logging -from argparse import Namespace from collections import defaultdict from collections.abc import Callable, Iterable, Mapping from functools import cached_property @@ -16,8 +15,11 @@ from pydantic import BaseModel, field_validator from typing_extensions import Self +from mteb import TaskMetadata from mteb._helpful_enum import HelpfulStrEnum +from mteb.abstasks import AbsTaskClassification from mteb.abstasks.abstask import AbsTask +from mteb.abstasks.task_metadata import TaskDomain from mteb.languages import LanguageScripts from mteb.models.model_meta import ScoringFunction from mteb.types import ( @@ -39,67 +41,59 @@ class Criteria(HelpfulStrEnum): DATASET_REVISION = "dataset_revision" -class ScalaNbClassificationDummy: +class ScalaNbClassificationDummy(AbsTaskClassification): """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( + metadata = TaskMetadata( name="ScalaNbClassification", + description="A dummy", main_score="accuracy", type="Classification", - hf_subsets_to_langscripts={ - "default": ["nob-Latn"], - }, - dataset={"revision": "revision_not_applicable"}, - revision="revision_not_applicable", + eval_langs=["nob-Latn"], + dataset={"path": "not/exists", "revision": "revision_not_applicable"}, ) -class ScalaNnClassificationDummy: +class ScalaNnClassificationDummy(AbsTaskClassification): """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( + metadata = TaskMetadata( name="ScalaNnClassification", + description="A dummy", main_score="accuracy", type="Classification", - hf_subsets_to_langscripts={ - "default": 
["nno-Latn"], - }, - dataset={"revision": "revision_not_applicable"}, - revision="revision_not_applicable", + eval_langs=["nno-Latn"], + dataset={"path": "not/exists", "revision": "revision_not_applicable"}, ) -class ScalaDaClassificationDummy: +class ScalaDaClassificationDummy(AbsTaskClassification): """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( + metadata = TaskMetadata( name="ScalaDaClassification", + description="A dummy", main_score="accuracy", type="Classification", - hf_subsets_to_langscripts={ - "default": ["dan-Latn"], - }, - dataset={"revision": "revision_not_applicable"}, - revision="revision_not_applicable", + eval_langs=["dan-Latn"], + dataset={"path": "not/exists", "revision": "revision_not_applicable"}, ) -class ScalaSvClassificationDummy: +class ScalaSvClassificationDummy(AbsTaskClassification): """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( + metadata = TaskMetadata( name="ScalaSvClassification", + description="A dummy", main_score="accuracy", type="Classification", - hf_subsets_to_langscripts={ - "default": ["swe-Latn"], - }, - dataset={"revision": "revision_not_applicable"}, - revision="revision_not_applicable", + eval_langs=["swe-Latn"], + dataset={"path": "not/exists", "revision": "revision_not_applicable"}, ) -outdated_tasks = { +outdated_tasks: dict[str, type[AbsTask]] = { "ScalaNbClassification": ScalaNbClassificationDummy, "ScalaNnClassification": ScalaNnClassificationDummy, "ScalaDaClassification": ScalaDaClassificationDummy, @@ -169,7 +163,7 @@ def from_task_results( scores: dict[SplitName, Mapping[HFSubset, ScoresDict]], evaluation_time: float, kg_co2_emissions: float | None = None, - ) -> Self: + ) -> TaskResult: """Create a TaskResult from the task and scores. 
Args: @@ -246,7 +240,7 @@ def task(self) -> AbsTask: return get_task(self.task_name) @property - def domains(self) -> list[str]: + def domains(self) -> list[TaskDomain]: """Get the domains of the task.""" doms = self.task.metadata.domains if doms is None: @@ -307,7 +301,7 @@ def _round_scores(self, scores: dict[SplitName, list[ScoresDict]], n: int) -> No if isinstance(v, dict): self._round_scores(v, n) elif isinstance(v, float): - value[i] = round(v, n) + value[i] = round(v, n) # type: ignore[call-overload] elif isinstance(value, float): scores[key] = round(value, n) @@ -325,7 +319,7 @@ def to_disk(self, path: Path) -> None: json.dump(json_obj, f, indent=2) @classmethod - def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self: + def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult: """Load TaskResult from disk. Args: @@ -356,7 +350,7 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self: ) # assume it is before 1.11.0 if the version is not present try: - obj = cls.model_validate(data) + obj: TaskResult = cls.model_validate(data) except Exception as e: if not pre_1_11_load: raise e @@ -381,6 +375,7 @@ def _fix_pair_classification_scores(cls, obj: TaskResult) -> None: from mteb import get_task task_name = obj.task_name + task: AbsTask | type[AbsTask] if task_name in outdated_tasks: task = outdated_tasks[task_name] else: @@ -393,11 +388,11 @@ def _fix_pair_classification_scores(cls, obj: TaskResult) -> None: for key in list(hf_subset_scores.keys()): if isinstance(hf_subset_scores[key], dict): for k, v in hf_subset_scores[key].items(): - hf_subset_scores[f"{key}_{k}"] = v - hf_subset_scores.pop(key) + hf_subset_scores[f"{key}_{k}"] = v # type: ignore[index] + hf_subset_scores.pop(key) # type: ignore[attr-defined] @classmethod - def _convert_from_before_v1_11_0(cls, data: dict) -> Self: + def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult: from mteb.get_tasks import _TASKS_REGISTRY # in case the 
task name is not found in the registry, try to find a lower case version @@ -532,7 +527,7 @@ def get_score( def _get_score_fast( self, splits: Iterable[str] | None = None, - languages: str | None = None, + languages: list[ISOLanguage | ISOLanguageScript] | None = None, subsets: Iterable[str] | None = None, ) -> float: """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified. @@ -581,7 +576,7 @@ def _get_score_fast( return val_sum / n_val @classmethod - def from_validated(cls, **data) -> Self: + def from_validated(cls, **data) -> TaskResult: """Create a TaskResult from validated data. Returns: @@ -592,13 +587,13 @@ def from_validated(cls, **data) -> Self: def __repr__(self) -> str: return f"TaskResult(task_name={self.task_name}, scores=...)" - def only_main_score(self) -> Self: + def only_main_score(self) -> TaskResult: """Return a new TaskResult object with only the main score. Returns: A new TaskResult object with only the main score. """ - new_scores = {} + new_scores: dict[str, list[Score]] = {} for split in self.scores: new_scores[split] = [] for subset_scores in self.scores[split]: @@ -610,10 +605,9 @@ def only_main_score(self) -> Self: } ) new_res = {**self.to_dict(), "scores": new_scores} - new_res = TaskResult.from_validated(**new_res) - return new_res + return TaskResult.from_validated(**new_res) - def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self: + def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult: """Validate and filter the scores against the task metadata. This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata. 
@@ -634,9 +628,9 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self: splits = task.eval_splits hf_subsets = task.hf_subsets - hf_subsets = set(hf_subsets) + set_hf_subsets = set(hf_subsets) - new_scores = {} + new_scores: dict[str, list[Score]] = {} seen_splits = set() for split in self.scores: if split not in splits: @@ -644,12 +638,12 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self: new_scores[split] = [] seen_subsets = set() for _scores in self.scores[split]: - if _scores["hf_subset"] not in hf_subsets: + if _scores["hf_subset"] not in set_hf_subsets: continue new_scores[split].append(_scores) seen_subsets.add(_scores["hf_subset"]) - if seen_subsets != hf_subsets: - missing_subsets = hf_subsets - seen_subsets + if seen_subsets != set_hf_subsets: + missing_subsets = set_hf_subsets - seen_subsets if len(missing_subsets) > 2: subset1, subset2 = list(missing_subsets)[:2] missing_subsets_str = f"{{'{subset1}', '{subset2}', ...}}" @@ -665,8 +659,7 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self: f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}" ) new_res = {**self.to_dict(), "scores": new_scores} - new_res = TaskResult.from_validated(**new_res) - return new_res + return TaskResult.from_validated(**new_res) def is_mergeable( self, @@ -734,7 +727,7 @@ def merge( "mteb_version", "dataset_revision", ], - ) -> Self: + ) -> TaskResult: """Merges two TaskResult objects. Args: diff --git a/mteb/types/statistics.py b/mteb/types/statistics.py index 7714807e0e..97737c387c 100644 --- a/mteb/types/statistics.py +++ b/mteb/types/statistics.py @@ -10,8 +10,14 @@ class SplitDescriptiveStatistics(TypedDict): class DescriptiveStatistics(TypedDict, SplitDescriptiveStatistics): - """Class for descriptive statistics for the full task.""" + """Class for descriptive statistics for the full task. 
+ Attributes: + num_samples: Total number of samples + hf_subset_descriptive_stats: HFSubset descriptive statistics (only for multilingual datasets) + """ + + num_samples: int hf_subset_descriptive_stats: NotRequired[dict[HFSubset, SplitDescriptiveStatistics]] From 651c0e0f25c16cc4dea4d3524400b28e6c61f30e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 23 Dec 2025 01:33:47 +0300 Subject: [PATCH 19/32] fix search wrappers --- mteb/models/search_wrappers.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/mteb/models/search_wrappers.py b/mteb/models/search_wrappers.py index 8a441446b8..9b2b5f9f6d 100644 --- a/mteb/models/search_wrappers.py +++ b/mteb/models/search_wrappers.py @@ -218,14 +218,19 @@ def _full_corpus_search( encode_kwargs: dict[str, Any], ) -> dict[str, list[tuple[float, str]]]: logger.info("Encoding Corpus in batches (this might take a while)...") - itr = range(0, len(self.task_corpus), self.corpus_chunk_size) # type: ignore[arg-type] + if self.task_corpus is None: + raise ValueError("Corpus must be indexed before searching.") + + itr = range(0, len(self.task_corpus), self.corpus_chunk_size) - result_heaps = {qid: [] for qid in query_idx_to_id.values()} + result_heaps: dict[str, list[tuple[float, str]]] = { + qid: [] for qid in query_idx_to_id.values() + } for batch_num, corpus_start_idx in enumerate(itr): logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...") corpus_end_idx = min( corpus_start_idx + self.corpus_chunk_size, - len(self.task_corpus), # type: ignore[arg-type] + len(self.task_corpus), ) sub_corpus = self.task_corpus.select( range(corpus_start_idx, corpus_end_idx) @@ -320,7 +325,11 @@ def _rerank_documents( Returns: A dictionary mapping query IDs to a list of tuples, each containing a relevance score and a document ID. 
""" - result_heaps = {qid: [] for qid in query_idx_to_id.values()} + if self.task_corpus is None: + raise ValueError("Corpus must be indexed before searching.") + result_heaps: dict[str, list[tuple[float, str]]] = { + qid: [] for qid in query_idx_to_id.values() + } doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)} all_doc_embeddings = self.model.encode( @@ -387,12 +396,12 @@ def _rerank_documents( def _rerank_sort_results( self, - result_heaps: list[tuple[float, str]], + result_heaps: dict[str, list[tuple[float, str]]], query_id: str, ranked_ids: list[str], scores_top_k_idx: torch.Tensor, scores_top_k_values: torch.Tensor, - ) -> list[tuple[float, str]]: + ) -> dict[str, list[tuple[float, str]]]: """Sort the heap into descending order list. Returns: @@ -503,6 +512,8 @@ def search( raise ValueError( "CrossEncoder search requires top_ranked documents for reranking." ) + if self.task_corpus is None: + raise ValueError("Corpus must be indexed before searching.") query_id_to_idx = {row["id"]: i for i, row in enumerate(queries)} doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)} @@ -541,7 +552,7 @@ def search( hf_subset=hf_subset, ) - results = {qid: {} for qid in queries["id"]} + results: RetrievalOutputType = {qid: {} for qid in queries["id"]} for (query_id, corpus_id), score in zip(doc_pairs_ids, predictions): results[query_id][corpus_id] = float(score) From d0c061fcef420e35dd7448fbfb5038707b5074e7 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 23 Dec 2025 01:35:37 +0300 Subject: [PATCH 20/32] add ci --- .github/workflows/typechecking.yml | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/workflows/typechecking.yml diff --git a/.github/workflows/typechecking.yml b/.github/workflows/typechecking.yml new file mode 100644 index 0000000000..650471baf6 --- /dev/null +++ b/.github/workflows/typechecking.yml @@ -0,0 +1,45 @@ 
+name: Typechecking + +on: + push: + branches: [main] + pull_request: + + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Free disk space + run: | + sudo rm -rf \ + "$AGENT_TOOLSDIRECTORY" \ + /opt/ghc \ + /opt/google/chrome \ + /opt/microsoft/msedge \ + /opt/microsoft/powershell \ + /opt/pipx \ + /usr/lib/mono \ + /usr/local/julia* \ + /usr/local/lib/android \ + /usr/local/lib/node_modules \ + /usr/local/share/chromium \ + /usr/local/share/powershell \ + /usr/local/share/powershell \ + /usr/share/dotnet \ + /usr/share/swift + docker system prune -af + + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: "3.10" + + - name: Dependencies + run: | + make install-for-tests + pip install -e . --group typing + + - name: Build and Deploy + run: | + make typecheck From 9846e66c6ffdf01803661565bc9235e604b86d7a Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 23 Dec 2025 02:06:43 +0300 Subject: [PATCH 21/32] fix tests --- .../search_encoder_index/search_indexes/faiss_search_index.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py b/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py index a383bd6819..f690bc36b6 100644 --- a/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +++ b/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py @@ -108,7 +108,7 @@ def search( ids = ids.tolist() if issubclass(self.index_type, faiss.IndexFlatL2): - similarities = -np.sqrt(np.maximum(similarities, 0)).tolist() + similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist() return similarities, ids diff --git a/pyproject.toml b/pyproject.toml index 06590de4f6..a96d084b08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,7 +124,7 @@ docs = [ "tabulate>=0.9.0", ] typing = [ - "mypy>=1.18.1", + "mypy==1.19.1", 
"types-cachetools>=6.2.0.20250827", "types-pysocks>=1.7.1.20250828", "types-pyyaml>=6.0.12.20250822", From 1858d7e2a875aa3a10caaec968563f4982ba8793 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 23 Dec 2025 02:17:33 +0300 Subject: [PATCH 22/32] fix 3.10 types --- .github/workflows/typechecking.yml | 2 +- mteb/results/benchmark_results.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/typechecking.yml b/.github/workflows/typechecking.yml index 650471baf6..c7e38caeb8 100644 --- a/.github/workflows/typechecking.yml +++ b/.github/workflows/typechecking.yml @@ -7,7 +7,7 @@ on: jobs: - deploy: + typecheck: runs-on: ubuntu-latest steps: - name: Free disk space diff --git a/mteb/results/benchmark_results.py b/mteb/results/benchmark_results.py index 5291b64788..df8850b23f 100644 --- a/mteb/results/benchmark_results.py +++ b/mteb/results/benchmark_results.py @@ -241,8 +241,8 @@ def keep_best(group: pd.DataFrame) -> pd.DataFrame: model_results = [] for (model, model_revision), group in task_df.groupby(["model", "revision"]): model_result = ModelResult.model_construct( - model_name=model, - model_revision=model_revision, + model_name=model, # type: ignore[arg-type] + model_revision=model_revision, # type: ignore[arg-type] task_results=list(group["task_result"]), ) model_results.append(model_result) From 39cbc214d86ec802548e5b3918516949b37e5c38 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 23 Dec 2025 02:28:26 +0300 Subject: [PATCH 23/32] rollback overload --- mteb/filter_tasks.py | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/mteb/filter_tasks.py b/mteb/filter_tasks.py index 151503f545..8ee5cc3523 100644 --- a/mteb/filter_tasks.py +++ b/mteb/filter_tasks.py @@ -2,7 +2,7 @@ import logging from collections.abc import Sequence -from typing import TypeVar +from typing 
import overload from mteb.abstasks import ( AbsTask, @@ -32,11 +32,42 @@ def _check_is_valid_language(lang: str) -> None: ) -T = TypeVar("T", AbsTask, type[AbsTask]) +@overload +def filter_tasks( + tasks: Sequence[AbsTask], + *, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + domains: Sequence[TaskDomain] | None = None, + task_types: Sequence[TaskType] | None = None, + categories: Sequence[TaskCategory] | None = None, + modalities: Sequence[Modalities] | None = None, + exclusive_modality_filter: bool = False, + exclude_superseded: bool = False, + exclude_aggregate: bool = False, + exclude_private: bool = False, +) -> list[AbsTask]: ... + + +@overload +def filter_tasks( + tasks: Sequence[type[AbsTask]], + *, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + domains: Sequence[TaskDomain] | None = None, + task_types: Sequence[TaskType] | None = None, + categories: Sequence[TaskCategory] | None = None, + modalities: Sequence[Modalities] | None = None, + exclusive_modality_filter: bool = False, + exclude_superseded: bool = False, + exclude_aggregate: bool = False, + exclude_private: bool = False, +) -> list[type[AbsTask]]: ... def filter_tasks( - tasks: Sequence[T], + tasks: Sequence[AbsTask] | Sequence[type[AbsTask]], *, languages: Sequence[str] | None = None, script: Sequence[str] | None = None, @@ -48,7 +79,7 @@ def filter_tasks( exclude_superseded: bool = False, exclude_aggregate: bool = False, exclude_private: bool = False, -) -> Sequence[T]: +) -> list[AbsTask] | list[type[AbsTask]]: """Filter tasks based on the specified criteria. 
Args: @@ -146,4 +177,4 @@ def _convert_to_set(domain: list[TaskDomain] | None) -> set: _tasks.append(t) - return _tasks + return _tasks # type: ignore[return-value] # type checker cannot infer the overload return type From 3b3f7984fd4dbd38abf32d7974609178ee0ad5ef Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 26 Dec 2025 11:30:52 +0300 Subject: [PATCH 24/32] fixes after merge --- mteb/benchmarks/benchmark.py | 4 ++-- mteb/cache.py | 19 ++++++++++--------- mteb/results/benchmark_results.py | 5 +++-- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index 1b1578da78..41db0641b4 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Iterable, Sequence +from collections.abc import Iterator, Sequence from dataclasses import dataclass, field from typing import TYPE_CHECKING, Literal @@ -47,7 +47,7 @@ class Benchmark: display_name: str | None = None language_view: list[str] | Literal["all"] = field(default_factory=list) - def __iter__(self) -> Iterable[AbsTask]: + def __iter__(self) -> Iterator[AbsTask]: return iter(self.tasks) def __len__(self) -> int: diff --git a/mteb/cache.py b/mteb/cache.py index 813c2b1bb3..5e39314ea6 100644 --- a/mteb/cache.py +++ b/mteb/cache.py @@ -4,7 +4,7 @@ import shutil import subprocess from collections import defaultdict -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from pathlib import Path from typing import cast @@ -288,8 +288,8 @@ def __repr__(self) -> str: def get_cache_paths( self, - models: Sequence[str] | Sequence[ModelMeta] | None = None, - tasks: Sequence[str] | Sequence[AbsTask] | None = None, + models: Sequence[str] | Iterable[ModelMeta] | None = None, + tasks: Sequence[str] | Iterable[AbsTask] | None = None, require_model_meta: bool = True, include_remote: bool = 
True, ) -> list[Path]: @@ -422,7 +422,7 @@ def _get_model_name_and_revision_from_path( @staticmethod def _filter_paths_by_model_and_revision( paths: list[Path], - models: Sequence[str] | Sequence[ModelMeta] | None = None, + models: Sequence[str] | Iterable[ModelMeta] | None = None, ) -> list[Path]: """Filter a list of paths by model name and optional revision. @@ -432,8 +432,9 @@ def _filter_paths_by_model_and_revision( if not models: return paths - if isinstance(models[0], ModelMeta): - models = cast(list[ModelMeta], models) + first_model = next(iter(models)) + if isinstance(first_model, ModelMeta): + models = cast(Iterable[ModelMeta], models) name_and_revision = { (m.model_name_as_path(), m.revision or "no_revision_available") for m in models @@ -451,7 +452,7 @@ def _filter_paths_by_model_and_revision( @staticmethod def _filter_paths_by_task( paths: list[Path], - tasks: Sequence[str] | Sequence[AbsTask] | None = None, + tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | None = None, ) -> list[Path]: if tasks is not None: task_names = set() @@ -467,8 +468,8 @@ def _filter_paths_by_task( def load_results( self, - models: Sequence[str] | Sequence[ModelMeta] | None = None, - tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None, + models: Sequence[str] | Iterable[ModelMeta] | None = None, + tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | str | None = None, require_model_meta: bool = True, include_remote: bool = True, validate_and_filter: bool = False, diff --git a/mteb/results/benchmark_results.py b/mteb/results/benchmark_results.py index 17e752408d..f3ffeb60db 100644 --- a/mteb/results/benchmark_results.py +++ b/mteb/results/benchmark_results.py @@ -35,11 +35,12 @@ logger = logging.getLogger(__name__) -# Global cache for model metas and version parsing @functools.lru_cache def _get_cached_model_metas() -> dict[str, str | None]: """Cache model metas to avoid repeated calls.""" - return {meta.name: meta.revision for meta in 
get_model_metas()} + return { + meta.name: meta.revision for meta in get_model_metas() if meta.name is not None + } @functools.lru_cache(maxsize=10000) From 93d223029c9cceba8fcd167ac82d3c67dab58293 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 26 Dec 2025 11:43:14 +0300 Subject: [PATCH 25/32] change to iterable --- mteb/cache.py | 4 ++-- mteb/cli/_display_tasks.py | 4 ++-- mteb/deprecated_evaluator.py | 2 +- mteb/evaluate.py | 4 ++-- mteb/filter_tasks.py | 32 +++++++++++++++---------------- mteb/get_tasks.py | 4 ++-- mteb/load_results.py | 6 +++--- mteb/results/benchmark_results.py | 4 ++-- mteb/results/model_result.py | 4 ++-- 9 files changed, 32 insertions(+), 32 deletions(-) diff --git a/mteb/cache.py b/mteb/cache.py index 5e39314ea6..a6f9f8c2c6 100644 --- a/mteb/cache.py +++ b/mteb/cache.py @@ -452,7 +452,7 @@ def _filter_paths_by_model_and_revision( @staticmethod def _filter_paths_by_task( paths: list[Path], - tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | None = None, + tasks: Sequence[str] | Iterable[AbsTask] | None = None, ) -> list[Path]: if tasks is not None: task_names = set() @@ -469,7 +469,7 @@ def _filter_paths_by_task( def load_results( self, models: Sequence[str] | Iterable[ModelMeta] | None = None, - tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | str | None = None, + tasks: Sequence[str] | Iterable[AbsTask] | str | None = None, require_model_meta: bool = True, include_remote: bool = True, validate_and_filter: bool = False, diff --git a/mteb/cli/_display_tasks.py b/mteb/cli/_display_tasks.py index 4b4fa1268d..cda4f36a00 100644 --- a/mteb/cli/_display_tasks.py +++ b/mteb/cli/_display_tasks.py @@ -1,4 +1,4 @@ -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from mteb.abstasks import AbsTask from mteb.benchmarks import Benchmark @@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None: _display_tasks(benchmark.tasks, 
name=name) -def _display_tasks(task_list: Sequence[AbsTask], name: str | None = None) -> None: +def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None: from rich.console import Console console = Console() diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index 54844764bc..853fb7e813 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -53,7 +53,7 @@ class MTEB: ) def __init__( self, - tasks: Iterable[AbsTask | Benchmark], + tasks: Iterable[AbsTask] | Iterable[Benchmark], *, err_logs_path: str = "error_logs.txt", ) -> None: diff --git a/mteb/evaluate.py b/mteb/evaluate.py index 331028919f..6ba1d75b07 100644 --- a/mteb/evaluate.py +++ b/mteb/evaluate.py @@ -183,7 +183,7 @@ def _evaluate_task( def _check_model_modalities( model: ModelMeta, - tasks: AbsTask | Benchmark | Iterable[AbsTask], + tasks: AbsTask | Iterable[AbsTask], ) -> None: """Check that model modalities are compatible with task modalities. @@ -267,7 +267,7 @@ def _requires_merge(task: AbsTask, existing_results: TaskResult) -> bool: def evaluate( model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder, - tasks: AbsTask | Benchmark | Iterable[AbsTask], + tasks: AbsTask | Iterable[AbsTask], *, co2_tracker: bool | None = None, raise_error: bool = True, diff --git a/mteb/filter_tasks.py b/mteb/filter_tasks.py index 8ee5cc3523..ea0f5cc0f8 100644 --- a/mteb/filter_tasks.py +++ b/mteb/filter_tasks.py @@ -1,7 +1,7 @@ """This script contains functions that are used to get an overview of the MTEB benchmark.""" import logging -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from typing import overload from mteb.abstasks import ( @@ -34,14 +34,14 @@ def _check_is_valid_language(lang: str) -> None: @overload def filter_tasks( - tasks: Sequence[AbsTask], + tasks: Iterable[AbsTask], *, languages: Sequence[str] | None = None, script: Sequence[str] | None = None, - domains: Sequence[TaskDomain] | 
None = None, - task_types: Sequence[TaskType] | None = None, - categories: Sequence[TaskCategory] | None = None, - modalities: Sequence[Modalities] | None = None, + domains: Iterable[TaskDomain] | None = None, + task_types: Iterable[TaskType] | None = None, + categories: Iterable[TaskCategory] | None = None, + modalities: Iterable[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_superseded: bool = False, exclude_aggregate: bool = False, @@ -51,14 +51,14 @@ def filter_tasks( @overload def filter_tasks( - tasks: Sequence[type[AbsTask]], + tasks: Iterable[type[AbsTask]], *, languages: Sequence[str] | None = None, script: Sequence[str] | None = None, - domains: Sequence[TaskDomain] | None = None, - task_types: Sequence[TaskType] | None = None, - categories: Sequence[TaskCategory] | None = None, - modalities: Sequence[Modalities] | None = None, + domains: Iterable[TaskDomain] | None = None, + task_types: Iterable[TaskType] | None = None, + categories: Iterable[TaskCategory] | None = None, + modalities: Iterable[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_superseded: bool = False, exclude_aggregate: bool = False, @@ -67,14 +67,14 @@ def filter_tasks( def filter_tasks( - tasks: Sequence[AbsTask] | Sequence[type[AbsTask]], + tasks: Iterable[AbsTask] | Iterable[type[AbsTask]], *, languages: Sequence[str] | None = None, script: Sequence[str] | None = None, - domains: Sequence[TaskDomain] | None = None, - task_types: Sequence[TaskType] | None = None, - categories: Sequence[TaskCategory] | None = None, - modalities: Sequence[Modalities] | None = None, + domains: Iterable[TaskDomain] | None = None, + task_types: Iterable[TaskType] | None = None, + categories: Iterable[TaskCategory] | None = None, + modalities: Iterable[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_superseded: bool = False, exclude_aggregate: bool = False, diff --git a/mteb/get_tasks.py b/mteb/get_tasks.py index 
614506d0d3..f9f9a68fef 100644 --- a/mteb/get_tasks.py +++ b/mteb/get_tasks.py @@ -3,7 +3,7 @@ import difflib import logging from collections import Counter, defaultdict -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from typing import Any import pandas as pd @@ -42,7 +42,7 @@ def _create_name_to_task_mapping( return metadata_names -def _create_similar_tasks(tasks: Sequence[type[AbsTask]]) -> dict[str, list[str]]: +def _create_similar_tasks(tasks: Iterable[type[AbsTask]]) -> dict[str, list[str]]: """Create a dictionary of similar tasks. Returns: diff --git a/mteb/load_results.py b/mteb/load_results.py index 572c8a1547..c306423bd5 100644 --- a/mteb/load_results.py +++ b/mteb/load_results.py @@ -1,7 +1,7 @@ import json import logging import sys -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from pathlib import Path from mteb.abstasks.abstask import AbsTask @@ -45,8 +45,8 @@ def _model_name_and_revision( def load_results( results_repo: str = "https://github.com/embeddings-benchmark/results", download_latest: bool = True, - models: Sequence[ModelMeta] | Sequence[str] | None = None, - tasks: Sequence[AbsTask] | Sequence[str] | None = None, + models: Iterable[ModelMeta] | Sequence[str] | None = None, + tasks: Iterable[AbsTask] | Sequence[str] | None = None, validate_and_filter: bool = True, require_model_meta: bool = True, only_main_score: bool = False, diff --git a/mteb/results/benchmark_results.py b/mteb/results/benchmark_results.py index f3ffeb60db..b3d39da150 100644 --- a/mteb/results/benchmark_results.py +++ b/mteb/results/benchmark_results.py @@ -4,7 +4,7 @@ import json import logging import warnings -from collections.abc import Callable, Iterable, Iterator, Sequence +from collections.abc import Callable, Iterable, Iterator from pathlib import Path from typing import Any, Literal, cast @@ -100,7 +100,7 @@ def _filter_tasks( model_results=[res for res in model_results if res.task_results] ) 
- def select_tasks(self, tasks: Sequence[AbsTask]) -> BenchmarkResults: + def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults: """Select tasks from the benchmark results. Args: diff --git a/mteb/results/model_result.py b/mteb/results/model_result.py index f8a621e6c5..9e9c8db509 100644 --- a/mteb/results/model_result.py +++ b/mteb/results/model_result.py @@ -2,7 +2,7 @@ import logging import warnings -from collections.abc import Callable, Iterable, Sequence +from collections.abc import Callable, Iterable from typing import Any, Literal, cast import numpy as np @@ -145,7 +145,7 @@ def _filter_tasks( task_results=new_task_results, ) - def select_tasks(self, tasks: Sequence[AbsTask]) -> ModelResult: + def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult: """Select tasks from the ModelResult based on a list of AbsTask objects. Args: From 9b3c1d4b7026592e45e7efb9f36556ddff7ecb52 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 26 Dec 2025 12:23:16 +0300 Subject: [PATCH 26/32] add fixes --- .../text/summarization_evaluator.py | 18 +++++++----------- mteb/abstasks/multilabel_classification.py | 2 +- mteb/abstasks/pair_classification.py | 4 +++- mteb/deprecated_evaluator.py | 13 ++++++------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/mteb/_evaluators/text/summarization_evaluator.py b/mteb/_evaluators/text/summarization_evaluator.py index 980e7ca095..a57dc78474 100644 --- a/mteb/_evaluators/text/summarization_evaluator.py +++ b/mteb/_evaluators/text/summarization_evaluator.py @@ -1,6 +1,6 @@ import logging import sys -from typing import Any, TypedDict, cast +from typing import Any, TypedDict import numpy as np import torch @@ -218,23 +218,19 @@ def _calculate_metrics( strict=True, ): cosine_spearman_scores.append( - cast(float, spearmanr(human_scores, cosine_pred_scores).statistic) + float(spearmanr(human_scores, cosine_pred_scores).statistic) ) cosine_pearson_scores.append( 
- cast(float, pearsonr(human_scores, cosine_pred_scores).statistic) + float(pearsonr(human_scores, cosine_pred_scores).statistic) ) dot_spearman_scores.append( - cast(float, spearmanr(human_scores, dot_pred_scores).statistic) + float(spearmanr(human_scores, dot_pred_scores).statistic) ) dot_pearson_scores.append( - cast(float, pearsonr(human_scores, dot_pred_scores).statistic) - ) - spearman_scores.append( - cast(float, spearmanr(human_scores, sim_scores).statistic) - ) - pearson_scores.append( - cast(float, pearsonr(human_scores, sim_scores).statistic) + float(pearsonr(human_scores, dot_pred_scores).statistic) ) + spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic)) + pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic)) return SummarizationMetrics( pearson=float(np.mean(pearson_scores)), diff --git a/mteb/abstasks/multilabel_classification.py b/mteb/abstasks/multilabel_classification.py index b970ee9f9a..5c959781c9 100644 --- a/mteb/abstasks/multilabel_classification.py +++ b/mteb/abstasks/multilabel_classification.py @@ -234,7 +234,7 @@ def _undersample_data_indices( """ sample_indices = [] if idxs is None: - idxs = list(range(len(y))) + idxs = list(np.arange(len(y))) self.np_rng.shuffle(idxs) label_counter: dict[int, int] = defaultdict(int) for i in idxs: diff --git a/mteb/abstasks/pair_classification.py b/mteb/abstasks/pair_classification.py index b6ad2f59ec..fa3daa09f0 100644 --- a/mteb/abstasks/pair_classification.py +++ b/mteb/abstasks/pair_classification.py @@ -241,7 +241,9 @@ def _compute_image_hash(inputs: list) -> list[str]: def _push_dataset_to_hub(self, repo_name: str) -> None: # previously pair classification datasets were stored in a single row if self.dataset is None: - raise RuntimeError("Dataset not loaded") + raise RuntimeError( + "Dataset not loaded. To load dataset run `task.load_data()`." 
+ ) if self.metadata.is_multilingual: for subset in self.dataset: for split in self.dataset[subset]: diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index 853fb7e813..a90dcc6cbc 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -21,12 +21,10 @@ from mteb.benchmarks import Benchmark from mteb.models import ( CrossEncoderWrapper, - EncoderProtocol, ModelMeta, MTEBModels, SentenceTransformerEncoderWrapper, ) -from mteb.models.models_protocols import CrossEncoderProtocol from mteb.results import TaskResult from mteb.types import ScoresDict @@ -317,9 +315,9 @@ def run( mteb_model: MTEBModels if isinstance(model, SentenceTransformer): - mteb_model = cast(EncoderProtocol, SentenceTransformerEncoderWrapper(model)) + mteb_model = SentenceTransformerEncoderWrapper(model) elif isinstance(model, CrossEncoder): - mteb_model = cast(CrossEncoderProtocol, CrossEncoderWrapper(model)) + mteb_model = CrossEncoderWrapper(model) else: mteb_model = cast(MTEBModels, model) @@ -352,7 +350,6 @@ def run( logger.info( f"\n\n********************** Evaluating {task.metadata.name} **********************" ) - save_path: Path | None = None if task.is_aggregate: aggregated_task = cast(AbsTaskAggregate, task) @@ -375,8 +372,9 @@ def run( evaluation_results.append(new_results) if output_path: - save_path = output_path / f"{aggregated_task.metadata.name}.json" - new_results.to_disk(save_path) + new_results.to_disk( + output_path / f"{aggregated_task.metadata.name}.json" + ) del self.tasks[0] continue @@ -398,6 +396,7 @@ def run( task_subsets = task.hf_subsets existing_results = None + save_path: Path | None = None final_splits_to_run = task_eval_splits missing_evaluations = self._get_missing_evaluations( existing_results, From 3d8c0738310ebe7acf5b406d1a06b0a87ff860a4 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 26 Dec 2025 12:25:16 +0300 Subject: [PATCH 27/32] remove summarization scores 
hint --- mteb/_evaluators/text/summarization_evaluator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mteb/_evaluators/text/summarization_evaluator.py b/mteb/_evaluators/text/summarization_evaluator.py index a57dc78474..0efc311715 100644 --- a/mteb/_evaluators/text/summarization_evaluator.py +++ b/mteb/_evaluators/text/summarization_evaluator.py @@ -203,12 +203,12 @@ def _calculate_metrics( self, distances: SummarizationDistances, ) -> SummarizationMetrics: - cosine_spearman_scores: list[float] = [] - cosine_pearson_scores: list[float] = [] - dot_spearman_scores: list[float] = [] - dot_pearson_scores: list[float] = [] - pearson_scores: list[float] = [] - spearman_scores: list[float] = [] + cosine_spearman_scores = [] + cosine_pearson_scores = [] + dot_spearman_scores = [] + dot_pearson_scores = [] + pearson_scores = [] + spearman_scores = [] for human_scores, cosine_pred_scores, dot_pred_scores, sim_scores in zip( distances["human_scores"], From ed773c0dd89f830f5b80fd5215049eeabd5f3a34 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 26 Dec 2025 12:46:43 +0300 Subject: [PATCH 28/32] simplify deprecated_evaluator --- mteb/deprecated_evaluator.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index a90dcc6cbc..02522a42ad 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -8,6 +8,7 @@ from collections.abc import Iterable, Sequence from copy import deepcopy from datetime import datetime +from itertools import chain from pathlib import Path from time import time from typing import TYPE_CHECKING, Any, cast @@ -62,19 +63,11 @@ def __init__( `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)"). err_logs_path: Path to save error logs. 
""" - from mteb.benchmarks import Benchmark - - if isinstance(tasks, list) and all( - isinstance(task, Benchmark) for task in tasks - ): + if isinstance(next(iter(tasks)), Benchmark): self.benchmarks = tasks - self.tasks = [task for bench in tasks for task in bench.tasks] - elif isinstance(tasks, list) and all( - isinstance(task, AbsTask) for task in tasks - ): - self.tasks = list(tasks) - else: - raise ValueError("tasks must be a list of AbsTask or Benchmark instances.") + self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks))) + elif isinstance(next(iter(tasks)), AbsTask): + self.tasks = list(cast(Iterable[AbsTask], tasks)) self.err_logs_path = Path(err_logs_path) self._last_evaluated_splits: dict[str, list[str]] = {} From db47e14ed61867466024ed31a310dbdcc051b394 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 26 Dec 2025 12:54:02 +0300 Subject: [PATCH 29/32] simplify model conversion --- mteb/abstasks/multilabel_classification.py | 7 +++++-- mteb/abstasks/text/bitext_mining.py | 19 ++----------------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/mteb/abstasks/multilabel_classification.py b/mteb/abstasks/multilabel_classification.py index 5c959781c9..8731b86b0f 100644 --- a/mteb/abstasks/multilabel_classification.py +++ b/mteb/abstasks/multilabel_classification.py @@ -16,7 +16,7 @@ from mteb._create_dataloaders import create_dataloader from mteb._evaluators.classification_metrics import hamming_score from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types import Array from .classification import AbsTaskClassification @@ -80,7 +80,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification): @override def _evaluate_subset( # type: ignore[override] self, - model: EncoderProtocol, + model: MTEBModels, data_split: DatasetDict, *, encode_kwargs: dict[str, 
Any], @@ -89,6 +89,9 @@ def _evaluate_subset( # type: ignore[override] prediction_folder: Path | None = None, **kwargs: Any, ) -> FullMultilabelClassificationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + if isinstance(data_split, DatasetDict): data_split = data_split.select_columns( [self.input_column_name, self.label_column_name] diff --git a/mteb/abstasks/text/bitext_mining.py b/mteb/abstasks/text/bitext_mining.py index 1e3f302013..5ca00a62d3 100644 --- a/mteb/abstasks/text/bitext_mining.py +++ b/mteb/abstasks/text/bitext_mining.py @@ -10,7 +10,6 @@ from mteb.abstasks._statistics_calculation import calculate_text_statistics from mteb.abstasks.abstask import AbsTask from mteb.models import EncoderProtocol, MTEBModels -from mteb.models.models_protocols import CrossEncoderProtocol, SearchProtocol from mteb.types import HFSubset, ScoresDict from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics @@ -79,22 +78,8 @@ def evaluate( **kwargs: Any, ) -> dict[HFSubset, ScoresDict]: """Added load for "parallel" datasets""" - if isinstance(model, CrossEncoderProtocol) and not self._support_cross_encoder: - raise TypeError( - f"Model {model} is a CrossEncoder, but this task {self.metadata.name} does not support CrossEncoders. " - "Please use a Encoder model instead." - ) - - # encoders might implement search protocols - if ( - isinstance(model, SearchProtocol) - and not isinstance(model, EncoderProtocol) - and not self._support_search - ): - raise TypeError( - f"Model {model} is a SearchProtocol, but this task {self.metadata.name} does not support Search. " - "Please use a Encoder model instead." 
- ) + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") if not self.data_loaded: self.load_data() From 00bac9c1ecdeb3c696364bc2e3ddc0988eaa203e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 26 Dec 2025 13:07:11 +0300 Subject: [PATCH 30/32] add comment for typechecking --- mteb/abstasks/pair_classification.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mteb/abstasks/pair_classification.py b/mteb/abstasks/pair_classification.py index fa3daa09f0..96966f89bd 100644 --- a/mteb/abstasks/pair_classification.py +++ b/mteb/abstasks/pair_classification.py @@ -241,6 +241,8 @@ def _compute_image_hash(inputs: list) -> list[str]: def _push_dataset_to_hub(self, repo_name: str) -> None: # previously pair classification datasets were stored in a single row if self.dataset is None: + # overall this shouldn't happen as we check for dataset before pushing to hub + # added here for type checking purposes raise RuntimeError( "Dataset not loaded. To load dataset run `task.load_data()`." 
) From cb8cf8ef18156ea8078c4803a82a3ff0e42d7ef2 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 26 Dec 2025 13:14:00 +0300 Subject: [PATCH 31/32] remove casts --- mteb/evaluate.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mteb/evaluate.py b/mteb/evaluate.py index 6ba1d75b07..85245021ae 100644 --- a/mteb/evaluate.py +++ b/mteb/evaluate.py @@ -17,8 +17,6 @@ from mteb.cache import ResultCache from mteb.models.model_meta import ModelMeta from mteb.models.models_protocols import ( - CrossEncoderProtocol, - EncoderProtocol, MTEBModels, ) from mteb.models.sentence_transformer_wrapper import ( @@ -60,13 +58,11 @@ def _sanitize_model( wrapped_model: MTEBModels | ModelMeta if isinstance(model, SentenceTransformer): - wrapper = SentenceTransformerEncoderWrapper(model) - meta = wrapper.mteb_model_meta - wrapped_model = cast(EncoderProtocol, wrapper) + wrapped_model = SentenceTransformerEncoderWrapper(model) + meta = wrapped_model.mteb_model_meta elif isinstance(model, CrossEncoder): - cross_encoder_wrapper = CrossEncoderWrapper(model) - meta = cross_encoder_wrapper.mteb_model_meta - wrapped_model = cast(CrossEncoderProtocol, cross_encoder_wrapper) + wrapped_model = CrossEncoderWrapper(model) + meta = wrapped_model.mteb_model_meta elif hasattr(model, "mteb_model_meta"): meta = getattr(model, "mteb_model_meta") if not isinstance(meta, ModelMeta): From f33c354dd438e4c6ab547068a372a54c150ccbbe Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sat, 27 Dec 2025 15:55:24 +0500 Subject: [PATCH 32/32] remove duplicated function --- mteb/abstasks/aggregated_task.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/mteb/abstasks/aggregated_task.py b/mteb/abstasks/aggregated_task.py index 3d2d91f918..1480ea8437 100644 --- a/mteb/abstasks/aggregated_task.py +++ b/mteb/abstasks/aggregated_task.py @@ -6,7 +6,6 @@ import numpy as np from 
datasets import Dataset, DatasetDict -from typing_extensions import Self from mteb.models.models_protocols import MTEBModels from mteb.results.task_result import TaskResult @@ -122,19 +121,6 @@ def combine_task_results(self, task_results: list[TaskResult]) -> TaskResult: task_res.mteb_version = task_results[0].mteb_version return task_res - def filter_eval_splits(self, eval_splits: list[str] | None) -> Self: - """Filter the evaluation splits of the task. - - Args: - eval_splits: List of splits to evaluate on. If None, all splits in metadata - are used. - - Returns: - The task with filtered evaluation splits. - """ - self._eval_splits = eval_splits - return self - def evaluate( self, model: MTEBModels,