diff --git a/.github/workflows/typechecking.yml b/.github/workflows/typechecking.yml new file mode 100644 index 0000000000..c7e38caeb8 --- /dev/null +++ b/.github/workflows/typechecking.yml @@ -0,0 +1,44 @@ +name: Typechecking + +on: + push: + branches: [main] + pull_request: + + +jobs: + typecheck: + runs-on: ubuntu-latest + steps: + - name: Free disk space + run: | + sudo rm -rf \ + "$AGENT_TOOLSDIRECTORY" \ + /opt/ghc \ + /opt/google/chrome \ + /opt/microsoft/msedge \ + /opt/microsoft/powershell \ + /opt/pipx \ + /usr/lib/mono \ + /usr/local/julia* \ + /usr/local/lib/android \ + /usr/local/lib/node_modules \ + /usr/local/share/chromium \ + /usr/local/share/powershell \ + /usr/share/dotnet \ + /usr/share/swift docker system prune -af + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: "3.10" + - name: Dependencies + run: | + make install-for-tests + pip install -e . --group typing + - name: Typecheck + run: | + make typecheck diff --git a/docs/mmteb/validate_points.py b/docs/mmteb/validate_points.py index 21b6fd3877..13bee8c047 100644 --- a/docs/mmteb/validate_points.py +++ b/docs/mmteb/validate_points.py @@ -1,6 +1,5 @@ import logging from pathlib import Path -from typing import Optional from jsonlines import Reader from pydantic import BaseModel, ConfigDict, Field, ValidationError, conint, constr @@ -21,17 +20,17 @@ class JsonObject(BaseModel): model_config = ConfigDict(extra="forbid") GitHub: constr(min_length=1) - new_dataset: Optional[conint(ge=1)] = Field(alias="New dataset", default=None) # noqa - new_task: Optional[conint(ge=2)] = Field(alias="New task", default=None) # noqa - dataset_annotations: Optional[conint(ge=1)] = Field( # noqa + new_dataset: conint(ge=1) | None = Field(alias="New dataset", default=None) + new_task: conint(ge=2) | None = Field(alias="New task", default=None) + dataset_annotations: conint(ge=1) | None = Field( alias="Dataset annotations",
default=None ) - bug_fixes: Optional[conint(ge=1)] = Field(alias="Bug fixes", default=None) # noqa - running_models: Optional[conint(ge=1)] = Field(alias="Running Models", default=None) # noqa - review_pr: Optional[conint(ge=2)] = Field(alias="Review PR", default=None) # noqa - paper_writing: Optional[int] = Field(alias="Paper writing", default=None) # noqa - Ideation: Optional[int] = None # noqa - Coordination: Optional[int] = None # noqa + bug_fixes: conint(ge=1) | None = Field(alias="Bug fixes", default=None) + running_models: conint(ge=1) | None = Field(alias="Running Models", default=None) + review_pr: conint(ge=2) | None = Field(alias="Review PR", default=None) + paper_writing: int | None = Field(alias="Paper writing", default=None) + Ideation: int | None = None + Coordination: int | None = None def check_max_points(obj: JsonObject, commit_n: str): diff --git a/mteb/_create_dataloaders.py b/mteb/_create_dataloaders.py index 0d4a5ebddd..1492209e43 100644 --- a/mteb/_create_dataloaders.py +++ b/mteb/_create_dataloaders.py @@ -1,4 +1,5 @@ import logging +import warnings from collections.abc import Callable from typing import Any, cast @@ -113,11 +114,8 @@ def _create_text_dataloader_for_queries( ) -_warned_about_user_role = False - - def _convert_conv_history_to_query( - row: dict[str, list[str] | Conversation], + row: dict[str, str | list[str] | Conversation], ) -> dict[str, str | Conversation]: """Convert a conversation history to a single query string. @@ -127,21 +125,18 @@ def _convert_conv_history_to_query( Returns: The updated row with the "query" and "text" fields set to the conversation string, and the "conversation" field set to the list of ConversationTurn. 
""" - global _warned_about_user_role - conversation = row["text"] # if it's a list of strings, just join them if isinstance(conversation, list) and isinstance(conversation[0], str): - conversation = cast(list[str], conversation) - conv_str = "; ".join(conversation) + conversation_ = cast(list[str], conversation) + conv_str = "; ".join(conversation_) current_conversation = [ - ConversationTurn(role="user", content=message) for message in conversation + ConversationTurn(role="user", content=message) for message in conversation_ ] - if not _warned_about_user_role: - logger.warning( - "Conversations are a list of strings. Used 'user' role for all turns." - ) - _warned_about_user_role = True + warnings.warn( + "Conversations are a list of strings. Used 'user' role for all turns.", + category=UserWarning, + ) # otherwise, it's a list of dictionaries, which we need to convert to strings elif isinstance(conversation, list) and isinstance(conversation[0], dict): conv = [] @@ -178,7 +173,7 @@ def _convert_conv_history_to_query( row["text"] = conv_str row["conversation"] = current_conversation - return row + return cast(dict[str, str | list[ConversationTurn]], row) def _create_dataloader_for_queries_conversation( diff --git a/mteb/_evaluators/any_sts_evaluator.py b/mteb/_evaluators/any_sts_evaluator.py index 0e58bb2814..10106be9fd 100644 --- a/mteb/_evaluators/any_sts_evaluator.py +++ b/mteb/_evaluators/any_sts_evaluator.py @@ -57,10 +57,7 @@ def __init__( self.input2_prompt_type = input2_prompt_type def __call__( - self, - model: EncoderProtocol, - *, - encode_kwargs: dict[str, Any], + self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any] ) -> STSEvaluatorScores: logger.info("Running semantic similarity - Encoding samples (1/2)") embeddings1 = model.encode( diff --git a/mteb/_evaluators/evaluator.py b/mteb/_evaluators/evaluator.py index 9800fcf819..0bd40b397f 100644 --- a/mteb/_evaluators/evaluator.py +++ b/mteb/_evaluators/evaluator.py @@ -1,4 +1,5 @@ from abc 
import ABC, abstractmethod +from collections.abc import Iterable, Mapping from typing import Any from mteb.abstasks.abstask import _set_seed @@ -18,7 +19,7 @@ def __init__(self, seed: int = 42, **kwargs: Any) -> None: @abstractmethod def __call__( self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any] - ) -> dict[str, float]: + ) -> Mapping[str, float] | Iterable[Any]: """This is called during training to evaluate the model. It returns scores. diff --git a/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py b/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py index 791664a4eb..5b8c4ab6dd 100644 --- a/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +++ b/mteb/_evaluators/image/imagetext_pairclassification_evaluator.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from collections.abc import Sequence from typing import TYPE_CHECKING, Any import torch @@ -61,8 +62,8 @@ class ImageTextPairClassificationEvaluator(Evaluator): def __init__( self, dataset, - images_column_names: str | list[str], - texts_column_names: str | list[str], + images_column_names: str | Sequence[str], + texts_column_names: str | Sequence[str], num_images_per_sample: int, num_texts_per_sample: int, task_metadata: TaskMetadata, @@ -82,10 +83,8 @@ def __init__( self.hf_split = hf_split self.hf_subset = hf_subset - def __call__( - self, - model: EncoderProtocol, - encode_kwargs: dict[str, Any], + def __call__( # type: ignore[override] + self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any] ) -> list[torch.Tensor]: images = [] if isinstance(self.images_column_names, str): diff --git a/mteb/_evaluators/pair_classification_evaluator.py b/mteb/_evaluators/pair_classification_evaluator.py index da346cce59..c54697e376 100644 --- a/mteb/_evaluators/pair_classification_evaluator.py +++ b/mteb/_evaluators/pair_classification_evaluator.py @@ -148,7 +148,9 @@ def _encode_unique_texts( hf_subset: str, **encode_kwargs: Any, ) -> 
np.ndarray: - index_map, all_unique_texts, all_texts_indexes = {}, [], [] + index_map = {} + all_unique_texts: list[str] = [] + all_texts_indexes = [] for text in all_texts: text_hash = hash(text) if text_hash not in index_map: diff --git a/mteb/_evaluators/retrieval_metrics.py b/mteb/_evaluators/retrieval_metrics.py index e998883a13..2e8ac2110b 100644 --- a/mteb/_evaluators/retrieval_metrics.py +++ b/mteb/_evaluators/retrieval_metrics.py @@ -1,5 +1,6 @@ import logging from collections import defaultdict +from collections.abc import Mapping from typing import Any import numpy as np @@ -15,7 +16,7 @@ def mrr( qrels: RelevantDocumentsType, - results: dict[str, dict[str, float]], + results: Mapping[str, Mapping[str, float]], k_values: list[int], ) -> dict[str, list[float]]: mrr_metrics = defaultdict(list) @@ -32,7 +33,7 @@ def mrr( doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0 } for k in k_values: - rr = 0 + rr = 0.0 for rank, hit in enumerate(top_hits[query_id][0:k]): if hit[0] in query_relevant_docs: rr = 1.0 / (rank + 1) @@ -45,8 +46,8 @@ def recall_cap( qrels: RelevantDocumentsType, results: dict[str, dict[str, float]], k_values: list[int], -) -> dict[str, list[float]]: - capped_recall = defaultdict(list) +) -> dict[str, list[float | None]]: + capped_recall: dict[str, list[float | None]] = defaultdict(list) k_max = max(k_values) @@ -188,7 +189,7 @@ def evaluate_p_mrr_change( Returns: A dictionary with the scores, including "p-MRR", "og" and "changed" keys. 
""" - followir_scores = defaultdict(dict) + followir_scores: dict[str, float | dict[str, float]] = defaultdict(dict) qrels_sep = { "og": {k: v for k, v in qrels.items() if k.endswith("-og")}, @@ -227,7 +228,7 @@ def evaluate_p_mrr_change( ndcg, _map, recall, precision, naucs, avg_mrr, naucs_mrr, cv_recall, {} ) for key, value in scores_dict.items(): - followir_scores[name][key] = value + followir_scores[name][key] = value # type: ignore[index] return followir_scores @@ -254,8 +255,8 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]: sim_scores_sorted = sorted(sim_scores)[::-1] cs_max = sim_scores_sorted[0] - cs_std = np.std(sim_scores) - cs_diff1 = None + cs_std = float(np.std(sim_scores)) + cs_diff1 = 0.0 if len(sim_scores) > 1: cs_diff1 = sim_scores_sorted[0] - sim_scores_sorted[1] elif len(sim_scores) == 1: @@ -410,7 +411,7 @@ def make_score_dict( cv_recall: dict[str, float], task_scores: dict[str, float], previous_results_model_meta: dict[str, Any] | None = None, -) -> dict[str, float]: +) -> dict[str, Any]: return { **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, @@ -528,7 +529,7 @@ def max_over_subqueries( def calculate_retrieval_scores( - results: dict[str, dict[str, float]], + results: Mapping[str, Mapping[str, float]], qrels: RelevantDocumentsType, k_values: list[int], skip_first_result: bool = False, @@ -576,7 +577,7 @@ def calculate_retrieval_scores( def evaluate_abstention( - results: dict[str, dict[str, float]], + results: Mapping[str, Mapping[str, float]], metric_scores: dict[str, list[float]], ) -> dict[str, float]: """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997 @@ -591,21 +592,21 @@ def evaluate_abstention( all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())] all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores] 
conf_fcts = list(all_conf_scores[0].keys()) - all_conf_scores = { + all_conf_scores_ = { fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts } - metric_scores = {k: np.array(v) for k, v in metric_scores.items()} + metric_scores_ = {k: np.array(v) for k, v in metric_scores.items()} naucs = {} - for metric_name, scores in metric_scores.items(): - for fct, conf_scores in all_conf_scores.items(): + for metric_name, scores in metric_scores_.items(): + for fct, conf_scores in all_conf_scores_.items(): naucs[f"nAUC_{metric_name}_{fct}"] = nauc(conf_scores, scores) return naucs def calculate_cv_recall( - results: dict[str, dict[str, float]], + results: Mapping[str, Mapping[str, float]], qrels: RelevantDocumentsType, k_values: list[int], skip_first_result: bool = False, diff --git a/mteb/_evaluators/sklearn_evaluator.py b/mteb/_evaluators/sklearn_evaluator.py index 1e043dc770..ae7e420fa6 100644 --- a/mteb/_evaluators/sklearn_evaluator.py +++ b/mteb/_evaluators/sklearn_evaluator.py @@ -1,5 +1,5 @@ import logging -from typing import Any, Protocol +from typing import Any, Protocol, cast import numpy as np from datasets import Dataset @@ -9,7 +9,7 @@ from mteb._create_dataloaders import create_dataloader from mteb.abstasks.task_metadata import TaskMetadata from mteb.models import EncoderProtocol -from mteb.types import BatchedInput +from mteb.types import Array, BatchedInput from .evaluator import Evaluator @@ -17,11 +17,11 @@ class SklearnModelProtocol(Protocol): - def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ... # noqa: N803 - def predict(self, X: np.ndarray) -> np.ndarray: ... # noqa: N803 + def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ... # noqa: N803 + def predict(self, X: Array) -> np.ndarray: ... # noqa: N803 def get_params(self) -> dict[str, Any]: ... - def set_params(self, **kwargs: dict[str, Any]) -> Self: ... - def score(self, X: np.ndarray, y: np.ndarray | list[int]) -> float: ... 
# noqa: N803 + def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ... + def score(self, X: Array, y: np.ndarray | list[int]) -> float: ... # noqa: N803 class SklearnEvaluator(Evaluator): @@ -71,8 +71,8 @@ def __call__( # type: ignore[override] model: EncoderProtocol, *, encode_kwargs: dict[str, Any], - test_cache: np.ndarray | None = None, - ) -> tuple[np.ndarray, np.ndarray]: + test_cache: Array | None = None, + ) -> tuple[np.ndarray, Array]: """Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set. Args: @@ -104,6 +104,7 @@ def __call__( # type: ignore[override] hf_subset=self.hf_subset, **encode_kwargs, ) + test_cache = cast(Array, test_cache) logger.info("Running - Fitting classifier...") y_train = self.train_dataset[self.label_column_name] diff --git a/mteb/_evaluators/text/bitext_mining_evaluator.py b/mteb/_evaluators/text/bitext_mining_evaluator.py index 796d516ea1..eff53e3e3a 100644 --- a/mteb/_evaluators/text/bitext_mining_evaluator.py +++ b/mteb/_evaluators/text/bitext_mining_evaluator.py @@ -1,7 +1,6 @@ import logging from typing import Any -import numpy as np import torch from datasets import Dataset from tqdm.auto import tqdm @@ -10,6 +9,7 @@ from mteb._evaluators.evaluator import Evaluator from mteb.abstasks.task_metadata import TaskMetadata from mteb.models import EncoderProtocol +from mteb.types import Array logger = logging.getLogger(__name__) @@ -69,11 +69,11 @@ def __call__( def _similarity_search( self, - query_embeddings: np.ndarray, - corpus_embeddings: np.ndarray, + query_embeddings: Array, + corpus_embeddings: Array, model: EncoderProtocol, query_chunk_size: int = 100, - corpus_chunk_size: int = 500000, + corpus_chunk_size: int = 500_000, ) -> list[dict[str, float]]: """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings. 
@@ -104,13 +104,15 @@ def _similarity_search( ): query_embeddings = query_embeddings.to(corpus_embeddings.device) - queries_result_list = [[] for _ in range(len(query_embeddings))] + queries_result_list: list[list[dict[str, float]]] = [ + [] for _ in range(len(query_embeddings)) + ] for query_start_idx in range(0, len(query_embeddings), query_chunk_size): # Iterate over chunks of the corpus for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size): # Compute cosine similarities - similarity_scores = model.similarity( # type: ignore + similarity_scores = model.similarity( query_embeddings[ query_start_idx : query_start_idx + query_chunk_size ], @@ -120,15 +122,17 @@ def _similarity_search( ) # Get top-k scores - cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( - torch.tensor(similarity_scores), - 1, - dim=1, - largest=True, - sorted=False, + cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = ( + torch.topk( + torch.tensor(similarity_scores), + 1, + dim=1, + largest=True, + sorted=False, + ) ) - cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() - cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() + cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist() + cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist() for query_itr in range(len(similarity_scores)): for sub_corpus_id, score in zip( @@ -141,11 +145,14 @@ def _similarity_search( {"corpus_id": corpus_id, "score": score} ) + result_queries_list: list[dict[str, float]] = [ + {} for _ in range(len(query_embeddings)) + ] # Sort and strip to top_k results for idx in range(len(queries_result_list)): queries_result_list[idx] = sorted( queries_result_list[idx], key=lambda x: x["score"], reverse=True ) - queries_result_list[idx] = queries_result_list[idx][0] + result_queries_list[idx] = queries_result_list[idx][0] - return queries_result_list + return result_queries_list diff --git a/mteb/_evaluators/text/summarization_evaluator.py 
b/mteb/_evaluators/text/summarization_evaluator.py index 5c6068d6eb..0efc311715 100644 --- a/mteb/_evaluators/text/summarization_evaluator.py +++ b/mteb/_evaluators/text/summarization_evaluator.py @@ -135,10 +135,10 @@ def __call__( ) # Split the embeddings into the original human & machine summaries - embs_human_summaries_all = np.split( + embs_human_summaries_all_split = np.split( embs_human_summaries_all, np.cumsum(human_lens)[:-1] ) - embs_machine_summaries_all = np.split( + embs_machine_summaries_all_split = np.split( embs_machine_summaries_all, np.cumsum(machine_lens)[:-1] ) @@ -148,7 +148,9 @@ def __call__( all_human_scores = [] for i, (embs_human_summaries, embs_machine_summaries) in tqdm( - enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)), + enumerate( + zip(embs_human_summaries_all_split, embs_machine_summaries_all_split) + ), desc="Scoring", total=len(self.human_summaries), ): @@ -164,7 +166,7 @@ def __call__( dot_scores = dot_score(emb_machine_summary, embs_human_summaries) _sim_score = [ - float(model.similarity(emb_machine_summary, emb_human_summary)) # type: ignore + float(model.similarity(emb_machine_summary, emb_human_summary)) for emb_human_summary in embs_human_summaries ] sim_score = torch.tensor(_sim_score) @@ -216,17 +218,19 @@ def _calculate_metrics( strict=True, ): cosine_spearman_scores.append( - spearmanr(human_scores, cosine_pred_scores).statistic + float(spearmanr(human_scores, cosine_pred_scores).statistic) ) cosine_pearson_scores.append( - pearsonr(human_scores, cosine_pred_scores).statistic + float(pearsonr(human_scores, cosine_pred_scores).statistic) ) dot_spearman_scores.append( - spearmanr(human_scores, dot_pred_scores).statistic + float(spearmanr(human_scores, dot_pred_scores).statistic) + ) + dot_pearson_scores.append( + float(pearsonr(human_scores, dot_pred_scores).statistic) ) - dot_pearson_scores.append(pearsonr(human_scores, dot_pred_scores).statistic) - spearman_scores.append(spearmanr(human_scores, 
sim_scores).statistic) - pearson_scores.append(pearsonr(human_scores, sim_scores).statistic) + spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic)) + pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic)) return SummarizationMetrics( pearson=float(np.mean(pearson_scores)), @@ -273,10 +277,10 @@ def _calculate_metrics( pearson_scores.append(pearsonr(human_scores, sim_scores)) return SummarizationMetrics( - pearson=float(np.mean(pearson_scores)), - spearman=float(np.mean(spearman_scores)), - cosine_spearman=float(np.mean(cosine_spearman_scores)), - cosine_pearson=float(np.mean(cosine_pearson_scores)), - dot_pearson=float(np.mean(dot_pearson_scores)), - dot_spearman=float(np.mean(dot_spearman_scores)), + pearson=float(np.mean(pearson_scores)), # type: ignore[arg-type] + spearman=float(np.mean(spearman_scores)), # type: ignore[arg-type] + cosine_spearman=float(np.mean(cosine_spearman_scores)), # type: ignore[arg-type] + cosine_pearson=float(np.mean(cosine_pearson_scores)), # type: ignore[arg-type] + dot_pearson=float(np.mean(dot_pearson_scores)), # type: ignore[arg-type] + dot_spearman=float(np.mean(dot_spearman_scores)), # type: ignore[arg-type] ) diff --git a/mteb/abstasks/_data_filter/filters.py b/mteb/abstasks/_data_filter/filters.py index 23f12cd820..16ed5e8d97 100644 --- a/mteb/abstasks/_data_filter/filters.py +++ b/mteb/abstasks/_data_filter/filters.py @@ -61,7 +61,7 @@ def filter_unclear_label( for text, label in zip(ds[input_column], ds[label_column]): key = text.strip().lower() normalized.setdefault(key, set()).add( - label if isinstance(label, (str, int, float)) else tuple(label) + label if isinstance(label, (str, int, float)) else tuple(label) # type: ignore[arg-type] ) bad_texts = {t for t, labels in normalized.items() if len(labels) > 1} diff --git a/mteb/abstasks/_data_filter/task_pipelines.py b/mteb/abstasks/_data_filter/task_pipelines.py index f12f10e60e..c376edc546 100644 --- 
a/mteb/abstasks/_data_filter/task_pipelines.py +++ b/mteb/abstasks/_data_filter/task_pipelines.py @@ -89,6 +89,9 @@ def process_classification( subset=None, ) + if task.dataset is None: + raise ValueError("Task dataset is None.") + new_ds = {} for subset in task.dataset: new_ds[subset] = clean_dataset( diff --git a/mteb/abstasks/_statistics_calculation.py b/mteb/abstasks/_statistics_calculation.py index 404bee14e6..598d50af71 100644 --- a/mteb/abstasks/_statistics_calculation.py +++ b/mteb/abstasks/_statistics_calculation.py @@ -2,7 +2,8 @@ import hashlib from collections import Counter -from typing import TYPE_CHECKING +from collections.abc import Mapping +from typing import TYPE_CHECKING, cast from mteb.types import TopRankedDocumentsType from mteb.types.statistics import ( @@ -52,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics: seen_hashes: set[str] = set() for img in images: - width, height = img.size # type: ignore + width, height = img.size img_heights.append(height) img_widths.append(width) @@ -82,17 +83,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics LabelStatistics: A dictionary containing the descriptive statistics. 
""" + total_labels: list[int | None] = [] + if not isinstance(labels[0], list): - label_len = [1] * len(labels) - total_label_len = len(labels) - total_labels = labels + # single label classification + single_label = cast(list[int], labels) + label_len = [1] * len(single_label) + total_label_len = len(single_label) + total_labels.extend(single_label) elif isinstance(labels[0], list): # multilabel classification - label_len = [len(l) for l in labels] + multilabel_labels = cast(list[list[int]], labels) + label_len = [len(l) for l in multilabel_labels] total_label_len = sum(label_len) - total_labels = [] - for l in labels: - total_labels.extend(l if len(l) > 0 else [None]) + for l in multilabel_labels: + if l and len(l) > 0: + total_labels.extend(l) + else: + total_labels.append(None) else: raise ValueError( "Labels must be a list of integers or a list of lists of integers." @@ -159,7 +167,7 @@ def calculate_top_ranked_statistics( def calculate_relevant_docs_statistics( - relevant_docs: dict[str, dict[str, float]], + relevant_docs: Mapping[str, Mapping[str, int]], ) -> RelevantDocsStatistics: qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs] unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]}) diff --git a/mteb/abstasks/_stratification.py b/mteb/abstasks/_stratification.py index e647717a4d..b30c591486 100644 --- a/mteb/abstasks/_stratification.py +++ b/mteb/abstasks/_stratification.py @@ -39,6 +39,7 @@ """ import itertools +from typing import Any import numpy as np import scipy.sparse as sp @@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict): if support_size == 0: continue if currently_chosen is None or ( - best_number_of_combinations < number_of_combinations # type: ignore - and best_support_size > support_size # type: ignore + best_number_of_combinations is not None + and best_support_size is not None + and best_number_of_combinations < number_of_combinations + and best_support_size > 
support_size ): currently_chosen = combination best_number_of_combinations, best_support_size = ( @@ -162,7 +165,7 @@ def __init__( self._rng_state = check_random_state(random_state) need_shuffle = shuffle or random_state is not None self.order = order - super().__init__( # type: ignore + super().__init__( n_splits, shuffle=need_shuffle, random_state=self._rng_state if need_shuffle else None, @@ -172,8 +175,7 @@ def __init__( self.percentage_per_fold = sample_distribution_per_fold else: self.percentage_per_fold = [ - 1 / float(self.n_splits) - for _ in range(self.n_splits) # type: ignore + 1 / float(self.n_splits) for _ in range(self.n_splits) ] def _prepare_stratification( @@ -182,9 +184,9 @@ def _prepare_stratification( list[list[int]], dict[int, bool], list[list[int]], - list[list[list[int]]], - dict[tuple[int, ...], list[int]], - list[list[int]], + list[list[Any]], + dict[str, list[Any]], + list[list[Any]], ]: """Prepares variables for performing stratification @@ -206,14 +208,14 @@ def _prepare_stratification( """ self.n_samples, self.n_labels = y.shape self.desired_samples_per_fold = np.array( - [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)] # type: ignore + [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)] ) rows = sp.lil_matrix(y).rows rows_used = dict.fromkeys(range(self.n_samples), False) all_combinations = [] - per_row_combinations = [[] for i in range(self.n_samples)] - samples_with_combination = {} - folds = [[] for _ in range(self.n_splits)] # type: ignore + per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)] + samples_with_combination: dict[str, list[Any]] = {} + folds: list[list[int]] = [[] for _ in range(self.n_splits)] # for every row for sample_index, label_assignment in enumerate(rows): @@ -229,21 +231,19 @@ def _prepare_stratification( all_combinations.append(combination) per_row_combinations[sample_index].append(combination) - all_combinations = [list(x) for x in 
set(all_combinations)] - self.desired_samples_per_combination_per_fold = { combination: np.array( [ len(evidence_for_combination) * self.percentage_per_fold[j] - for j in range(self.n_splits) # type: ignore + for j in range(self.n_splits) ] ) for combination, evidence_for_combination in samples_with_combination.items() } return ( - rows, + rows.tolist(), rows_used, - all_combinations, + [list(x) for x in set(all_combinations)], per_row_combinations, samples_with_combination, folds, @@ -328,7 +328,7 @@ def _iter_test_indices(self, X, y=None, groups=None): # noqa: N803 per_row_combinations, samples_with_combination, folds, - ) = self._prepare_stratification(y) # type: ignore + ) = self._prepare_stratification(y) self._distribute_positive_evidence( rows_used, folds, samples_with_combination, per_row_combinations diff --git a/mteb/abstasks/abstask.py b/mteb/abstasks/abstask.py index 3cf147ed8d..e243794e35 100644 --- a/mteb/abstasks/abstask.py +++ b/mteb/abstasks/abstask.py @@ -2,10 +2,10 @@ import logging import warnings from abc import ABC, abstractmethod -from collections.abc import Sequence +from collections.abc import Mapping, Sequence from copy import copy from pathlib import Path -from typing import Any, cast +from typing import Any, Literal, cast import numpy as np from datasets import ClassLabel, Dataset, DatasetDict, load_dataset @@ -79,8 +79,8 @@ class AbsTask(ABC): """ metadata: TaskMetadata - abstask_prompt: str | None = None - _eval_splits: list[str] | None = None + abstask_prompt: str + _eval_splits: Sequence[str] | None = None dataset: dict[HFSubset, DatasetDict] | None = None data_loaded: bool = False hf_subsets: list[HFSubset] @@ -124,7 +124,7 @@ def evaluate( encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, **kwargs: Any, - ) -> dict[HFSubset, ScoresDict]: + ) -> Mapping[HFSubset, ScoresDict]: """Evaluates an MTEB compatible model on the task. 
Args: @@ -196,12 +196,12 @@ def evaluate( @abstractmethod def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, - encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, + encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, **kwargs: Any, ) -> ScoresDict: @@ -211,7 +211,7 @@ def _evaluate_subset( def _save_task_predictions( self, - predictions: dict[str, Any] | list[Any], + predictions: Mapping[str, Any] | list[Any], model: MTEBModels, prediction_folder: Path, hf_split: str, @@ -227,7 +227,7 @@ def _save_task_predictions( hf_subset: The subset of the dataset (e.g. "en"). """ predictions_path = self._predictions_path(prediction_folder) - existing_results = { + existing_results: dict[str, Any] = { "mteb_model_meta": { "model_name": model.mteb_model_meta.name, "revision": model.mteb_model_meta.revision, @@ -327,7 +327,7 @@ def load_data(self) -> None: ) else: # some of monolingual datasets explicitly adding the split name to the dataset name - self.dataset = load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -363,15 +363,19 @@ def calculate_descriptive_statistics( """ from mteb.abstasks import AbsTaskClassification - if self.metadata.descriptive_stat_path.exists() and not overwrite_results: + existing_stats = self.metadata.descriptive_stats + + if existing_stats is not None and not overwrite_results: logger.info("Loading metadata descriptive statistics from cache.") - return self.metadata.descriptive_stats + return existing_stats if not self.data_loaded: self.load_data() descriptive_stats: dict[str, DescriptiveStatistics] = {} - hf_subset_stat = "hf_subset_descriptive_stats" + hf_subset_stat: Literal["hf_subset_descriptive_stats"] = ( + "hf_subset_descriptive_stats" + ) eval_splits = self.metadata.eval_splits if isinstance(self, AbsTaskClassification): eval_splits.append(self.train_split) @@ 
-382,7 +386,7 @@ def calculate_descriptive_statistics( logger.info(f"Processing metadata for split {split}") if self.metadata.is_multilingual: descriptive_stats[split] = ( - self._calculate_descriptive_statistics_from_split( + self._calculate_descriptive_statistics_from_split( # type: ignore[assignment] split, compute_overall=True ) ) @@ -401,7 +405,7 @@ def calculate_descriptive_statistics( descriptive_stats[split][hf_subset_stat][hf_subset] = split_details else: split_details = self._calculate_descriptive_statistics_from_split(split) - descriptive_stats[split] = split_details + descriptive_stats[split] = split_details # type: ignore[assignment] with self.metadata.descriptive_stat_path.open("w") as f: json.dump(descriptive_stats, f, indent=4) @@ -438,7 +442,7 @@ def languages(self) -> list[str]: return self.metadata.languages - def filter_eval_splits(self, eval_splits: list[str] | None) -> Self: + def filter_eval_splits(self, eval_splits: Sequence[str] | None) -> Self: """Filter the evaluation splits of the task. Args: @@ -452,9 +456,9 @@ def filter_eval_splits(self, eval_splits: list[str] | None) -> Self: def filter_languages( self, - languages: list[str] | None, - script: list[str] | None = None, - hf_subsets: list[HFSubset] | None = None, + languages: Sequence[str] | None, + script: Sequence[str] | None = None, + hf_subsets: Sequence[HFSubset] | None = None, exclusive_language_filter: bool = False, ) -> Self: """Filter the languages of the task. 
@@ -500,12 +504,14 @@ def filter_languages( self.hf_subsets = subsets_to_keep return self - def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None: + def _add_main_score(self, scores: ScoresDict) -> None: scores["main_score"] = scores[self.metadata.main_score] def _upload_dataset_to_hub( self, repo_name: str, fields: list[str] | dict[str, str] ) -> None: + if self.dataset is None: + raise ValueError("Dataset not loaded") if self.metadata.is_multilingual: for config in self.metadata.eval_langs: logger.info(f"Converting {config} of {self.metadata.name}") @@ -575,7 +581,7 @@ def is_aggregate(self) -> bool: return False @property - def eval_splits(self) -> list[str]: + def eval_splits(self) -> Sequence[str]: """Returns the evaluation splits of the task.""" if self._eval_splits: return self._eval_splits diff --git a/mteb/abstasks/aggregate_task_metadata.py b/mteb/abstasks/aggregate_task_metadata.py index 97a38c8268..560fb7c60f 100644 --- a/mteb/abstasks/aggregate_task_metadata.py +++ b/mteb/abstasks/aggregate_task_metadata.py @@ -5,7 +5,6 @@ from typing_extensions import Self from mteb.types import ( - HFSubset, ISOLanguageScript, Languages, Licenses, @@ -60,14 +59,7 @@ class AggregateTaskMetadata(TaskMetadata): reference: str | None = None bibtex_citation: str | None = None - @property - def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]: - """Return a dictionary mapping huggingface subsets to languages.""" - if isinstance(self.eval_langs, dict): - return self.eval_langs - return {"default": self.eval_langs} # type: ignore - - @model_validator(mode="after") # type: ignore + @model_validator(mode="after") def _compute_unfilled_cases(self) -> Self: if not self.eval_langs: self.eval_langs = self._compute_eval_langs() diff --git a/mteb/abstasks/aggregated_task.py b/mteb/abstasks/aggregated_task.py index 7e1657d528..1480ea8437 100644 --- a/mteb/abstasks/aggregated_task.py +++ b/mteb/abstasks/aggregated_task.py @@ -1,11 +1,11 @@ 
import logging import warnings +from collections.abc import Mapping from pathlib import Path from typing import Any import numpy as np from datasets import Dataset, DatasetDict -from typing_extensions import Self from mteb.models.models_protocols import MTEBModels from mteb.results.task_result import TaskResult @@ -33,7 +33,7 @@ def __init__(self, **kwargs: Any): def task_results_to_scores( self, task_results: list[TaskResult] - ) -> dict[str, dict[HFSubset, ScoresDict]]: + ) -> dict[str, Mapping[HFSubset, ScoresDict]]: """The function that aggregated scores. Can be redefined to allow for custom aggregations. Args: @@ -42,7 +42,7 @@ def task_results_to_scores( Returns: A dictionary with the aggregated scores. """ - scores = {} + scores: dict[str, Mapping[HFSubset, ScoresDict]] = {} subsets = ( self.metadata.eval_langs.keys() if isinstance(self.metadata.eval_langs, dict) @@ -121,19 +121,6 @@ def combine_task_results(self, task_results: list[TaskResult]) -> TaskResult: task_res.mteb_version = task_results[0].mteb_version return task_res - def filter_eval_splits(self, eval_splits: list[str] | None) -> Self: - """Filter the evaluation splits of the task. - - Args: - eval_splits: List of splits to evaluate on. If None, all splits in metadata - are used. - - Returns: - The task with filtered evaluation splits. 
- """ - self._eval_splits = eval_splits - return self - def evaluate( self, model: MTEBModels, diff --git a/mteb/abstasks/classification.py b/mteb/abstasks/classification.py index fe26d2623c..a4f33f18fc 100644 --- a/mteb/abstasks/classification.py +++ b/mteb/abstasks/classification.py @@ -143,6 +143,9 @@ def evaluate( if not self.data_loaded: self.load_data() + if self.dataset is None: + raise RuntimeError("Dataset not loaded.") + if "random_state" in self.evaluator_model.get_params(): self.evaluator_model = self.evaluator_model.set_params( random_state=self.seed @@ -175,11 +178,11 @@ def evaluate( ) self._add_main_score(scores[hf_subset]) - return scores + return scores # type: ignore[return-value] def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: DatasetDict, *, encode_kwargs: dict[str, Any], @@ -188,6 +191,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> FullClassificationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + train_split = data_split[self.train_split] eval_split = data_split[hf_split] @@ -237,7 +243,7 @@ def _evaluate_subset( # ap will be none for non binary classification tasks k: ( float(np.mean(values)) - if (values := [s[k] for s in scores if s[k] is not None]) + if (values := [s[k] for s in scores if s[k] is not None]) # type: ignore[literal-required] else np.nan ) for k in scores[0].keys() @@ -245,7 +251,7 @@ def _evaluate_subset( logger.info(f"Running {self.metadata.name} - Finished.") return FullClassificationMetrics( scores_per_experiment=scores, - **avg_scores, + **avg_scores, # type: ignore[typeddict-item] ) def _calculate_scores( diff --git a/mteb/abstasks/clustering.py b/mteb/abstasks/clustering.py index ca603a3be4..1aae041023 100644 --- a/mteb/abstasks/clustering.py +++ b/mteb/abstasks/clustering.py @@ -3,7 +3,7 @@ import random from collections import defaultdict from pathlib import 
Path -from typing import Any +from typing import Any, cast import numpy as np from datasets import Dataset, DatasetDict @@ -11,8 +11,8 @@ from sklearn.metrics.cluster import v_measure_score from mteb._create_dataloaders import create_dataloader -from mteb.models import EncoderProtocol -from mteb.types import HFSubset, ScoresDict +from mteb.models import EncoderProtocol, MTEBModels +from mteb.types import Array, HFSubset, ScoresDict from mteb.types.statistics import ( ImageStatistics, LabelStatistics, @@ -34,7 +34,7 @@ def _evaluate_clustering_bootstrapped( - embeddings: np.ndarray, + embeddings: Array, labels: list[list[str]], n_clusters: int, cluster_size: int, @@ -61,21 +61,21 @@ def _evaluate_clustering_bootstrapped( max_depth = max(map(len, labels)) # Evaluate on each level til max depth for i_level in range(max_depth): - level_labels = [] + level_labels: list[str | int] = [] # Assign -1 to gold label if the level is not there for label in labels: if len(label) > i_level: level_labels.append(label[i_level]) else: level_labels.append(-1) - level_labels = np.array(level_labels) + np_level_labels = np.array(level_labels) valid_idx = np.array( - [level_label != -1 for level_label in level_labels] + [level_label != -1 for level_label in np_level_labels] ) # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed - level_labels = level_labels[valid_idx] + np_level_labels = np_level_labels[valid_idx] level_embeddings = embeddings[valid_idx] clustering_model = MiniBatchKMeans( - n_clusters=np.unique(level_labels).size, + n_clusters=np.unique(np_level_labels).size, batch_size=kmean_batch_size, init="k-means++", n_init=1, # default when kmeans++ is used @@ -87,7 +87,7 @@ def _evaluate_clustering_bootstrapped( cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size) _embeddings = level_embeddings[cluster_indices] - _labels = level_labels[cluster_indices] + _labels = np_level_labels[cluster_indices] cluster_assignment = 
clustering_model.fit_predict(_embeddings) v_measure = v_measure_score(_labels, cluster_assignment) v_measures[f"Level {i_level}"].append(v_measure) @@ -153,7 +153,7 @@ class AbsTaskClustering(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, encode_kwargs: dict[str, Any], @@ -162,6 +162,10 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> ScoresDict: + if not isinstance(model, EncoderProtocol): + raise TypeError( + "Expected encoder model to be an instance of EncoderProtocol." + ) if ( self.max_document_to_embed is not None and self.max_fraction_of_documents_to_embed is not None @@ -182,13 +186,13 @@ def _evaluate_subset( self.max_fraction_of_documents_to_embed * len(data_split) ) else: - max_documents_to_embed = self.max_document_to_embed + max_documents_to_embed = cast(int, self.max_document_to_embed) - max_documents_to_embed = min(len(data_split), max_documents_to_embed) # type: ignore + max_documents_to_embed = min(len(data_split), max_documents_to_embed) example_indices = self.rng_state.sample( range(len(data_split)), k=max_documents_to_embed ) - downsampled_dataset = data_split.select(example_indices) # type: ignore + downsampled_dataset = data_split.select(example_indices) downsampled_dataset = downsampled_dataset.select_columns( [self.input_column_name, self.label_column_name] diff --git a/mteb/abstasks/clustering_legacy.py b/mteb/abstasks/clustering_legacy.py index 07d85b75cd..0860cad049 100644 --- a/mteb/abstasks/clustering_legacy.py +++ b/mteb/abstasks/clustering_legacy.py @@ -8,7 +8,7 @@ from sklearn import metrics from mteb._evaluators import ClusteringEvaluator -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types import ScoresDict from mteb.types.statistics import ( ImageStatistics, @@ -80,7 +80,7 @@ class AbsTaskClusteringLegacy(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: 
MTEBModels, data_split: Dataset, *, encode_kwargs: dict[str, Any], @@ -89,6 +89,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> ScoresDict: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + data_split = data_split.select_columns( [self.input_column_name, self.label_column_name] ) @@ -139,9 +142,6 @@ def _evaluate_subset( } return scores - data_split = data_split.select_columns( - [self.input_column_name, self.label_column_name] - ) evaluator = self.evaluator( data_split, input_column_name=self.input_column_name, @@ -151,10 +151,10 @@ def _evaluate_subset( hf_subset=hf_subset, **kwargs, ) - clusters = evaluator(model, encode_kwargs=encode_kwargs) + evaluate_clusters = evaluator(model, encode_kwargs=encode_kwargs) if prediction_folder: self._save_task_predictions( - clusters, + evaluate_clusters, model, prediction_folder, hf_subset=hf_subset, @@ -163,7 +163,7 @@ def _evaluate_subset( return self._compute_metrics( data_split[self.label_column_name], - clusters, + evaluate_clusters, ) def _compute_metrics( diff --git a/mteb/abstasks/image/image_text_pair_classification.py b/mteb/abstasks/image/image_text_pair_classification.py index 829a70a45e..1c390cca80 100644 --- a/mteb/abstasks/image/image_text_pair_classification.py +++ b/mteb/abstasks/image/image_text_pair_classification.py @@ -12,7 +12,7 @@ calculate_text_statistics, ) from mteb.abstasks.abstask import AbsTask -from mteb.models.models_protocols import EncoderProtocol +from mteb.models.models_protocols import EncoderProtocol, MTEBModels from mteb.types.statistics import ( ImageStatistics, SplitDescriptiveStatistics, @@ -116,7 +116,7 @@ def _calculate_descriptive_statistics_from_split( def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, encode_kwargs: dict[str, Any], @@ -125,6 +125,8 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, 
) -> ImageTextPairClassificationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") select_columns = [] for columns in (self.images_column_names, self.texts_column_names): if isinstance(columns, str): @@ -154,7 +156,7 @@ def _evaluate_subset( hf_subset=hf_subset, **kwargs, ) - scores = evaluator(model, encode_kwargs=encode_kwargs) + scores: list[torch.Tensor] = evaluator(model, encode_kwargs=encode_kwargs) # type: ignore[assignment] if prediction_folder: self._save_task_predictions( [score.tolist() for score in scores], diff --git a/mteb/abstasks/multilabel_classification.py b/mteb/abstasks/multilabel_classification.py index 66e494b697..8731b86b0f 100644 --- a/mteb/abstasks/multilabel_classification.py +++ b/mteb/abstasks/multilabel_classification.py @@ -16,7 +16,8 @@ from mteb._create_dataloaders import create_dataloader from mteb._evaluators.classification_metrics import hamming_score from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels +from mteb.types import Array from .classification import AbsTaskClassification @@ -24,14 +25,14 @@ def _evaluate_classifier( - embeddings_train: np.ndarray, + embeddings_train: Array, y_train: np.ndarray, - embeddings_test: np.ndarray, + embeddings_test: Array, classifier: SklearnModelProtocol, ) -> tuple[np.ndarray, SklearnModelProtocol]: - classifier: SklearnModelProtocol = clone(classifier) - classifier.fit(embeddings_train, y_train) - return classifier.predict(embeddings_test), classifier + classifier_copy: SklearnModelProtocol = clone(classifier) + classifier_copy.fit(embeddings_train, y_train) + return classifier_copy.predict(embeddings_test), classifier_copy class MultilabelClassificationMetrics(TypedDict): @@ -72,14 +73,14 @@ class AbsTaskMultilabelClassification(AbsTaskClassification): evaluator: Classifier to use for evaluation. 
Must implement the SklearnModelProtocol. """ - evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5) + evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5) # type: ignore[assignment] input_column_name: str = "text" label_column_name: str = "label" @override - def _evaluate_subset( + def _evaluate_subset( # type: ignore[override] self, - model: EncoderProtocol, + model: MTEBModels, data_split: DatasetDict, *, encode_kwargs: dict[str, Any], @@ -88,6 +89,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> FullMultilabelClassificationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + if isinstance(data_split, DatasetDict): data_split = data_split.select_columns( [self.input_column_name, self.label_column_name] @@ -185,19 +189,20 @@ def _evaluate_subset( ) avg_scores: dict[str, Any] = { - k: np.mean([s[k] for s in scores]) for k in scores[0].keys() + k: np.mean([s[k] for s in scores]) # type: ignore[literal-required] + for k in scores[0].keys() } logger.info("Running multilabel classification - Finished.") return FullMultilabelClassificationMetrics( scores_per_experiment=scores, - **avg_scores, + **avg_scores, # type: ignore[typeddict-item] ) - def _calculate_scores( + def _calculate_scores( # type: ignore[override] self, y_test: np.ndarray, y_pred: np.ndarray, - x_test_embedding: np.ndarray, + x_test_embedding: Array, current_classifier: SklearnModelProtocol, ) -> MultilabelClassificationMetrics: accuracy = current_classifier.score(x_test_embedding, y_test) @@ -232,10 +237,9 @@ def _undersample_data_indices( """ sample_indices = [] if idxs is None: - idxs = np.arange(len(y)) + idxs = list(np.arange(len(y))) self.np_rng.shuffle(idxs) - idxs = idxs.tolist() - label_counter = defaultdict(int) + label_counter: dict[int, int] = defaultdict(int) for i in idxs: if any((label_counter[label] < samples_per_label) for label in y[i]): 
sample_indices.append(i) diff --git a/mteb/abstasks/pair_classification.py b/mteb/abstasks/pair_classification.py index df134bbccc..96966f89bd 100644 --- a/mteb/abstasks/pair_classification.py +++ b/mteb/abstasks/pair_classification.py @@ -18,7 +18,7 @@ ) from mteb.abstasks.abstask import AbsTask from mteb.models.model_meta import ScoringFunction -from mteb.models.models_protocols import EncoderProtocol +from mteb.models.models_protocols import EncoderProtocol, MTEBModels from mteb.types import PromptType from mteb.types.statistics import ( ImageStatistics, @@ -44,8 +44,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics): """ num_samples: int - number_of_characters: int - unique_pairs: int + number_of_characters: int | None + unique_pairs: int | None text1_statistics: TextStatistics | None image1_statistics: ImageStatistics | None @@ -79,7 +79,7 @@ class AbsTaskPairClassification(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, hf_split: str, @@ -88,6 +88,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs, ) -> dict[str, float]: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + if self.metadata.modalities == ["text"]: # for compatibility with v1 version where datasets were stored in a single row data_split = data_split[0] if len(data_split) == 1 else data_split @@ -120,7 +123,7 @@ def _compute_metrics( self, similarity_scores: PairClassificationDistances, labels: list[int] ) -> dict[str, float]: logger.info("Computing metrics...") - labels = np.asarray(labels) + np_labels = np.asarray(labels) output_scores = {} max_scores = defaultdict(list) for short_name, scores, reverse in [ @@ -142,7 +145,7 @@ def _compute_metrics( ], [ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True], ]: - metrics = self._compute_metrics_values(scores, labels, reverse) + metrics = 
self._compute_metrics_values(scores, np_labels, reverse) # type: ignore[arg-type] for metric_name, metric_value in metrics.items(): output_scores[f"{short_name}_{metric_name}"] = metric_value max_scores[metric_name].append(metric_value) @@ -237,6 +240,12 @@ def _compute_image_hash(inputs: list) -> list[str]: def _push_dataset_to_hub(self, repo_name: str) -> None: # previously pair classification datasets were stored in a single row + if self.dataset is None: + # overall this shouldn't happen as we check for dataset before pushing to hub + # added here for type checking purposes + raise RuntimeError( + "Dataset not loaded. To load dataset run `task.load_data()`." + ) if self.metadata.is_multilingual: for subset in self.dataset: for split in self.dataset[subset]: @@ -290,13 +299,13 @@ def _compute_metrics_values( ) def _find_best_acc_and_threshold( - self, scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool + self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool ) -> tuple[float, float]: rows = list(zip(scores, labels)) rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar) max_acc = 0 - best_threshold = -1 + best_threshold = -1.0 positive_so_far = 0 remaining_negatives = sum(np.array(labels) == 0) @@ -323,7 +332,7 @@ def _find_best_f1_and_threshold( rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar) - best_f1 = best_precision = best_recall = 0 + best_f1 = best_precision = best_recall = 0.0 threshold = 0 nextract = 0 ncorrect = 0 diff --git a/mteb/abstasks/regression.py b/mteb/abstasks/regression.py index 024afcc91e..322a221e10 100644 --- a/mteb/abstasks/regression.py +++ b/mteb/abstasks/regression.py @@ -87,7 +87,7 @@ class AbsTaskRegression(AbsTaskClassification): Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol]. 
""" - evaluator: type[SklearnModelProtocol] = SklearnEvaluator + evaluator: type[SklearnEvaluator] = SklearnEvaluator evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1) train_split: str = "train" @@ -113,7 +113,7 @@ def _undersample_data( )["train"] return train_split_sampled, [] - def _calculate_scores( + def _calculate_scores( # type: ignore[override] self, y_test: np.ndarray | list[int], y_pred: np.ndarray, @@ -183,7 +183,7 @@ def stratified_subsampling( return dataset_dict - def _calculate_descriptive_statistics_from_split( + def _calculate_descriptive_statistics_from_split( # type: ignore[override] self, split: str, hf_subset: str | None = None, compute_overall: bool = False ) -> RegressionDescriptiveStatistics: train_text = [] diff --git a/mteb/abstasks/retrieval.py b/mteb/abstasks/retrieval.py index c44be10811..96dea9978c 100644 --- a/mteb/abstasks/retrieval.py +++ b/mteb/abstasks/retrieval.py @@ -1,7 +1,7 @@ import json import logging from collections import defaultdict -from collections.abc import Callable, Sequence +from collections.abc import Callable, Mapping, Sequence from pathlib import Path from time import time from typing import Any, Literal @@ -286,7 +286,7 @@ def evaluate( encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, **kwargs, - ) -> dict[HFSubset, ScoresDict]: + ) -> Mapping[HFSubset, ScoresDict]: """Evaluate the model on the retrieval task. 
Args: @@ -357,6 +357,8 @@ def _evaluate_subset( **kwargs, ) + search_model: SearchProtocol + if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol): search_model = SearchEncoderWrapper(model) elif isinstance(model, CrossEncoderProtocol): @@ -578,11 +580,12 @@ def _push_section( if isinstance(data[split][subset_item], Dataset): sections[split] = data[split][subset_item] elif converter is not None: + subset_data = data[split][subset_item] + if subset_data is None: + continue + sections[split] = Dataset.from_list( - [ - converter(idx, item) - for idx, item in data[split][subset_item].items() - ] + [converter(idx, item) for idx, item in subset_data.items()] ) else: raise ValueError( @@ -680,7 +683,7 @@ def convert_to_reranking( top_k_sorted = defaultdict(list) for query_id, values in top_ranked.items(): - sorted_keys = sorted(values, key=values.get, reverse=True) + sorted_keys = sorted(values, key=lambda k: values[k], reverse=True) top_k_sorted[query_id] = sorted_keys[: self._top_k] self.dataset[subset][split]["top_ranked"] = top_k_sorted @@ -688,10 +691,10 @@ def convert_to_reranking( def _process_relevant_docs( - collection: dict[str, dict[str, float]], + collection: Mapping[str, Mapping[str, int]], hf_subset: str, split: str, -) -> dict[str, dict[str, float]]: +) -> dict[str, dict[str, int]]: """Collections can contain overlapping ids in different splits. 
Prepend split and subset to avoid this Returns: diff --git a/mteb/abstasks/sts.py b/mteb/abstasks/sts.py index 16432d0b50..9a7150f4b4 100644 --- a/mteb/abstasks/sts.py +++ b/mteb/abstasks/sts.py @@ -7,7 +7,7 @@ from mteb._evaluators import AnySTSEvaluator from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types import PromptType from mteb.types.statistics import ( ImageStatistics, @@ -103,7 +103,7 @@ class AbsTaskSTS(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, encode_kwargs: dict[str, Any], hf_split: str, @@ -111,6 +111,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs: Any, ) -> STSMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + normalized_scores = list(map(self._normalize, data_split["score"])) data_split = data_split.select_columns(list(self.column_names)) @@ -142,7 +145,7 @@ def _calculate_scores( ) -> STSMetrics: def compute_corr(x: list[float], y: list[float]) -> tuple[float, float]: """Return (pearson, spearman) correlations between x and y.""" - return pearsonr(x, y)[0], spearmanr(x, y)[0] + return float(pearsonr(x, y)[0]), float(spearmanr(x, y)[0]) cosine_pearson, cosine_spearman = compute_corr( normalized_scores, scores["cosine_scores"] diff --git a/mteb/abstasks/task_metadata.py b/mteb/abstasks/task_metadata.py index 7afd6e2a04..0ce299d5ed 100644 --- a/mteb/abstasks/task_metadata.py +++ b/mteb/abstasks/task_metadata.py @@ -2,9 +2,10 @@ import logging from collections.abc import Sequence from pathlib import Path -from typing import Any, Literal +from typing import Any, Literal, cast from huggingface_hub import ( + CardData, DatasetCard, DatasetCardData, constants, @@ -150,7 +151,7 @@ "InstructionReranking", ) + MIEB_TASK_TYPE -TaskType = Literal[_TASK_TYPE] +TaskType = 
Literal[_TASK_TYPE] # type: ignore[valid-type] """The type of the task. E.g. includes "Classification", "Retrieval" and "Clustering".""" @@ -192,8 +193,10 @@ """The type of the annotators. Is often important for understanding the quality of a dataset.""" -PromptDict = TypedDict( - "PromptDict", {prompt_type.value: str for prompt_type in PromptType}, total=False +PromptDict = TypedDict( # type: ignore[misc] + "PromptDict", + {prompt_type.value: str for prompt_type in PromptType}, + total=False, ) """A dictionary containing the prompt used for the task. @@ -365,7 +368,7 @@ def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]: """Return a dictionary mapping huggingface subsets to languages.""" if isinstance(self.eval_langs, dict): return self.eval_langs - return {"default": self.eval_langs} # type: ignore + return {"default": cast(list[str], self.eval_langs)} @property def intext_citation(self, include_cite: bool = True) -> str: @@ -413,7 +416,7 @@ def n_samples(self) -> dict[str, int] | None: for subset, subset_value in stats.items(): if subset == "hf_subset_descriptive_stats": continue - n_samples[subset] = subset_value["num_samples"] # type: ignore + n_samples[subset] = subset_value["num_samples"] return n_samples @property @@ -446,7 +449,7 @@ def get_modalities(self, prompt_type: PromptType | None = None) -> list[Modaliti Raises: ValueError: If the prompt type is not recognized. """ - if prompt_type is None: + if prompt_type is None or self.category is None: return self.modalities query_modalities, doc_modalities = self.category.split("2") category_to_modality: dict[str, Modalities] = { @@ -466,7 +469,7 @@ def get_modalities(self, prompt_type: PromptType | None = None) -> list[Modaliti def _create_dataset_card_data( self, - existing_dataset_card_data: DatasetCardData | None = None, + existing_dataset_card_data: CardData | None = None, ) -> tuple[DatasetCardData, dict[str, Any]]: """Create a DatasetCardData object from the task metadata. 
@@ -501,12 +504,13 @@ def _create_dataset_card_data( tags = ["mteb"] + self.modalities - descriptive_stats = self.descriptive_stats - if descriptive_stats is not None: - for split, split_stat in descriptive_stats.items(): + descriptive_stats = "" + if self.descriptive_stats is not None: + descriptive_stats_ = self.descriptive_stats + for split, split_stat in descriptive_stats_.items(): if len(split_stat.get("hf_subset_descriptive_stats", {})) > 10: split_stat.pop("hf_subset_descriptive_stats", {}) - descriptive_stats = json.dumps(descriptive_stats, indent=4) + descriptive_stats = json.dumps(descriptive_stats_, indent=4) dataset_card_data_params = existing_dataset_card_data.to_dict() # override the existing values @@ -694,11 +698,11 @@ def _hf_task_category(self) -> list[str]: def _hf_languages(self) -> list[str]: languages: list[str] = [] - if self.is_multilingual: - for val in list(self.eval_langs.values()): + if self.is_multilingual and isinstance(self.eval_langs, dict): + for val in self.eval_langs.values(): languages.extend(val) else: - languages = self.eval_langs + languages = cast(list[str], self.eval_langs) # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters), # or a special value like "code", "multilingual". 
readme_langs = [] @@ -710,7 +714,7 @@ def _hf_languages(self) -> list[str]: readme_langs.append(lang_name) return sorted(set(readme_langs)) - def _hf_license(self) -> str: + def _hf_license(self) -> str | None: dataset_license = self.license if dataset_license: license_mapping = { diff --git a/mteb/abstasks/text/bitext_mining.py b/mteb/abstasks/text/bitext_mining.py index 961c5caf60..5ca00a62d3 100644 --- a/mteb/abstasks/text/bitext_mining.py +++ b/mteb/abstasks/text/bitext_mining.py @@ -1,7 +1,7 @@ import logging from collections import defaultdict from pathlib import Path -from typing import Any, ClassVar, TypedDict +from typing import Any, ClassVar, TypedDict, cast from datasets import Dataset, DatasetDict from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score @@ -78,6 +78,9 @@ def evaluate( **kwargs: Any, ) -> dict[HFSubset, ScoresDict]: """Added load for "parallel" datasets""" + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + if not self.data_loaded: self.load_data() @@ -87,11 +90,16 @@ def evaluate( if subsets_to_run is not None: hf_subsets = [s for s in hf_subsets if s in subsets_to_run] - scores = {} + encoder_model = cast(EncoderProtocol, model) + + if self.dataset is None: + raise ValueError("Dataset is not loaded.") + + scores: dict[str, BitextMiningMetrics] = {} if self.parallel_subsets: - scores = self._evaluate_subset( - model, - self.dataset[split], # type: ignore + scores = self._evaluate_subset( # type: ignore[assignment] + encoder_model, + self.dataset[split], parallel=True, hf_split=split, hf_subset="parallel", @@ -109,8 +117,8 @@ def evaluate( data_split = self.dataset[split] else: data_split = self.dataset[hf_subset][split] - scores[hf_subset] = self._evaluate_subset( - model, + scores[hf_subset] = self._evaluate_subset( # type: ignore[assignment] + encoder_model, data_split, hf_split=split, hf_subset=hf_subset, @@ -119,32 +127,32 @@ def evaluate( 
**kwargs, ) - return scores + return cast(dict[HFSubset, ScoresDict], scores) def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]: pairs = self._DEFAULT_PAIR if parallel: - pairs = [langpair.split("-") for langpair in self.hf_subsets] + pairs = [langpair.split("-") for langpair in self.hf_subsets] # type: ignore[misc] return pairs - def _evaluate_subset( + def _evaluate_subset( # type: ignore[override] self, model: EncoderProtocol, data_split: Dataset, *, hf_split: str, hf_subset: str, - parallel: bool = False, encode_kwargs: dict[str, Any], prediction_folder: Path | None = None, + parallel: bool = False, **kwargs, - ) -> ScoresDict: + ) -> BitextMiningMetrics | dict[str, BitextMiningMetrics]: pairs = self._get_pairs(parallel) evaluator = BitextMiningEvaluator( data_split, task_metadata=self.metadata, - pair_columns=pairs, # type: ignore + pair_columns=pairs, hf_split=hf_split, hf_subset=hf_subset, **kwargs, @@ -168,16 +176,16 @@ def _evaluate_subset( ) if parallel: - metrics = {} + parallel_metrics = {} for keys, nearest_neighbors in neighbours.items(): - metrics[keys] = self._compute_metrics(nearest_neighbors, gold) + parallel_metrics[keys] = self._compute_metrics(nearest_neighbors, gold) - for v in metrics.values(): + for v in parallel_metrics.values(): self._add_main_score(v) - else: - def_pair_str = "-".join(self._DEFAULT_PAIR[0]) - metrics = self._compute_metrics(neighbours[def_pair_str], gold) - self._add_main_score(metrics) + return parallel_metrics + def_pair_str = "-".join(self._DEFAULT_PAIR[0]) + metrics = self._compute_metrics(neighbours[def_pair_str], gold) + self._add_main_score(metrics) return metrics def _compute_metrics( @@ -250,8 +258,11 @@ def _calculate_descriptive_statistics_from_split( ) def _push_dataset_to_hub(self, repo_name: str) -> None: + if self.dataset is None: + raise ValueError("Dataset is not loaded.") + if self.metadata.is_multilingual: - dataset = defaultdict(dict) + dataset: dict[str, dict[str, list[str]]] = 
defaultdict(dict) for config in self.metadata.eval_langs: logger.info(f"Converting {config} of {self.metadata.name}") @@ -266,10 +277,10 @@ def _push_dataset_to_hub(self, repo_name: str) -> None: for split in self.dataset[config]: dataset[split][lang_1] = self.dataset[config][split][sent_1] dataset[split][lang_2] = self.dataset[config][split][sent_2] - for split in dataset: - dataset[split] = Dataset.from_dict(dataset[split]) - dataset = DatasetDict(dataset) - dataset.push_to_hub(repo_name) + dataset_dict = DatasetDict( + {split: Dataset.from_dict(dataset[split]) for split in dataset} + ) + dataset_dict.push_to_hub(repo_name) else: sentences = {} for split in self.dataset: diff --git a/mteb/abstasks/text/reranking.py b/mteb/abstasks/text/reranking.py index 13ceebdd77..f142b8a63e 100644 --- a/mteb/abstasks/text/reranking.py +++ b/mteb/abstasks/text/reranking.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -OLD_FORMAT_RERANKING_TASKS = [] +OLD_FORMAT_RERANKING_TASKS: list[str] = [] @deprecated( @@ -105,7 +105,9 @@ def transform_old_dataset_format(self, given_dataset: Dataset | None = None): ) given_dataset = copy(given_dataset) - self.dataset = defaultdict(lambda: defaultdict(dict)) + self.dataset: dict[str, dict[str, RetrievalSplitData]] = defaultdict( + lambda: defaultdict(dict) # type: ignore[arg-type] + ) hf_subsets = self.hf_subsets @@ -115,19 +117,19 @@ def transform_old_dataset_format(self, given_dataset: Dataset | None = None): if hf_subset in cur_dataset: cur_dataset = cur_dataset[hf_subset] elif "name" in self.metadata.dataset: - cur_dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + cur_dataset = datasets.load_dataset(**self.metadata.dataset) assert hf_subset == "default", ( f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata." 
) else: cur_dataset = datasets.load_dataset( **self.metadata.dataset, name=hf_subset - ) # type: ignore + ) for split in cur_dataset: corpus = [] queries = [] - relevant_docs = defaultdict(dict) + relevant_docs: dict[str, dict[str, int]] = defaultdict(dict) top_ranked = defaultdict(list) # Create an enumerated dataset to pass indices diff --git a/mteb/abstasks/text/summarization.py b/mteb/abstasks/text/summarization.py index 4f53884824..1879af9e19 100644 --- a/mteb/abstasks/text/summarization.py +++ b/mteb/abstasks/text/summarization.py @@ -12,7 +12,7 @@ calculate_text_statistics, ) from mteb.abstasks.abstask import AbsTask -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types.statistics import ( ScoreStatistics, SplitDescriptiveStatistics, @@ -77,7 +77,7 @@ class AbsTaskSummarization(AbsTask): def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, hf_split: str, @@ -86,8 +86,13 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs, ) -> SummarizationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + normalized_scores = [ - (np.array(x) - self.min_score) / (self.max_score - self.min_score) + ( + (np.array(x) - self.min_score) / (self.max_score - self.min_score) + ).tolist() for x in data_split[self.relevancy_column_name] ] evaluator = self.evaluator( diff --git a/mteb/abstasks/zeroshot_classification.py b/mteb/abstasks/zeroshot_classification.py index 15045309e4..206e6b3ed9 100644 --- a/mteb/abstasks/zeroshot_classification.py +++ b/mteb/abstasks/zeroshot_classification.py @@ -7,7 +7,7 @@ from sklearn import metrics from mteb._evaluators import ZeroShotClassificationEvaluator -from mteb.models import EncoderProtocol +from mteb.models import EncoderProtocol, MTEBModels from mteb.types.statistics import ( ImageStatistics, LabelStatistics, @@ -111,7 +111,7 @@ def 
_calculate_descriptive_statistics_from_split( def _evaluate_subset( self, - model: EncoderProtocol, + model: MTEBModels, data_split: Dataset, *, hf_split: str, @@ -120,6 +120,9 @@ def _evaluate_subset( prediction_folder: Path | None = None, **kwargs, ) -> ZeroShotClassificationMetrics: + if not isinstance(model, EncoderProtocol): + raise TypeError("Expected model to be an instance of EncoderProtocol") + candidate_labels = self.get_candidate_labels() data_split = data_split.select_columns( [self.input_column_name, self.label_column_name] diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index 1b1578da78..41db0641b4 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Iterable, Sequence +from collections.abc import Iterator, Sequence from dataclasses import dataclass, field from typing import TYPE_CHECKING, Literal @@ -47,7 +47,7 @@ class Benchmark: display_name: str | None = None language_view: list[str] | Literal["all"] = field(default_factory=list) - def __iter__(self) -> Iterable[AbsTask]: + def __iter__(self) -> Iterator[AbsTask]: return iter(self.tasks) def __len__(self) -> int: diff --git a/mteb/cache.py b/mteb/cache.py index 05b1142b53..e16f98ab20 100644 --- a/mteb/cache.py +++ b/mteb/cache.py @@ -5,7 +5,7 @@ import subprocess import warnings from collections import defaultdict -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from pathlib import Path from typing import cast @@ -291,8 +291,8 @@ def __repr__(self) -> str: def get_cache_paths( self, - models: Sequence[str] | Sequence[ModelMeta] | None = None, - tasks: Sequence[str] | Sequence[AbsTask] | None = None, + models: Sequence[str] | Iterable[ModelMeta] | None = None, + tasks: Sequence[str] | Iterable[AbsTask] | None = None, require_model_meta: bool = True, include_remote: bool = True, ) -> list[Path]: @@ -425,7 +425,7 @@ def 
_get_model_name_and_revision_from_path( @staticmethod def _filter_paths_by_model_and_revision( paths: list[Path], - models: Sequence[str] | Sequence[ModelMeta] | None = None, + models: Sequence[str] | Iterable[ModelMeta] | None = None, ) -> list[Path]: """Filter a list of paths by model name and optional revision. @@ -435,8 +435,9 @@ def _filter_paths_by_model_and_revision( if not models: return paths - if isinstance(models[0], ModelMeta): - models = cast(list[ModelMeta], models) + first_model = next(iter(models)) + if isinstance(first_model, ModelMeta): + models = cast(Iterable[ModelMeta], models) name_and_revision = { (m.model_name_as_path(), m.revision or "no_revision_available") for m in models @@ -447,13 +448,14 @@ def _filter_paths_by_model_and_revision( if (p.parent.parent.name, p.parent.name) in name_and_revision ] - model_names = {m.replace("/", "__").replace(" ", "_") for m in models} + str_models = cast(Sequence[str], models) + model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models} return [p for p in paths if p.parent.parent.name in model_names] @staticmethod def _filter_paths_by_task( paths: list[Path], - tasks: Sequence[str] | Sequence[AbsTask] | None = None, + tasks: Sequence[str] | Iterable[AbsTask] | None = None, ) -> list[Path]: if tasks is not None: task_names = set() @@ -469,8 +471,8 @@ def _filter_paths_by_task( def load_results( self, - models: Sequence[str] | Sequence[ModelMeta] | None = None, - tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None, + models: Sequence[str] | Iterable[ModelMeta] | None = None, + tasks: Sequence[str] | Iterable[AbsTask] | str | None = None, require_model_meta: bool = True, include_remote: bool = True, validate_and_filter: bool = False, @@ -514,7 +516,7 @@ def load_results( ) models_results = defaultdict(list) - task_names = {} + task_names: dict[str, AbsTask | None] = {} if tasks is not None: for task in tasks: if isinstance(task, AbsTask): @@ -532,9 +534,11 @@ def 
load_results( ) if validate_and_filter: - task = task_names[task_result.task_name] + task_instance = task_names[task_result.task_name] try: - task_result = task_result.validate_and_filter_scores(task=task) + task_result = task_result.validate_and_filter_scores( + task=task_instance + ) except Exception as e: logger.info( f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}" @@ -544,7 +548,7 @@ def load_results( models_results[(model_name, revision)].append(task_result) # create BenchmarkResults object - models_results = [ + models_results_object = [ ModelResult( model_name=model_name, model_revision=revision, @@ -553,9 +557,7 @@ def load_results( for (model_name, revision), task_results in models_results.items() ] - benchmark_results = BenchmarkResults( - model_results=models_results, + return BenchmarkResults( + model_results=models_results_object, benchmark=tasks if isinstance(tasks, Benchmark) else None, ) - - return benchmark_results diff --git a/mteb/cli/_display_tasks.py b/mteb/cli/_display_tasks.py index 4b4fa1268d..cda4f36a00 100644 --- a/mteb/cli/_display_tasks.py +++ b/mteb/cli/_display_tasks.py @@ -1,4 +1,4 @@ -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from mteb.abstasks import AbsTask from mteb.benchmarks import Benchmark @@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None: _display_tasks(benchmark.tasks, name=name) -def _display_tasks(task_list: Sequence[AbsTask], name: str | None = None) -> None: +def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None: from rich.console import Console console = Console() diff --git a/mteb/cli/build_cli.py b/mteb/cli/build_cli.py index d88edbe5e2..c307320a4a 100644 --- a/mteb/cli/build_cli.py +++ b/mteb/cli/build_cli.py @@ -8,12 +8,12 @@ from rich.logging import RichHandler import mteb +from mteb.abstasks.abstask import AbsTask from mteb.cache import ResultCache +from mteb.cli._display_tasks 
import _display_benchmarks, _display_tasks from mteb.cli.generate_model_card import generate_model_card from mteb.evaluate import OverwriteStrategy -from ._display_tasks import _display_benchmarks, _display_tasks - logger = logging.getLogger(__name__) @@ -54,7 +54,7 @@ def run(args: argparse.Namespace) -> None: if args.benchmarks: benchmarks = mteb.get_benchmarks(names=args.benchmarks) - tasks = [t for b in benchmarks for t in b.tasks] + tasks = tuple(t for b in benchmarks for t in b.tasks) else: tasks = mteb.get_tasks( categories=args.categories, @@ -290,9 +290,9 @@ def _create_meta(args: argparse.Namespace) -> None: "Output path already exists, use --overwrite to overwrite." ) - tasks = [] + tasks: list[AbsTask] = [] if tasks_names is not None: - tasks = mteb.get_tasks(tasks_names) + tasks = list(mteb.get_tasks(tasks_names)) if benchmarks is not None: benchmarks = mteb.get_benchmarks(benchmarks) for benchmark in benchmarks: diff --git a/mteb/cli/generate_model_card.py b/mteb/cli/generate_model_card.py index 6fe57e1f1d..bfab99ebf4 100644 --- a/mteb/cli/generate_model_card.py +++ b/mteb/cli/generate_model_card.py @@ -1,5 +1,6 @@ import logging import warnings +from collections.abc import Sequence from pathlib import Path from huggingface_hub import ModelCard, ModelCardData, repo_exists @@ -13,7 +14,7 @@ def generate_model_card( model_name: str, - tasks: list[AbsTask] | None = None, + tasks: Sequence[AbsTask] | None = None, existing_model_card_id_or_path: str | Path | None = None, results_cache: ResultCache = ResultCache(), output_path: Path = Path("model_card.md"), @@ -48,8 +49,8 @@ def generate_model_card( for task_result in models_results.task_results: eval_results.extend(task_result.get_hf_eval_results()) - existing_model_card_data = ( - existing_model_card.data if existing_model_card else ModelCardData() + existing_model_card_data: ModelCardData = ( + existing_model_card.data if existing_model_card else ModelCardData() # type: ignore[assignment] ) if 
existing_model_card_data.eval_results is None: @@ -89,7 +90,8 @@ def generate_model_card( benchmark_results, existing_model_card ) - if push_to_hub: + if push_to_hub and existing_model_card_id_or_path: + existing_model_card_id_or_path = str(existing_model_card_id_or_path) if repo_exists(existing_model_card_id_or_path): existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token) else: diff --git a/mteb/deprecated_evaluator.py b/mteb/deprecated_evaluator.py index a67847c5d2..c42f27a71b 100644 --- a/mteb/deprecated_evaluator.py +++ b/mteb/deprecated_evaluator.py @@ -6,23 +6,23 @@ import sys import traceback import warnings -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from copy import deepcopy from datetime import datetime from itertools import chain from pathlib import Path from time import time -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast import datasets import mteb from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.abstasks.task_metadata import TaskCategory, TaskType from mteb.benchmarks import Benchmark from mteb.models import ( CrossEncoderWrapper, - EncoderProtocol, ModelMeta, MTEBModels, SentenceTransformerEncoderWrapper, @@ -53,7 +53,7 @@ class MTEB: ) def __init__( self, - tasks: Iterable[AbsTask | Benchmark], + tasks: Iterable[AbsTask] | Iterable[Benchmark], *, err_logs_path: str = "error_logs.txt", ) -> None: @@ -64,15 +64,14 @@ def __init__( `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)"). err_logs_path: Path to save error logs. 
""" - from mteb.benchmarks import Benchmark - - self.tasks = list(tasks) - if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark): + if isinstance(next(iter(tasks)), Benchmark): self.benchmarks = tasks - self.tasks = list(chain.from_iterable(self.tasks)) + self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks))) + elif isinstance(next(iter(tasks)), AbsTask): + self.tasks = list(cast(Iterable[AbsTask], tasks)) self.err_logs_path = Path(err_logs_path) - self.last_evaluated_splits = {} + self._last_evaluated_splits: dict[str, list[str]] = {} @property def available_tasks(self) -> list[str]: @@ -85,7 +84,7 @@ def available_task_types(self) -> list[TaskType]: return sorted({x.metadata.type for x in self.tasks}) @property - def available_task_categories(self) -> set[TaskCategory]: + def available_task_categories(self) -> set[TaskCategory | None]: """Set of available task categories.""" return {x.metadata.category for x in self.tasks} @@ -232,13 +231,14 @@ def _merge_results( merged_kg_co2_emissions = None if existing_kg_co2_emissions and new_kg_co2_emissions: merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions + existing_evaluation_time = existing_results.evaluation_time or 0 + new_evaluation_time = new_results.evaluation_time or 0 merged_results = TaskResult( dataset_revision=new_results.dataset_revision, task_name=new_results.task_name, mteb_version=new_results.mteb_version, scores=merged_scores, - evaluation_time=existing_results.evaluation_time - + new_results.evaluation_time, + evaluation_time=existing_evaluation_time + new_evaluation_time, kg_co2_emissions=merged_kg_co2_emissions, ) @@ -307,13 +307,16 @@ def run( elif verbosity == 3: datasets.logging.set_verbosity(logging.DEBUG) - meta = self.create_model_meta(model) - output_path = self._create_output_folder(meta, output_folder) - + mteb_model: MTEBModels if isinstance(model, SentenceTransformer): - model = SentenceTransformerEncoderWrapper(model) + mteb_model = 
SentenceTransformerEncoderWrapper(model) elif isinstance(model, CrossEncoder): - model = CrossEncoderWrapper(model) + mteb_model = CrossEncoderWrapper(model) + else: + mteb_model = cast(MTEBModels, model) + + meta = self.create_model_meta(mteb_model) + output_path = self._create_output_folder(meta, output_folder) # Disable co2_tracker for API models if "API" in meta.framework: @@ -334,7 +337,7 @@ def run( ) # save them in case we re-use the object (e.g. for reranking) # To evaluate missing splits, we keep track of the task name and the corresponding splits. - self.last_evaluated_splits = {} + self._last_evaluated_splits = {} while len(self.tasks) > 0: task = self.tasks[0] @@ -343,9 +346,10 @@ def run( ) if task.is_aggregate: - self_ = MTEB(tasks=task.metadata.tasks) - task_results = self_.run( - model, + aggregated_task = cast(AbsTaskAggregate, task) + self_ = MTEB(tasks=aggregated_task.metadata.tasks) + aggregated_task_results = self_.run( + mteb_model, verbosity=verbosity - 1, output_folder=output_folder, eval_splits=eval_splits, @@ -356,12 +360,15 @@ def run( encode_kwargs=encode_kwargs, **kwargs, ) - new_results = task.combine_task_results(task_results) + new_results = aggregated_task.combine_task_results( + aggregated_task_results + ) evaluation_results.append(new_results) if output_path: - save_path = output_path / f"{task.metadata.name}.json" - new_results.to_disk(save_path) + new_results.to_disk( + output_path / f"{aggregated_task.metadata.name}.json" + ) del self.tasks[0] continue @@ -383,7 +390,7 @@ def run( task_subsets = task.hf_subsets existing_results = None - save_path = None + save_path: Path | None = None final_splits_to_run = task_eval_splits missing_evaluations = self._get_missing_evaluations( existing_results, @@ -433,7 +440,7 @@ def run( logger.info( f"No splits to evaluate for {task.metadata.name}. Skipping evaluation." 
) - self.last_evaluated_splits[task.metadata.name] = [] + self._last_evaluated_splits[task.metadata.name] = [] del self.tasks[0] continue @@ -441,11 +448,11 @@ def run( task.check_if_dataset_is_superseded() task.load_data() - task_results = {} + task_results: dict[str, dict[str, dict[str, Any]]] = {} evaluation_time = 0 kg_co2_emissions: int | None = 0 if co2_tracker else None - self.last_evaluated_splits[task.metadata.name] = [] + self._last_evaluated_splits[task.metadata.name] = [] for split in final_splits_to_run: info = missing_evaluations[split] @@ -466,7 +473,9 @@ def run( if co2_tracker: try: - from codecarbon import EmissionsTracker + from codecarbon import ( # type: ignore[import-untyped] + EmissionsTracker, + ) except ImportError: raise ImportError( "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions." @@ -482,7 +491,7 @@ def run( ) as tracker: results, tick, tock = self._run_eval( task, - model, + mteb_model, split, encode_kwargs=encode_kwargs, subsets_to_run=subsets_to_run, @@ -495,7 +504,7 @@ def run( else: results, tick, tock = self._run_eval( task, - model, + mteb_model, split, subsets_to_run=subsets_to_run, encode_kwargs=encode_kwargs, @@ -511,25 +520,25 @@ def run( if verbosity >= 1: logger.info(f"Scores: {task_results[split]}") - self.last_evaluated_splits[task.metadata.name].append(split) + self._last_evaluated_splits[task.metadata.name].append(split) # Create new TaskResult new_results = TaskResult.from_task_results( task, - task_results, + task_results, # type: ignore[arg-type] evaluation_time=evaluation_time, kg_co2_emissions=kg_co2_emissions, ) # Merge with existing if needed - if output_path and save_path.exists(): + if output_path and save_path and save_path.exists(): existing_results = TaskResult.from_disk(save_path) if existing_results: merged_results = self._merge_results(existing_results, new_results) else: merged_results = new_results - if output_path: + if output_path and 
save_path: merged_results.to_disk(save_path) evaluation_results.append(merged_results) @@ -556,7 +565,7 @@ def run( def create_model_meta(model: MTEBModels) -> ModelMeta: """Create a ModelMeta object for the given model.""" if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None: - meta = model.mteb_model_meta # type: ignore + meta = model.mteb_model_meta else: meta = MTEB._get_model_meta(model) @@ -582,7 +591,11 @@ def _create_output_folder( if output_folder is None: return None - model_revision: str = model_meta.revision # type: ignore + model_revision: str = ( + model_meta.revision + if model_meta.revision is not None + else "no_revision_available" + ) model_path_name = model_meta.model_name_as_path() output_path = Path(output_folder) / model_path_name / model_revision @@ -604,15 +617,15 @@ def _get_last_evaluated_splits(self) -> dict[str, list[str]]: Tasks with empty lists indicate that results already existed and no splits were evaluated. """ return deepcopy( - {task: list(splits) for task, splits in self.last_evaluated_splits.items()} + {task: list(splits) for task, splits in self._last_evaluated_splits.items()} ) @staticmethod def _get_missing_evaluations( existing_results: TaskResult | None, - task_eval_splits: list[str], - task_eval_langs: list[str], - eval_subsets: list[str] | None, + task_eval_splits: Sequence[str], + task_eval_langs: Sequence[str], + eval_subsets: Sequence[str] | None, ) -> dict[str, dict[str, Any]]: """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing.""" missing_evaluations = { @@ -661,7 +674,7 @@ def _get_missing_evaluations( return missing_evaluations @staticmethod - def _get_model_meta(model: EncoderProtocol) -> ModelMeta: + def _get_model_meta(model: MTEBModels) -> ModelMeta: from sentence_transformers import CrossEncoder, SentenceTransformer if isinstance(model, CrossEncoder): diff --git a/mteb/evaluate.py b/mteb/evaluate.py index 1b272fb5d1..17635bd1da 
100644 --- a/mteb/evaluate.py +++ b/mteb/evaluate.py @@ -14,11 +14,10 @@ from mteb.abstasks import AbsTaskRetrieval from mteb.abstasks.abstask import AbsTask from mteb.abstasks.aggregated_task import AbsTaskAggregate +from mteb.benchmarks.benchmark import Benchmark from mteb.cache import ResultCache from mteb.models.model_meta import ModelMeta from mteb.models.models_protocols import ( - CrossEncoderProtocol, - EncoderProtocol, MTEBModels, ) from mteb.models.sentence_transformer_wrapper import ( @@ -58,27 +57,26 @@ def _sanitize_model( ) -> tuple[MTEBModels | ModelMeta, ModelMeta, ModelName, Revision]: from sentence_transformers import CrossEncoder, SentenceTransformer + wrapped_model: MTEBModels | ModelMeta if isinstance(model, SentenceTransformer): - _mdl = SentenceTransformerEncoderWrapper(model) - meta = _mdl.mteb_model_meta - _mdl = cast(EncoderProtocol, _mdl) - model = _mdl + wrapped_model = SentenceTransformerEncoderWrapper(model) + meta = wrapped_model.mteb_model_meta elif isinstance(model, CrossEncoder): - _mdl = CrossEncoderWrapper(model) - _mdl = cast(CrossEncoderProtocol, _mdl) - meta = _mdl.mteb_model_meta - model = _mdl + wrapped_model = CrossEncoderWrapper(model) + meta = wrapped_model.mteb_model_meta elif hasattr(model, "mteb_model_meta"): - meta = model.mteb_model_meta # type: ignore[attr-defined] + meta = getattr(model, "mteb_model_meta") if not isinstance(meta, ModelMeta): - meta = ModelMeta.from_hub(None) + meta = ModelMeta._from_hub(None) + wrapped_model = cast(MTEBModels | ModelMeta, model) else: - meta = ModelMeta.from_hub(None) if not isinstance(model, ModelMeta) else model + meta = ModelMeta._from_hub(None) if not isinstance(model, ModelMeta) else model + wrapped_model = meta model_name = cast(str, meta.name) model_revision = cast(str, meta.revision) - return model, meta, model_name, model_revision + return wrapped_model, meta, model_name, model_revision def _evaluate_task( @@ -124,7 +122,8 @@ def _evaluate_task( 
prediction_folder=prediction_folder, public_only=public_only, ) - result.kg_co2_emissions = tracker.final_emissions + if isinstance(result, TaskResult): + result.kg_co2_emissions = tracker.final_emissions return result task_results = {} @@ -150,7 +149,7 @@ def _evaluate_task( if public_only is False: raise e - evaluation_time = 0 + evaluation_time = 0.0 for split, hf_subsets in splits.items(): tick = time() @@ -197,12 +196,18 @@ def _check_model_modalities( return model_modalities = set(model.modalities) + check_tasks: Iterable[AbsTask] = [] if isinstance(tasks, AbsTask): - tasks = [tasks] + check_tasks = [tasks] + elif isinstance(tasks, Benchmark): + benchmark = cast(Benchmark, tasks) + check_tasks = benchmark.tasks + else: + check_tasks = cast(Iterable[AbsTask], tasks) warnings, errors = [], [] - for task in tasks: + for task in check_tasks: # only retrieval tasks have different modalities for query and document and can be run with partial overlaps if isinstance(task, AbsTaskRetrieval): query_mods = set(task.metadata.get_modalities(PromptType.query)) @@ -335,10 +340,10 @@ def evaluate( # AbsTaskAggregate is a special case where we have to run multiple tasks and combine the results if isinstance(tasks, AbsTaskAggregate): - task = cast(AbsTaskAggregate, tasks) + aggregated_task = cast(AbsTaskAggregate, tasks) results = evaluate( model, - task.metadata.tasks, + aggregated_task.metadata.tasks, co2_tracker=co2_tracker, raise_error=raise_error, encode_kwargs=encode_kwargs, @@ -348,17 +353,18 @@ def evaluate( show_progress_bar=show_progress_bar, public_only=public_only, ) - result = task.combine_task_results(results.task_results) + combined_results = aggregated_task.combine_task_results(results.task_results) return ModelResult( model_name=results.model_name, model_revision=results.model_revision, - task_results=[result], + task_results=[combined_results], ) if isinstance(tasks, AbsTask): task = tasks else: - results = [] + tasks = cast(Iterable[AbsTask], tasks) + 
evaluate_results = [] exceptions = [] tasks_tqdm = tqdm( tasks, @@ -379,23 +385,23 @@ def evaluate( show_progress_bar=False, public_only=public_only, ) - results.extend(_res.task_results) + evaluate_results.extend(_res.task_results) if _res.exceptions: exceptions.extend(_res.exceptions) return ModelResult( model_name=_res.model_name, model_revision=_res.model_revision, - task_results=results, + task_results=evaluate_results, exceptions=exceptions, ) overwrite_strategy = OverwriteStrategy.from_str(overwrite_strategy) - existing_results = None + existing_results: TaskResult | None = None if cache and overwrite_strategy != OverwriteStrategy.ALWAYS: - results = cache.load_task_result(task.metadata.name, meta) - if results: - existing_results = results + cache_results = cache.load_task_result(task.metadata.name, meta) + if cache_results: + existing_results = cache_results if ( existing_results diff --git a/mteb/filter_tasks.py b/mteb/filter_tasks.py index 759e1f03d8..ea0f5cc0f8 100644 --- a/mteb/filter_tasks.py +++ b/mteb/filter_tasks.py @@ -1,7 +1,7 @@ """This script contains functions that are used to get an overview of the MTEB benchmark.""" import logging -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from typing import overload from mteb.abstasks import ( @@ -34,14 +34,14 @@ def _check_is_valid_language(lang: str) -> None: @overload def filter_tasks( - tasks: Sequence[AbsTask], + tasks: Iterable[AbsTask], *, - languages: list[str] | None = None, - script: list[str] | None = None, - domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore - categories: list[TaskCategory] | None = None, - modalities: list[Modalities] | None = None, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + domains: Iterable[TaskDomain] | None = None, + task_types: Iterable[TaskType] | None = None, + categories: Iterable[TaskCategory] | None = None, + modalities: 
Iterable[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_superseded: bool = False, exclude_aggregate: bool = False, @@ -51,14 +51,14 @@ def filter_tasks( @overload def filter_tasks( - tasks: Sequence[type[AbsTask]], + tasks: Iterable[type[AbsTask]], *, - languages: list[str] | None = None, - script: list[str] | None = None, - domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore - categories: list[TaskCategory] | None = None, - modalities: list[Modalities] | None = None, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + domains: Iterable[TaskDomain] | None = None, + task_types: Iterable[TaskType] | None = None, + categories: Iterable[TaskCategory] | None = None, + modalities: Iterable[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_superseded: bool = False, exclude_aggregate: bool = False, @@ -67,14 +67,14 @@ def filter_tasks( def filter_tasks( - tasks: Sequence[AbsTask] | Sequence[type[AbsTask]], + tasks: Iterable[AbsTask] | Iterable[type[AbsTask]], *, - languages: list[str] | None = None, - script: list[str] | None = None, - domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore - categories: list[TaskCategory] | None = None, - modalities: list[Modalities] | None = None, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + domains: Iterable[TaskDomain] | None = None, + task_types: Iterable[TaskType] | None = None, + categories: Iterable[TaskCategory] | None = None, + modalities: Iterable[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_superseded: bool = False, exclude_aggregate: bool = False, @@ -92,7 +92,6 @@ def filter_tasks( task_types: A string specifying the type of task e.g. "Classification" or "Retrieval". If None, all tasks are included. 
categories: A list of task categories these include "t2t" (text to text), "t2i" (text to image). See TaskMetadata for the full list. exclude_superseded: A boolean flag to exclude datasets which are superseded by another. - eval_splits: A list of evaluation splits to include. If None, all splits are included. modalities: A list of modalities to include. If None, all modalities are included. exclusive_modality_filter: If True, only keep tasks where _all_ filter modalities are included in the task's modalities and ALL task modalities are in filter modalities (exact match). @@ -113,12 +112,12 @@ def filter_tasks( """ langs_to_keep = None if languages: - [_check_is_valid_language(lang) for lang in languages] + [_check_is_valid_language(lang) for lang in languages] # type: ignore[func-returns-value] langs_to_keep = set(languages) script_to_keep = None if script: - [_check_is_valid_script(s) for s in script] + [_check_is_valid_script(s) for s in script] # type: ignore[func-returns-value] script_to_keep = set(script) domains_to_keep = None @@ -178,4 +177,4 @@ def _convert_to_set(domain: list[TaskDomain] | None) -> set: _tasks.append(t) - return _tasks + return _tasks # type: ignore[return-value] # type checker cannot infer the overload return type diff --git a/mteb/get_tasks.py b/mteb/get_tasks.py index b0a2d2b105..1c4efcf226 100644 --- a/mteb/get_tasks.py +++ b/mteb/get_tasks.py @@ -4,7 +4,7 @@ import logging import warnings from collections import Counter, defaultdict -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from typing import Any import pandas as pd @@ -23,12 +23,11 @@ def _gather_tasks() -> tuple[type[AbsTask], ...]: import mteb.tasks as tasks - tasks = [ + return tuple( t for t in tasks.__dict__.values() if isinstance(t, type) and issubclass(t, AbsTask) - ] - return tuple(tasks) + ) def _create_name_to_task_mapping( @@ -44,7 +43,7 @@ def _create_name_to_task_mapping( return metadata_names -def _create_similar_tasks(tasks: 
Sequence[type[AbsTask]]) -> dict[str, list[str]]: +def _create_similar_tasks(tasks: Iterable[type[AbsTask]]) -> dict[str, list[str]]: """Create a dictionary of similar tasks. Returns: @@ -195,9 +194,8 @@ def to_latex( string with a LaTeX table. """ if include_citation_in_name and "name" in properties: - properties += ["intext_citation"] - df = self.to_dataframe(properties) - df["name"] = df["name"] + " " + df["intext_citation"] + df = self.to_dataframe(tuple(properties) + ("intext_citation",)) + df["name"] = df["name"] + " " + df["intext_citation"] # type: ignore[operator] df = df.drop(columns=["intext_citation"]) else: df = self.to_dataframe(properties) @@ -222,17 +220,17 @@ def to_latex( def get_tasks( - tasks: list[str] | None = None, + tasks: Sequence[str] | None = None, *, - languages: list[str] | None = None, - script: list[str] | None = None, - domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore - categories: list[TaskCategory] | None = None, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + domains: Sequence[TaskDomain] | None = None, + task_types: Sequence[TaskType] | None = None, + categories: Sequence[TaskCategory] | None = None, exclude_superseded: bool = True, - eval_splits: list[str] | None = None, + eval_splits: Sequence[str] | None = None, exclusive_language_filter: bool = False, - modalities: list[Modalities] | None = None, + modalities: Sequence[Modalities] | None = None, exclusive_modality_filter: bool = False, exclude_aggregate: bool = False, exclude_private: bool = True, @@ -288,7 +286,7 @@ def get_tasks( ] return MTEBTasks(_tasks) - _tasks = filter_tasks( + tasks_: Sequence[type[AbsTask]] = filter_tasks( TASK_LIST, languages=languages, script=script, @@ -301,12 +299,12 @@ def get_tasks( exclude_aggregate=exclude_aggregate, exclude_private=exclude_private, ) - _tasks = [ - cls().filter_languages(languages, script).filter_eval_splits(eval_splits) - for cls in _tasks 
- ] - - return MTEBTasks(_tasks) + return MTEBTasks( + [ + cls().filter_languages(languages, script).filter_eval_splits(eval_splits) + for cls in tasks_ + ] + ) _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"} @@ -314,10 +312,10 @@ def get_tasks( def get_task( task_name: str, - languages: list[str] | None = None, - script: list[str] | None = None, - eval_splits: list[str] | None = None, - hf_subsets: list[str] | None = None, + languages: Sequence[str] | None = None, + script: Sequence[str] | None = None, + eval_splits: Sequence[str] | None = None, + hf_subsets: Sequence[str] | None = None, exclusive_language_filter: bool = False, ) -> AbsTask: """Get a task by name. diff --git a/mteb/languages/language_scripts.py b/mteb/languages/language_scripts.py index b8f05492f0..3cf48b9aa8 100644 --- a/mteb/languages/language_scripts.py +++ b/mteb/languages/language_scripts.py @@ -1,9 +1,9 @@ -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass from typing_extensions import Self -from mteb.languages import check_language_code +from mteb.languages.check_language_code import check_language_code @dataclass @@ -25,7 +25,9 @@ class LanguageScripts: @classmethod def from_languages_and_scripts( - cls, languages: list[str] | None = None, scripts: list[str] | None = None + cls, + languages: Sequence[str] | None = None, + scripts: Sequence[str] | None = None, ) -> Self: """Create a LanguageScripts object from lists of languages and scripts. 
diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 0cf8cae482..5ae4b7b094 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -169,7 +169,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame: df = df.drop(columns="reference") return gr.DataFrame( df, - datatype=["markdown"] + ["str"] * (len(df.columns) - 1), # type: ignore + datatype=["markdown"] + ["str"] * (len(df.columns) - 1), buttons=["copy", "fullscreen"], show_search="filter", ) diff --git a/mteb/load_results.py b/mteb/load_results.py index 4108e0b066..c306423bd5 100644 --- a/mteb/load_results.py +++ b/mteb/load_results.py @@ -1,7 +1,7 @@ import json import logging import sys -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from pathlib import Path from mteb.abstasks.abstask import AbsTask @@ -45,8 +45,8 @@ def _model_name_and_revision( def load_results( results_repo: str = "https://github.com/embeddings-benchmark/results", download_latest: bool = True, - models: Sequence[ModelMeta] | Sequence[str] | None = None, - tasks: Sequence[AbsTask] | Sequence[str] | None = None, + models: Iterable[ModelMeta] | Sequence[str] | None = None, + tasks: Iterable[AbsTask] | Sequence[str] | None = None, validate_and_filter: bool = True, require_model_meta: bool = True, only_main_score: bool = False, @@ -83,21 +83,21 @@ def load_results( if models is not None: models_to_keep = {} - for model_path in models: - if isinstance(model_path, ModelMeta): - models_to_keep[model_path.name] = model_path.revision + for model in models: + if isinstance(model, ModelMeta): + models_to_keep[model.name] = model.revision else: - models_to_keep[model_path] = None + models_to_keep[model] = None else: models_to_keep = None - task_names = {} + task_names: dict[str, AbsTask | None] = {} if tasks is not None: - for task in tasks: - if isinstance(task, AbsTask): - task_names[task.metadata.name] = task + for task_ in tasks: + if isinstance(task_, AbsTask): + 
task_names[task_.metadata.name] = task_ else: - task_names[task] = None + task_names[task_] = None model_results = [] for model_path in model_paths: diff --git a/mteb/models/abs_encoder.py b/mteb/models/abs_encoder.py index d7f0c731dc..ebf281974f 100644 --- a/mteb/models/abs_encoder.py +++ b/mteb/models/abs_encoder.py @@ -44,7 +44,7 @@ class AbsEncoder(ABC): model: Any mteb_model_meta: ModelMeta | None = None model_prompts: dict[str, str] | None = None - instruction_template: str | Callable[[str, PromptType], str] | None = None + instruction_template: str | Callable[[str, PromptType | None], str] | None = None prompts_dict: dict[str, str] | None = None def get_prompt_name( @@ -111,7 +111,7 @@ def get_prompt( if not self.model_prompts: return None prompt_name = self.get_prompt_name(task_metadata, prompt_type) - return self.model_prompts.get(prompt_name) + return self.model_prompts.get(prompt_name) if prompt_name else None @staticmethod @overload diff --git a/mteb/models/cache_wrappers/cache_backend_protocol.py b/mteb/models/cache_wrappers/cache_backend_protocol.py index 581ff5c66d..b194b044d8 100644 --- a/mteb/models/cache_wrappers/cache_backend_protocol.py +++ b/mteb/models/cache_wrappers/cache_backend_protocol.py @@ -5,8 +5,6 @@ import numpy as np -from mteb.types import BatchedInput - @runtime_checkable class CacheBackendProtocol(Protocol): @@ -26,7 +24,7 @@ def __init__(self, directory: Path | None = None, **kwargs: Any) -> None: **kwargs: Additional backend-specific arguments. """ - def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None: + def add(self, item: list[dict[str, Any]], vectors: np.ndarray) -> None: """Add a vector to the cache. Args: @@ -34,7 +32,7 @@ def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None: vectors: Embedding vector of shape (dim,) or (1, dim). 
""" - def get_vector(self, item: BatchedInput) -> np.ndarray | None: + def get_vector(self, item: dict[str, Any]) -> np.ndarray | None: """Retrieve the cached vector for the given item. Args: @@ -53,5 +51,5 @@ def load(self) -> None: def close(self) -> None: """Release resources or flush data.""" - def __contains__(self, item: BatchedInput) -> bool: + def __contains__(self, item: dict[str, Any]) -> bool: """Check whether the cache contains an item.""" diff --git a/mteb/models/cache_wrappers/cache_backends/_hash_utils.py b/mteb/models/cache_wrappers/cache_backends/_hash_utils.py index 1161e48450..f86cfb5702 100644 --- a/mteb/models/cache_wrappers/cache_backends/_hash_utils.py +++ b/mteb/models/cache_wrappers/cache_backends/_hash_utils.py @@ -1,12 +1,13 @@ import hashlib +from collections.abc import Mapping +from typing import Any -from mteb.types import BatchedInput - -def _hash_item(item: BatchedInput) -> str: +def _hash_item(item: Mapping[str, Any]) -> str: item_hash = "" if "text" in item: - item_hash = hashlib.sha256(item["text"].encode()).hexdigest() + item_text: str = item["text"] + item_hash = hashlib.sha256(item_text.encode()).hexdigest() if "image" in item: from PIL import Image diff --git a/mteb/models/cache_wrappers/cache_backends/faiss_cache.py b/mteb/models/cache_wrappers/cache_backends/faiss_cache.py index 07cfda68a4..a5cce688ab 100644 --- a/mteb/models/cache_wrappers/cache_backends/faiss_cache.py +++ b/mteb/models/cache_wrappers/cache_backends/faiss_cache.py @@ -2,6 +2,7 @@ import logging import warnings from pathlib import Path +from typing import Any import numpy as np @@ -37,7 +38,7 @@ def __init__(self, directory: str | Path): logger.info(f"Initialized FAISS VectorCacheMap in {self.directory}") self.load() - def add(self, items: list[BatchedInput], vectors: np.ndarray) -> None: + def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: """Add vector to FAISS index.""" import faiss diff --git 
a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py index 37fcdd3b97..e33b73fa6f 100644 --- a/mteb/models/cache_wrappers/cache_backends/numpy_cache.py +++ b/mteb/models/cache_wrappers/cache_backends/numpy_cache.py @@ -2,11 +2,10 @@ import logging import warnings from pathlib import Path +from typing import Any import numpy as np -from mteb.types import BatchedInput - from ._hash_utils import _hash_item logger = logging.getLogger(__name__) @@ -15,7 +14,7 @@ class NumpyCache: """Generic vector cache for both text and images.""" - def __init__(self, directory: str | Path, initial_vectors: int = 100000): + def __init__(self, directory: str | Path, initial_vectors: int = 100_000): self.directory = Path(directory) self.directory.mkdir(parents=True, exist_ok=True) self.vectors_file = self.directory / "vectors.npy" @@ -28,7 +27,7 @@ def __init__(self, directory: str | Path, initial_vectors: int = 100000): logger.info(f"Initialized VectorCacheMap in directory: {self.directory}") self._initialize_vectors_file() - def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None: + def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None: """Add a vector to the cache.""" try: if self.vector_dim is None: @@ -39,7 +38,12 @@ def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None: self._save_dimension() logger.info(f"Initialized vector dimension to {self.vector_dim}") - for item, vec in zip(item, vectors): + if self.vectors is None: + raise RuntimeError( + "Vectors file not initialized. Call _initialize_vectors_file() first." + ) + + for item, vec in zip(items, vectors): item_hash = _hash_item(item) if item_hash in self.hash_to_index: msg = f"Hash collision or duplicate item for hash {item_hash}. Overwriting existing vector." 
@@ -75,18 +79,26 @@ def _initialize_vectors_file(self) -> None: shape=(self.initial_vectors, self.vector_dim), ) else: - self.vectors = np.memmap(self.vectors_file, dtype="float32", mode="r+") - self.vectors = self.vectors.reshape(-1, self.vector_dim) + self.vectors = np.memmap( + self.vectors_file, + dtype="float32", + mode="r+", + shape=(-1, self.vector_dim), + ) logger.info(f"Vectors file initialized with shape: {self.vectors.shape}") def _double_vectors_file(self) -> None: + if self.vectors is None or self.vector_dim is None: + raise RuntimeError( + "Vectors file not initialized. Call _initialize_vectors_file() first." + ) current_size = len(self.vectors) new_size = current_size * 2 logger.info(f"Doubling vectors file from {current_size} to {new_size} vectors") self.vectors.flush() new_vectors = np.memmap( - self.vectors_file, - dtype="float32", + str(self.vectors_file), + dtype=np.float32, mode="r+", shape=(new_size, self.vector_dim), ) @@ -147,9 +159,11 @@ def load(self) -> None: if self.vector_dim is not None: self.vectors = np.memmap( - self.vectors_file, dtype="float32", mode="r+" + self.vectors_file, + dtype="float32", + mode="r+", + shape=(-1, self.vector_dim), ) - self.vectors = self.vectors.reshape(-1, self.vector_dim) logger.info(f"Loaded vectors file with shape: {self.vectors.shape}") else: msg = "Vector dimension not set. Unable to load vectors file." 
@@ -164,8 +178,11 @@ def load(self) -> None: logger.error(f"Error loading VectorCacheMap: {str(e)}") raise - def get_vector(self, item: BatchedInput) -> np.ndarray | None: + def get_vector(self, item: dict[str, Any]) -> np.ndarray | None: """Retrieve vector from index by hash.""" + if self.vectors is None: + return None + try: item_hash = _hash_item(item) if item_hash not in self.hash_to_index: @@ -177,7 +194,7 @@ def get_vector(self, item: BatchedInput) -> np.ndarray | None: logger.error(f"Error retrieving vector for item: {str(e)}") raise - def __contains__(self, item: BatchedInput) -> bool: + def __contains__(self, item: dict[str, Any]) -> bool: return _hash_item(item) in self.hash_to_index def __del__(self): diff --git a/mteb/models/cache_wrappers/cache_wrapper.py b/mteb/models/cache_wrappers/cache_wrapper.py index b895b3a8b0..4807385074 100644 --- a/mteb/models/cache_wrappers/cache_wrapper.py +++ b/mteb/models/cache_wrappers/cache_wrapper.py @@ -90,9 +90,9 @@ def encode( try: cache = self._get_or_create_cache(task_name) - uncached_items: list[BatchedInput] = [] + uncached_items: list[dict[str, Any]] = [] uncached_indices: list[int] = [] - all_items = inputs.dataset + all_items: Dataset = inputs.dataset cached_vectors: dict[int, np.ndarray] = {} for i, item in enumerate(all_items): diff --git a/mteb/models/get_model_meta.py b/mteb/models/get_model_meta.py index 4c83d29372..23c1b860b2 100644 --- a/mteb/models/get_model_meta.py +++ b/mteb/models/get_model_meta.py @@ -100,7 +100,7 @@ def get_model( meta = meta.model_copy(deep=True) meta.loader_kwargs |= kwargs - model.mteb_model_meta = meta # type: ignore + model.mteb_model_meta = meta # type: ignore[misc] return model diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 8ae7888093..780eea27a6 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -17,7 +17,7 @@ def instruct_wrapper( model_name_or_path: str, mode: str, - instruction_template: str | 
Callable[[str], str] | None = None, + instruction_template: str | Callable[[str, PromptType | None], str] | None = None, **kwargs, ): """Instruct wrapper for models. Uses GritLM to pass instructions to the model. @@ -40,7 +40,9 @@ def __init__( self, model_name_or_path: str, mode: str, - instruction_template: str | Callable[[str, PromptType], str] | None = None, + instruction_template: str + | Callable[[str, PromptType | None], str] + | None = None, **kwargs, ): if ( @@ -82,8 +84,11 @@ def encode( logger.info( f"Using instruction: '{instruction}' for task: '{task_metadata.name}'" ) - embeddings = super().encode( - _inputs, instruction=instruction, *args, **kwargs + embeddings = super().encode( # type: ignore[safe-super] + _inputs, # type: ignore[arg-type] + instruction=instruction, + *args, + **kwargs, ) if isinstance(embeddings, torch.Tensor): # sometimes in kwargs can be return_tensors=True @@ -141,7 +146,7 @@ def __init__( ) self.instruction_template = instruction_template - tokenizer_params = {} + tokenizer_params: dict[str, Any] = {} if add_eos_token: tokenizer_params["add_eos_token"] = add_eos_token if max_seq_length is not None: @@ -193,6 +198,7 @@ def encode( The encoded input in a numpy array or torch tensor of the shape (Number of sentences) x (Embedding dimension). 
""" sentences = [text for batch in inputs for text in batch["text"]] + instruction: str | None instruction = self.get_task_instruction(task_metadata, prompt_type) # to passage prompts won't be applied to passages diff --git a/mteb/models/model_implementations/andersborges.py b/mteb/models/model_implementations/andersborges.py index 176040a60c..be11173a8d 100644 --- a/mteb/models/model_implementations/andersborges.py +++ b/mteb/models/model_implementations/andersborges.py @@ -4,7 +4,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction model2vecdk = ModelMeta( - loader=Model2VecModel, # type: ignore + loader=Model2VecModel, name="andersborges/model2vecdk", model_type=["dense"], languages=["dan-Latn"], @@ -35,7 +35,7 @@ model2vecdk_stem = ModelMeta( - loader=Model2VecModel, # type: ignore + loader=Model2VecModel, name="andersborges/model2vecdk-stem", model_type=["dense"], languages=["dan-Latn"], diff --git a/mteb/models/model_implementations/blip_models.py b/mteb/models/model_implementations/blip_models.py index 9c83129346..83646c432e 100644 --- a/mteb/models/model_implementations/blip_models.py +++ b/mteb/models/model_implementations/blip_models.py @@ -128,7 +128,7 @@ def encode( # in descending order of usage (downloads from huggingface) blip_image_captioning_large = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-image-captioning-large", model_type=["dense"], languages=["eng-Latn"], @@ -156,7 +156,7 @@ def encode( ) blip_image_captioning_base = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-image-captioning-base", model_type=["dense"], languages=["eng-Latn"], @@ -185,7 +185,7 @@ def encode( blip_vqa_base = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-vqa-base", model_type=["dense"], languages=["eng-Latn"], @@ -212,7 +212,7 @@ def encode( ) blip_vqa_capfilt_large = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, 
name="Salesforce/blip-vqa-capfilt-large", model_type=["dense"], languages=["eng-Latn"], @@ -239,7 +239,7 @@ def encode( ) blip_itm_base_coco = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-itm-base-coco", model_type=["dense"], languages=["eng-Latn"], @@ -266,7 +266,7 @@ def encode( ) blip_itm_large_coco = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-itm-large-coco", model_type=["dense"], languages=["eng-Latn"], @@ -294,7 +294,7 @@ def encode( ) blip_itm_base_flickr = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-itm-base-flickr", model_type=["dense"], languages=["eng-Latn"], @@ -322,7 +322,7 @@ def encode( ) blip_itm_large_flickr = ModelMeta( - loader=BLIPModel, # type: ignore + loader=BLIPModel, name="Salesforce/blip-itm-large-flickr", model_type=["dense"], languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/bm25.py b/mteb/models/model_implementations/bm25.py index b143a597ab..0f2ef91d9f 100644 --- a/mteb/models/model_implementations/bm25.py +++ b/mteb/models/model_implementations/bm25.py @@ -113,7 +113,7 @@ def search( def encode(self, texts: list[str]): """Encode input text as term vectors""" - return bm25s.tokenize(texts, stopwords=self.stopwords, stemmer=self.stemmer) # type: ignore + return bm25s.tokenize(texts, stopwords=self.stopwords, stemmer=self.stemmer) return BM25Search(**kwargs) diff --git a/mteb/models/model_implementations/clip_models.py b/mteb/models/model_implementations/clip_models.py index d4a0e14f07..e2cb05ad95 100644 --- a/mteb/models/model_implementations/clip_models.py +++ b/mteb/models/model_implementations/clip_models.py @@ -115,7 +115,7 @@ def encode( clip_vit_large_patch14 = ModelMeta( - loader=CLIPModel, # type: ignore + loader=CLIPModel, name="openai/clip-vit-large-patch14", model_type=["dense"], languages=["eng-Latn"], @@ -139,7 +139,7 @@ def encode( ) clip_vit_base_patch32 = ModelMeta( - 
loader=CLIPModel, # type: ignore + loader=CLIPModel, name="openai/clip-vit-base-patch32", model_type=["dense"], languages=["eng-Latn"], @@ -163,7 +163,7 @@ def encode( ) clip_vit_base_patch16 = ModelMeta( - loader=CLIPModel, # type: ignore + loader=CLIPModel, name="openai/clip-vit-base-patch16", model_type=["dense"], languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/cohere_models.py b/mteb/models/model_implementations/cohere_models.py index 8758086c7a..36415f7e2e 100644 --- a/mteb/models/model_implementations/cohere_models.py +++ b/mteb/models/model_implementations/cohere_models.py @@ -222,7 +222,7 @@ def __init__( ) -> None: requires_package(self, "cohere", model_name, "pip install 'mteb[cohere]'") - import cohere # type: ignore + import cohere self.model_name = model_name.removeprefix("Cohere/Cohere-") self.sep = sep diff --git a/mteb/models/model_implementations/cohere_v.py b/mteb/models/model_implementations/cohere_v.py index ff22c79ee3..bbf06d9b4d 100644 --- a/mteb/models/model_implementations/cohere_v.py +++ b/mteb/models/model_implementations/cohere_v.py @@ -378,7 +378,7 @@ def encode( cohere_mult_3 = ModelMeta( - loader=cohere_v_loader, # type: ignore + loader=cohere_v_loader, loader_kwargs={"model_name": "embed-multilingual-v3.0"}, name="cohere/embed-multilingual-v3.0", model_type=["dense"], @@ -402,7 +402,7 @@ def encode( ) cohere_eng_3 = ModelMeta( - loader=cohere_v_loader, # type: ignore + loader=cohere_v_loader, loader_kwargs={"model_name": "embed-english-v3.0"}, name="cohere/embed-english-v3.0", model_type=["dense"], diff --git a/mteb/models/model_implementations/dino_models.py b/mteb/models/model_implementations/dino_models.py index c9ac81987d..4527d1ae97 100644 --- a/mteb/models/model_implementations/dino_models.py +++ b/mteb/models/model_implementations/dino_models.py @@ -104,7 +104,7 @@ def encode( dinov2_small = ModelMeta( - loader=DINOModel, # type: ignore + loader=DINOModel, name="facebook/dinov2-small", 
model_type=["dense"], languages=["eng-Latn"], @@ -125,7 +125,7 @@ def encode( use_instructions=False, training_datasets=dinov2_training_datasets, citation="""@misc{oquab2023dinov2, - title={DINOv2: Learning Robust Visual Features without Supervision}, + title={DINOv2: Learning Robust Visual Features without Supervision}, author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski}, year={2023}, eprint={2304.07193}, @@ -135,7 +135,7 @@ def encode( ) dinov2_base = ModelMeta( - loader=DINOModel, # type: ignore + loader=DINOModel, name="facebook/dinov2-base", model_type=["dense"], languages=["eng-Latn"], @@ -156,7 +156,7 @@ def encode( use_instructions=False, training_datasets=dinov2_training_datasets, citation="""@misc{oquab2023dinov2, - title={DINOv2: Learning Robust Visual Features without Supervision}, + title={DINOv2: Learning Robust Visual Features without Supervision}, author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski}, year={2023}, eprint={2304.07193}, @@ -166,7 +166,7 @@ def encode( ) dinov2_large = ModelMeta( - loader=DINOModel, # type: ignore + loader=DINOModel, name="facebook/dinov2-large", model_type=["dense"], 
languages=["eng-Latn"], @@ -187,7 +187,7 @@ def encode( use_instructions=False, training_datasets=dinov2_training_datasets, citation="""@misc{oquab2023dinov2, - title={DINOv2: Learning Robust Visual Features without Supervision}, + title={DINOv2: Learning Robust Visual Features without Supervision}, author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski}, year={2023}, eprint={2304.07193}, @@ -197,7 +197,7 @@ def encode( ) dinov2_giant = ModelMeta( - loader=DINOModel, # type: ignore + loader=DINOModel, name="facebook/dinov2-giant", model_type=["dense"], languages=["eng-Latn"], @@ -218,7 +218,7 @@ def encode( use_instructions=False, training_datasets=dinov2_training_datasets, citation="""@misc{oquab2023dinov2, - title={DINOv2: Learning Robust Visual Features without Supervision}, + title={DINOv2: Learning Robust Visual Features without Supervision}, author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski}, year={2023}, eprint={2304.07193}, @@ -253,7 +253,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation 
Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -284,7 +284,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -315,7 +315,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -346,7 +346,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -377,7 +377,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling 
Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -408,7 +408,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -439,7 +439,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -470,7 +470,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -502,7 +502,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation 
Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -533,7 +533,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -564,7 +564,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -595,7 +595,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -626,7 +626,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and 
Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -657,7 +657,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, @@ -688,7 +688,7 @@ def encode( use_instructions=False, training_datasets=webssl_dino_training_datasets, citation="""@article{fan2025scaling, - title={Scaling Language-Free Visual Representation Learning}, + title={Scaling Language-Free Visual Representation Learning}, author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie}, year={2025}, eprint={2504.01017}, diff --git a/mteb/models/model_implementations/emillykkejensen_models.py b/mteb/models/model_implementations/emillykkejensen_models.py index a77f4ed677..13d8b076bb 100644 --- a/mteb/models/model_implementations/emillykkejensen_models.py +++ b/mteb/models/model_implementations/emillykkejensen_models.py @@ -2,7 +2,7 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader embedding_gemma_300m_scandi = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, name="emillykkejensen/EmbeddingGemma-Scandi-300m", model_type=["dense"], languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], @@ -35,7 +35,7 @@ qwen_scandi = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, 
name="emillykkejensen/Qwen3-Embedding-Scandi-0.6B", model_type=["dense"], languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], @@ -59,7 +59,7 @@ mmbert_scandi = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, name="emillykkejensen/mmBERTscandi-base-embedding", model_type=["dense"], languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], diff --git a/mteb/models/model_implementations/jina_clip.py b/mteb/models/model_implementations/jina_clip.py index 469a0c7b29..5459f69eb9 100644 --- a/mteb/models/model_implementations/jina_clip.py +++ b/mteb/models/model_implementations/jina_clip.py @@ -121,7 +121,7 @@ def encode( jina_clip_v1 = ModelMeta( - loader=JinaCLIPModel, # type: ignore + loader=JinaCLIPModel, name="jinaai/jina-clip-v1", model_type=["dense"], languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/jina_models.py b/mteb/models/model_implementations/jina_models.py index f7d06931ec..d414ee2a8c 100644 --- a/mteb/models/model_implementations/jina_models.py +++ b/mteb/models/model_implementations/jina_models.py @@ -795,7 +795,7 @@ def get_programming_task_override( jina_embeddings_v3 = ModelMeta( - loader=JinaWrapper, # type: ignore + loader=JinaWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts={ diff --git a/mteb/models/model_implementations/kennethenevoldsen_models.py b/mteb/models/model_implementations/kennethenevoldsen_models.py index 1b9b8ca4e1..ee38fadc6a 100644 --- a/mteb/models/model_implementations/kennethenevoldsen_models.py +++ b/mteb/models/model_implementations/kennethenevoldsen_models.py @@ -4,7 +4,7 @@ ) dfm_enc_large = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + loader=sentence_transformers_loader, name="KennethEnevoldsen/dfm-sentence-encoder-large", model_type=["dense"], languages=["dan-Latn"], @@ -39,7 +39,7 @@ ) dfm_enc_med = ModelMeta( - loader=sentence_transformers_loader, # type: ignore + 
loader=sentence_transformers_loader, name="KennethEnevoldsen/dfm-sentence-encoder-medium", model_type=["dense"], languages=["dan-Latn"], diff --git a/mteb/models/model_implementations/llm2clip_models.py b/mteb/models/model_implementations/llm2clip_models.py index fe66d5812e..e2c77e878c 100644 --- a/mteb/models/model_implementations/llm2clip_models.py +++ b/mteb/models/model_implementations/llm2clip_models.py @@ -181,7 +181,7 @@ def encode( ) llm2clip_openai_l_14_336 = ModelMeta( - loader=llm2clip_loader, # type: ignore + loader=llm2clip_loader, name="microsoft/LLM2CLIP-Openai-L-14-336", model_type=["dense"], languages=["eng-Latn"], @@ -206,7 +206,7 @@ def encode( # NOTE: https://huggingface.co/microsoft/LLM2CLIP-Openai-L-14-224/discussions/1 llm2clip_openai_l_14_224 = ModelMeta( - loader=llm2clip_loader, # type: ignore + loader=llm2clip_loader, name="microsoft/LLM2CLIP-Openai-L-14-224", model_type=["dense"], languages=["eng-Latn"], @@ -230,7 +230,7 @@ def encode( ) llm2clip_openai_b_16 = ModelMeta( - loader=llm2clip_loader, # type: ignore + loader=llm2clip_loader, name="microsoft/LLM2CLIP-Openai-B-16", model_type=["dense"], languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/moco_models.py b/mteb/models/model_implementations/moco_models.py index 236fca0fd3..e018a50b2a 100644 --- a/mteb/models/model_implementations/moco_models.py +++ b/mteb/models/model_implementations/moco_models.py @@ -117,7 +117,7 @@ def encode( ) mocov3_vit_base = ModelMeta( - loader=mocov3_loader, # type: ignore + loader=mocov3_loader, name="nyu-visionx/moco-v3-vit-b", model_type=["dense"], languages=["eng-Latn"], @@ -141,7 +141,7 @@ def encode( ) mocov3_vit_large = ModelMeta( - loader=mocov3_loader, # type: ignore + loader=mocov3_loader, name="nyu-visionx/moco-v3-vit-l", model_type=["dense"], languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/model2vec_models.py b/mteb/models/model_implementations/model2vec_models.py index 0f45674aac..5d082a96b8 100644 
--- a/mteb/models/model_implementations/model2vec_models.py +++ b/mteb/models/model_implementations/model2vec_models.py @@ -139,7 +139,7 @@ def __init__( **kwargs: Additional arguments to pass to the wrapper. """ requires_package(self, "model2vec", model_name, "pip install 'mteb[model2vec]'") - from model2vec import StaticModel # type: ignore + from model2vec import StaticModel self.model_name = model_name self.model = StaticModel.from_pretrained(self.model_name) diff --git a/mteb/models/model_implementations/nomic_models.py b/mteb/models/model_implementations/nomic_models.py index 595dd48cda..5b8b722ec7 100644 --- a/mteb/models/model_implementations/nomic_models.py +++ b/mteb/models/model_implementations/nomic_models.py @@ -193,7 +193,7 @@ def encode( """ nomic_embed_v1_5 = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -222,7 +222,7 @@ def encode( ) nomic_embed_v1 = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -251,7 +251,7 @@ def encode( ) nomic_embed_v1_ablated = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -279,7 +279,7 @@ def encode( ) nomic_embed_v1_unsupervised = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -334,7 +334,7 @@ def encode( training_datasets=nomic_training_data, public_training_data=None, citation="""@misc{nussbaum2024nomic, - title={Nomic Embed: Training a Reproducible Long Context Text Embedder}, + title={Nomic Embed: Training a Reproducible Long Context Text Embedder}, author={Zach Nussbaum and John X. 
Morris and Brandon Duderstadt and Andriy Mulyar}, year={2024}, eprint={2402.01613}, @@ -446,7 +446,7 @@ def encode( ] nomic_embed_text_v2_moe = ModelMeta( - loader=NomicWrapper, # type: ignore + loader=NomicWrapper, loader_kwargs=dict( trust_remote_code=True, model_prompts=model_prompts, @@ -472,12 +472,12 @@ def encode( training_datasets=None, # did not look into this further superseded_by=None, citation="""@misc{nussbaum2025trainingsparsemixtureexperts, - title={Training Sparse Mixture Of Experts Text Embedding Models}, + title={Training Sparse Mixture Of Experts Text Embedding Models}, author={Zach Nussbaum and Brandon Duderstadt}, year={2025}, eprint={2502.07972}, archivePrefix={arXiv}, primaryClass={cs.CL}, - url={https://arxiv.org/abs/2502.07972}, + url={https://arxiv.org/abs/2502.07972}, }""", ) diff --git a/mteb/models/model_implementations/openclip_models.py b/mteb/models/model_implementations/openclip_models.py index 6c05bb7457..29e4b2f7ef 100644 --- a/mteb/models/model_implementations/openclip_models.py +++ b/mteb/models/model_implementations/openclip_models.py @@ -120,7 +120,7 @@ def encode( CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K", model_type=["dense"], languages=["eng-Latn"], @@ -146,7 +146,7 @@ def encode( ) CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K", model_type=["dense"], languages=["eng-Latn"], @@ -172,7 +172,7 @@ def encode( ) CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K", model_type=["dense"], languages=["eng-Latn"], @@ -198,7 +198,7 @@ def encode( ) CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, 
name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", model_type=["dense"], languages=["eng-Latn"], @@ -224,7 +224,7 @@ def encode( ) CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K", model_type=["dense"], languages=["eng-Latn"], @@ -250,7 +250,7 @@ def encode( ) CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K", model_type=["dense"], languages=["eng-Latn"], @@ -276,7 +276,7 @@ def encode( ) CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta( - loader=openclip_loader, # type: ignore + loader=openclip_loader, name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K", model_type=["dense"], languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/random_baseline.py b/mteb/models/model_implementations/random_baseline.py index 6ae502844e..92dd754dd2 100644 --- a/mteb/models/model_implementations/random_baseline.py +++ b/mteb/models/model_implementations/random_baseline.py @@ -68,7 +68,7 @@ def _image_to_vector(image: Image.Image, size: int) -> np.ndarray: license="mit", max_tokens=np.inf, reference=None, - similarity_fn_name="cosine", # type: ignore + similarity_fn_name="cosine", framework=[], use_instructions=False, public_training_code=None, # No training code, as this is a random baseline @@ -187,7 +187,7 @@ def similarity_pairwise( random_encoder_baseline = ModelMeta( - loader=RandomEncoderBaseline, # type: ignore + loader=RandomEncoderBaseline, name="baseline/random-encoder-baseline", model_type=["dense"], modalities=["text", "image"], @@ -232,7 +232,7 @@ def predict( random_cross_encoder_baseline = ModelMeta( - loader=RandomCrossEncoderBaseline, # type: ignore + loader=RandomCrossEncoderBaseline, name="baseline/random-cross-encoder-baseline", model_type=["cross-encoder"], modalities=["text", "image"], diff --git 
a/mteb/models/model_implementations/rasgaard_models.py b/mteb/models/model_implementations/rasgaard_models.py index 8e9b237d57..72f161d8e8 100644 --- a/mteb/models/model_implementations/rasgaard_models.py +++ b/mteb/models/model_implementations/rasgaard_models.py @@ -4,7 +4,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction potion_base_8m = ModelMeta( - loader=Model2VecModel, # type: ignore + loader=Model2VecModel, name="rasgaard/m2v-dfm-large", model_type=["dense"], languages=["dan-Latn"], diff --git a/mteb/models/model_implementations/repllama_models.py b/mteb/models/model_implementations/repllama_models.py index 179f3c4757..01ef7f53b0 100644 --- a/mteb/models/model_implementations/repllama_models.py +++ b/mteb/models/model_implementations/repllama_models.py @@ -154,7 +154,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: """ repllama_llama2_original = ModelMeta( - loader=RepLLaMAModel, # type: ignore + loader=RepLLaMAModel, loader_kwargs=dict( base_model_name_or_path="meta-llama/Llama-2-7b-hf", device_map="auto", @@ -187,7 +187,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: repllama_llama2_reproduced = ModelMeta( - loader=RepLLaMAModel, # type: ignore + loader=RepLLaMAModel, loader_kwargs=dict( base_model_name_or_path="meta-llama/Llama-2-7b-hf", device_map="auto", diff --git a/mteb/models/model_implementations/rerankers_custom.py b/mteb/models/model_implementations/rerankers_custom.py index cebcfdc2c6..935d32102a 100644 --- a/mteb/models/model_implementations/rerankers_custom.py +++ b/mteb/models/model_implementations/rerankers_custom.py @@ -214,7 +214,7 @@ def predict( monobert_large = ModelMeta( - loader=MonoBERTReranker, # type: ignore + loader=MonoBERTReranker, loader_kwargs=dict( fp_options="float16", ), @@ -239,7 +239,7 @@ def predict( # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28 jina_reranker_multilingual = ModelMeta( - loader=JinaReranker, # type: ignore + 
loader=JinaReranker, loader_kwargs=dict( fp_options="float16", ), @@ -263,7 +263,7 @@ def predict( ) bge_reranker_v2_m3 = ModelMeta( - loader=BGEReranker, # type: ignore + loader=BGEReranker, loader_kwargs=dict( fp_options="float16", ), diff --git a/mteb/models/model_implementations/rerankers_monot5_based.py b/mteb/models/model_implementations/rerankers_monot5_based.py index 00a44b8e8a..7fbe943af0 100644 --- a/mteb/models/model_implementations/rerankers_monot5_based.py +++ b/mteb/models/model_implementations/rerankers_monot5_based.py @@ -343,7 +343,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) monot5_base = ModelMeta( - loader=MonoT5Reranker, # type: ignore + loader=MonoT5Reranker, loader_kwargs=dict( fp_options="float16", ), @@ -442,7 +442,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) flant5_base = ModelMeta( - loader=FLANT5Reranker, # type: ignore + loader=FLANT5Reranker, loader_kwargs=dict( fp_options="float16", ), @@ -902,7 +902,7 @@ def get_prediction_tokens(self, *args, **kwargs): ) mt5_13b_mmarco_100k = ModelMeta( - loader=MonoT5Reranker, # type: ignore + loader=MonoT5Reranker, loader_kwargs=dict( fp_options="float16", ), diff --git a/mteb/models/model_implementations/siglip_models.py b/mteb/models/model_implementations/siglip_models.py index b295ba6bd8..d709049e7e 100644 --- a/mteb/models/model_implementations/siglip_models.py +++ b/mteb/models/model_implementations/siglip_models.py @@ -123,7 +123,7 @@ def encode( ) siglip_so400m_patch14_224 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-so400m-patch14-224", model_type=["dense"], languages=["eng-Latn"], @@ -147,7 +147,7 @@ def encode( ) siglip_so400m_patch14_384 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-so400m-patch14-384", model_type=["dense"], languages=["eng-Latn"], @@ -171,7 +171,7 @@ def encode( ) siglip_so400m_patch16_256_i18n = ModelMeta( - 
loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-so400m-patch16-256-i18n", model_type=["dense"], languages=["eng-Latn"], @@ -195,7 +195,7 @@ def encode( ) siglip_base_patch16_256_multilingual = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-256-multilingual", model_type=["dense"], languages=["eng-Latn"], @@ -219,7 +219,7 @@ def encode( ) siglip_base_patch16_256 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-256", model_type=["dense"], languages=["eng-Latn"], @@ -243,7 +243,7 @@ def encode( ) siglip_base_patch16_512 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-512", model_type=["dense"], languages=["eng-Latn"], @@ -267,7 +267,7 @@ def encode( ) siglip_base_patch16_384 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-384", model_type=["dense"], languages=["eng-Latn"], @@ -291,7 +291,7 @@ def encode( ) siglip_base_patch16_224 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-base-patch16-224", model_type=["dense"], languages=["eng-Latn"], @@ -315,7 +315,7 @@ def encode( ) siglip_large_patch16_256 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-large-patch16-256", model_type=["dense"], languages=["eng-Latn"], @@ -339,7 +339,7 @@ def encode( ) siglip_large_patch16_384 = ModelMeta( - loader=SiglipModelWrapper, # type: ignore + loader=SiglipModelWrapper, name="google/siglip-large-patch16-384", model_type=["dense"], languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/vlm2vec_models.py b/mteb/models/model_implementations/vlm2vec_models.py index 8b7bd0929d..318302578f 100644 --- 
a/mteb/models/model_implementations/vlm2vec_models.py +++ b/mteb/models/model_implementations/vlm2vec_models.py @@ -41,7 +41,7 @@ def __init__( model_name, "pip install flash-attn --no-build-isolation", ): - import flash_attn # noqa + pass requires_package(self, "peft", model_name, "pip install 'mteb[peft]'") from peft import LoraConfig, PeftModel diff --git a/mteb/models/model_implementations/voyage_v.py b/mteb/models/model_implementations/voyage_v.py index 5f7e2f1f4b..b84b3040f2 100644 --- a/mteb/models/model_implementations/voyage_v.py +++ b/mteb/models/model_implementations/voyage_v.py @@ -40,15 +40,15 @@ def _downsample_image( logging.info( f"Downsampling image from {width}x{height} to {new_width}x{new_height}" ) - return image.resize(new_size, Image.LANCZOS) # type: ignore + return image.resize(new_size, Image.LANCZOS) if width > height: if width > 10000: logging.error("Processing extremely wide images.") - return image.resize((10000, height), Image.LANCZOS) # type: ignore + return image.resize((10000, height), Image.LANCZOS) else: if height > 10000: logging.error("Processing extremely high images.") - return image.resize((width, 10000), Image.LANCZOS) # type: ignore + return image.resize((width, 10000), Image.LANCZOS) return image @@ -202,7 +202,7 @@ def encode( voyage_v = ModelMeta( - loader=voyage_v_loader, # type: ignore + loader=voyage_v_loader, name="voyageai/voyage-multimodal-3", model_type=["dense"], languages=[], # Unknown diff --git a/mteb/models/model_meta.py b/mteb/models/model_meta.py index fb322734d5..96ae834cbc 100644 --- a/mteb/models/model_meta.py +++ b/mteb/models/model_meta.py @@ -81,7 +81,7 @@ def _get_loader_name( return loader.__name__ -_SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers" +_SENTENCE_TRANSFORMER_LIB_NAME: FRAMEWORKS = "Sentence Transformers" class ModelMeta(BaseModel): @@ -263,10 +263,8 @@ def load_model(self, **kwargs: Any) -> MTEBModels: _kwargs = self.loader_kwargs.copy() _kwargs.update(kwargs) - model: 
EncoderProtocol = self.loader( - self.name, revision=self.revision, **_kwargs - ) - model.mteb_model_meta = self # type: ignore + model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs) + model.mteb_model_meta = self # type: ignore[misc] return model def model_name_as_path(self) -> str: @@ -318,9 +316,8 @@ def _from_hub( model_config = None logger.warning(f"Can't get configuration for {model_name}. Error: {e}") - if ( - card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME - or _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags + if card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME or ( + card_data.tags and _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags ): frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME) else: @@ -435,7 +432,7 @@ def from_hub( and config_sbert.get("similarity_fn_name") is not None ): meta.similarity_fn_name = ScoringFunction.from_str( - config_sbert.get("similarity_fn_name") + config_sbert["similarity_fn_name"] ) else: meta.similarity_fn_name = ScoringFunction.COSINE @@ -516,7 +513,7 @@ def get_training_datasets(self) -> set[str] | None: warnings.warn(msg) return_dataset = training_datasets.copy() - visited = set() + visited: set[str] = set() for dataset in training_datasets: similar_tasks = _collect_similar_tasks(dataset, visited) @@ -550,6 +547,8 @@ def zero_shot_percentage( @staticmethod def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None: + if not model_name: + return None try: safetensors_metadata = get_safetensors_metadata(model_name) if len(safetensors_metadata.parameter_count) >= 0: @@ -563,7 +562,7 @@ def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | N logger.warning( f"Can't calculate number of parameters for {model_name}. Got error {e}" ) - return None + return None def calculate_num_parameters_from_hub(self) -> int | None: """Calculates the number of parameters in the model. 
@@ -626,7 +625,7 @@ def calculate_memory_usage_mb(self) -> int | None: if "API" in self.framework or self.name is None: return None - return self._calculate_memory_usage_mb(self.model_name, self.n_parameters) + return self._calculate_memory_usage_mb(self.name, self.n_parameters) @staticmethod def fetch_release_date(model_name: str) -> StrDate | None: diff --git a/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py b/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py index 5d1b559a16..6234cfc9d6 100644 --- a/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +++ b/mteb/models/search_encoder_index/search_indexes/faiss_search_index.py @@ -109,7 +109,7 @@ def search( ids = ids.tolist() if issubclass(self.index_type, faiss.IndexFlatL2): - similarities = -np.sqrt(np.maximum(similarities, 0)) + similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist() return similarities, ids @@ -117,8 +117,8 @@ def _reranking( self, embeddings: Array, top_k: int, - top_ranked: TopRankedDocumentsType | None = None, - query_idx_to_id: dict[int, str] | None = None, + top_ranked: TopRankedDocumentsType, + query_idx_to_id: dict[int, str], ) -> tuple[list[list[float]], list[list[int]]]: doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)} scores_all: list[list[float]] = [] @@ -136,9 +136,9 @@ def _reranking( continue candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids] - d = self.index.d + d = self.index.d # type: ignore[union-attr] candidate_embs = np.vstack( - [self.index.reconstruct(idx) for idx in candidate_indices] + [self.index.reconstruct(idx) for idx in candidate_indices] # type: ignore[union-attr] ) sub_reranking_index = self.index_type(d) sub_reranking_index.add(candidate_embs) diff --git a/mteb/models/search_wrappers.py b/mteb/models/search_wrappers.py index d98a99fd79..4008e3a8e6 100644 --- a/mteb/models/search_wrappers.py +++ b/mteb/models/search_wrappers.py @@ -200,7 +200,7 @@ def search( # 
Reset the task corpus dataloader to None to free up memory self.task_corpus = None - results = {qid: {} for qid in query_idx_to_id.values()} + results: RetrievalOutputType = {qid: {} for qid in query_idx_to_id.values()} for qid in result_heaps: for score, corpus_id in result_heaps[qid]: results[qid][corpus_id] = score @@ -218,13 +218,19 @@ def _full_corpus_search( encode_kwargs: dict[str, Any], ) -> dict[str, list[tuple[float, str]]]: logger.info("Encoding Corpus in batches (this might take a while)...") + if self.task_corpus is None: + raise ValueError("Corpus must be indexed before searching.") + itr = range(0, len(self.task_corpus), self.corpus_chunk_size) - result_heaps = {qid: [] for qid in query_idx_to_id.values()} + result_heaps: dict[str, list[tuple[float, str]]] = { + qid: [] for qid in query_idx_to_id.values() + } for batch_num, corpus_start_idx in enumerate(itr): logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...") corpus_end_idx = min( - corpus_start_idx + self.corpus_chunk_size, len(self.task_corpus) + corpus_start_idx + self.corpus_chunk_size, + len(self.task_corpus), ) sub_corpus = self.task_corpus.select( range(corpus_start_idx, corpus_end_idx) @@ -249,7 +255,7 @@ def _full_corpus_search( scores = self.model.similarity(query_embeddings, sub_corpus_embeddings) # get top-k values - cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( + cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = torch.topk( torch.as_tensor(scores), min( top_k + 1, @@ -258,8 +264,8 @@ def _full_corpus_search( dim=1, largest=True, ) - cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() - cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() + cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist() + cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist() sub_corpus_ids = list(sub_corpus_ids) result_heaps = self._sort_full_corpus_results( @@ -319,7 +325,11 @@ def _rerank_documents( Returns: A dictionary mapping 
query IDs to a list of tuples, each containing a relevance score and a document ID. """ - result_heaps = {qid: [] for qid in query_idx_to_id.values()} + if self.task_corpus is None: + raise ValueError("Corpus must be indexed before searching.") + result_heaps: dict[str, list[tuple[float, str]]] = { + qid: [] for qid in query_idx_to_id.values() + } doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)} all_doc_embeddings = self.model.encode( @@ -387,12 +397,12 @@ def _rerank_documents( def _rerank_sort_results( self, - result_heaps: list[tuple[float, str]], + result_heaps: dict[str, list[tuple[float, str]]], query_id: str, ranked_ids: list[str], scores_top_k_idx: torch.Tensor, scores_top_k_values: torch.Tensor, - ) -> list[tuple[float, str]]: + ) -> dict[str, list[tuple[float, str]]]: """Sort the heap into descending order list. Returns: @@ -503,6 +513,8 @@ def search( raise ValueError( "CrossEncoder search requires top_ranked documents for reranking." ) + if self.task_corpus is None: + raise ValueError("Corpus must be indexed before searching.") query_id_to_idx = {row["id"]: i for i, row in enumerate(queries)} doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)} @@ -542,7 +554,7 @@ def search( hf_subset=hf_subset, ) - results = {qid: {} for qid in queries["id"]} + results: RetrievalOutputType = {qid: {} for qid in queries["id"]} for (query_id, corpus_id), score in zip(doc_pairs_ids, predictions): results[query_id][corpus_id] = float(score) diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 51be875a84..25cb069f0d 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -103,8 +103,11 @@ def __init__( logger.warning(msg) warnings.warn(msg) + def similarity(self, embeddings1: Array, embeddings2: Array) -> Array: + """Compute the similarity between two collections of embeddings.""" if hasattr(self.model, 
"similarity") and callable(self.model.similarity): - self.similarity = self.model.similarity + return self.model.similarity(embeddings1, embeddings2) + return super().similarity(embeddings1, embeddings2) def encode( self, @@ -150,7 +153,7 @@ def encode( prompt_name = None if self.model_prompts is not None: prompt_name = self.get_prompt_name(task_metadata, prompt_type) - prompt = self.model_prompts.get(prompt_name, None) + prompt = self.model_prompts.get(prompt_name, None) # type: ignore[arg-type] if prompt_name: prompt_log = f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}" else: @@ -221,7 +224,7 @@ def encode( prompt_name = None if self.model_prompts is not None: prompt_name = self.get_prompt_name(task_metadata, prompt_type) - prompt = self.model_prompts.get(prompt_name, None) + prompt = self.model_prompts.get(prompt_name, None) # type: ignore[arg-type] if prompt_name: logger.info( f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}" @@ -234,7 +237,9 @@ def encode( all_embeddings = [] for batch in inputs: batch_column = next(iter(batch.keys())) - batched_input = [dict() for _ in range(len(batch[batch_column]))] + batched_input: list[dict[str, Any]] = [ + dict() for _ in range(len(batch[batch_column])) + ] # transform from {"text": [text1, text2], "image": [image1, image2]} to # [{"text": text1, "image": image1}, {"text": text2, "image": image2}] diff --git a/mteb/py.typed b/mteb/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/results/benchmark_results.py b/mteb/results/benchmark_results.py index d1b9d30c89..91cbed789c 100644 --- a/mteb/results/benchmark_results.py +++ b/mteb/results/benchmark_results.py @@ -1,10 +1,12 @@ +from __future__ import annotations + import functools import json import logging import warnings -from collections.abc import Callable, Iterable, Iterator, Sequence +from collections.abc import Callable, Iterable, Iterator from pathlib import Path -from 
typing import Any, Literal +from typing import Any, Literal, cast import pandas as pd from packaging.version import InvalidVersion, Version @@ -33,11 +35,12 @@ logger = logging.getLogger(__name__) -# Global cache for model metas and version parsing @functools.lru_cache def _get_cached_model_metas() -> dict[str, str | None]: """Cache model metas to avoid repeated calls.""" - return {meta.name: meta.revision for meta in get_model_metas()} + return { + meta.name: meta.revision for meta in get_model_metas() if meta.name is not None + } @functools.lru_cache(maxsize=10000) @@ -77,10 +80,10 @@ def _filter_tasks( task_names: list[str] | None = None, languages: list[str] | None = None, domains: list[TaskDomain] | None = None, - task_types: list[TaskType] | None = None, # type: ignore + task_types: list[TaskType] | None = None, modalities: list[Modalities] | None = None, is_public: bool | None = None, - ) -> Self: + ) -> BenchmarkResults: # TODO: Same as filter_models model_results = [ res._filter_tasks( @@ -97,7 +100,7 @@ def _filter_tasks( model_results=[res for res in model_results if res.task_results] ) - def select_tasks(self, tasks: Sequence[AbsTask]) -> Self: + def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults: """Select tasks from the benchmark results. Args: @@ -115,7 +118,7 @@ def select_models( self, names: list[str] | list[ModelMeta], revisions: list[str | None] | None = None, - ) -> Self: + ) -> BenchmarkResults: """Get models by name and revision. Args: @@ -128,7 +131,7 @@ def select_models( models_res = [] _revisions = revisions if revisions is not None else [None] * len(names) - name_rev = {} + name_rev: dict[str, str | None] = {} if len(names) != len(_revisions): raise ValueError( @@ -137,9 +140,12 @@ def select_models( for name, revision in zip(names, _revisions): if isinstance(name, ModelMeta): + if name.name is None: + raise ValueError("name in ModelMeta is None. 
It must be a string.") name_rev[name.name] = name.revision else: - name_rev[name] = revision + name_ = cast(str, name) + name_rev[name_] = revision for model_res in self.model_results: model_name = model_res.model_name @@ -159,7 +165,7 @@ def _filter_models( n_parameters_range: tuple[int | None, int | None] = (None, None), use_instructions: bool | None = None, zero_shot_on: list[AbsTask] | None = None, - ) -> Self: + ) -> BenchmarkResults: # mostly a utility function for the leaderboard app. # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter. # interface would then be the same as the get_models function @@ -182,7 +188,7 @@ def _filter_models( return type(self).model_construct(model_results=new_model_results) - def join_revisions(self) -> Self: + def join_revisions(self) -> BenchmarkResults: """Join revisions of the same model. In case of conflicts, the following rules are applied: @@ -212,10 +218,10 @@ def join_revisions(self) -> Self: # Use cached model metas model_to_main_revision = _get_cached_model_metas() - task_df["main_revision"] = task_df["model"].map(model_to_main_revision) # type: ignore + task_df["main_revision"] = task_df["model"].map(model_to_main_revision) # Use cached version parsing - task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached) # type: ignore + task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached) # Filter out rows without scores first task_df = task_df[task_df["has_scores"]] @@ -259,8 +265,8 @@ def join_revisions(self) -> Self: # so grouping by original revision ensures consistent ModelResult creation for (model, model_revision), group in task_df.groupby(["model", "revision"]): model_result = ModelResult.model_construct( - model_name=model, - model_revision=model_revision, + model_name=model, # type: ignore[arg-type] + model_revision=model_revision, # type: ignore[arg-type] task_results=list(group["task_result"]), ) 
model_results.append(model_result) @@ -291,7 +297,7 @@ def _get_scores( { "model": model_res.model_name, "revision": model_res.model_revision, - **model_scores, # type: ignore + **model_scores, } ) except Exception as e: @@ -404,7 +410,7 @@ def get_benchmark_result(self) -> pd.DataFrame: return self.benchmark._create_summary_table(self) - def __iter__(self) -> Iterator[ModelResult]: + def __iter__(self) -> Iterator[ModelResult]: # type: ignore[override] return iter(self.model_results) def __getitem__(self, index: int) -> ModelResult: @@ -426,7 +432,7 @@ def to_disk(self, path: Path | str) -> None: out_file.write(self.model_dump_json(indent=2)) @classmethod - def from_validated(cls, **data) -> Self: + def from_validated(cls, **data) -> BenchmarkResults: """Create BenchmarkResults from validated data. Args: diff --git a/mteb/results/model_result.py b/mteb/results/model_result.py index 1a378908c2..a668b33e9e 100644 --- a/mteb/results/model_result.py +++ b/mteb/results/model_result.py @@ -1,12 +1,14 @@ +from __future__ import annotations + import logging import warnings -from collections.abc import Callable, Iterable, Sequence -from typing import Any, Literal +from collections.abc import Callable, Iterable +from typing import Any, Literal, cast import numpy as np import pandas as pd from pydantic import BaseModel, ConfigDict, Field -from typing_extensions import Self +from typing_extensions import overload from mteb.abstasks.abstask import AbsTask from mteb.abstasks.task_metadata import ( @@ -58,7 +60,7 @@ def _aggregate_and_pivot( index=index_columns, columns=columns, values="score", - aggfunc=aggregation_fn, + aggfunc=aggregation_fn, # type: ignore[arg-type] ).reset_index() elif format == "long": return ( @@ -81,7 +83,7 @@ class ModelResult(BaseModel): model_revision: str | None task_results: list[TaskResult] default_modalities: list[Modalities] = Field( - default_factory=lambda: ["text"], alias="modalities" + default_factory=lambda: [cast(Modalities, "text")], 
alias="modalities" ) model_config = ( ConfigDict( # to free up the name model_* which is otherwise protected @@ -95,16 +97,17 @@ def __repr__(self) -> str: return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))" @classmethod - def from_validated(cls, **data: dict[str, Any]) -> Self: + def from_validated(cls, **data: dict[str, Any]) -> ModelResult: """Create a ModelResult from validated data. Args: data: The validated data. """ - data["task_results"] = [ - TaskResult.from_validated(**res) for res in data["task_results"] + data["task_results"] = [ # type: ignore[assignment] + TaskResult.from_validated(**res) # type: ignore[arg-type] + for res in data["task_results"] ] - return cls.model_construct(**data) + return cls.model_construct(**data) # type: ignore[arg-type] def _filter_tasks( self, @@ -114,7 +117,7 @@ def _filter_tasks( task_types: list[TaskType] | None = None, modalities: list[Modalities] | None = None, is_public: bool | None = None, - ) -> Self: + ) -> ModelResult: new_task_results = [] for task_result in self.task_results: if (task_names is not None) and (task_result.task_name not in task_names): @@ -142,7 +145,7 @@ def _filter_tasks( task_results=new_task_results, ) - def select_tasks(self, tasks: Sequence[AbsTask]) -> Self: + def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult: """Select tasks from the ModelResult based on a list of AbsTask objects. Args: @@ -160,6 +163,28 @@ def select_tasks(self, tasks: Sequence[AbsTask]) -> Self: task_results=new_task_results, ) + @overload + def _get_scores( + self, + splits: list[SplitName] | None = None, + languages: list[ISOLanguage | ISOLanguageScript] | None = None, + scripts: list[ISOLanguageScript] | None = None, + getter: Callable[[ScoresDict], Score] | None = None, + aggregation: Callable[[list[Score]], Any] | None = None, + format: Literal["wide"] = "wide", + ) -> dict: ... 
+ + @overload + def _get_scores( + self, + splits: list[SplitName] | None = None, + languages: list[ISOLanguage | ISOLanguageScript] | None = None, + scripts: list[ISOLanguageScript] | None = None, + getter: Callable[[ScoresDict], Score] | None = None, + aggregation: Callable[[list[Score]], Any] | None = None, + format: Literal["long"] = "long", + ) -> list: ... + def _get_scores( self, splits: list[SplitName] | None = None, @@ -177,21 +202,24 @@ def _get_scores( aggregation = aggregation if aggregation is not None else np.mean else: use_fast = True + aggregation = cast(Callable[[list[Score]], Any], aggregation) + getter = cast(Callable[[ScoresDict], Score], getter) + if format == "wide": scores = {} for res in self.task_results: try: if use_fast: scores[res.task_name] = res._get_score_fast( - splits=splits, # type: ignore - languages=languages, # type: ignore + splits=splits, + languages=languages, ) else: scores[res.task_name] = res.get_score( splits=splits, languages=languages, - aggregation=aggregation, # type: ignore - getter=getter, # type: ignore + aggregation=aggregation, + getter=getter, scripts=scripts, ) except Exception as e: @@ -206,14 +234,14 @@ def _get_scores( if use_fast: score = task_res._get_score_fast( splits=splits, - languages=languages, # type: ignore + languages=languages, ) else: score = task_res.get_score( splits=splits, languages=languages, - aggregation=aggregation, # type: ignore - getter=getter, # type: ignore + aggregation=aggregation, + getter=getter, scripts=scripts, ) entry = dict( @@ -317,7 +345,7 @@ def to_dataframe( def __hash__(self) -> int: return id(self) - def __iter__(self) -> Iterable[TaskResult]: + def __iter__(self) -> Iterable[TaskResult]: # type: ignore[override] return iter(self.task_results) def __getitem__(self, index) -> TaskResult: @@ -370,13 +398,13 @@ def task_names(self) -> list[str]: return [task_res.task_name for task_res in self.task_results] @property - def modalities(self) -> list[str]: + def 
modalities(self) -> list[Modalities]: """Get all modalities in the task results. Returns: A list of modalities in the task results. """ - mods = [] + mods: list[Modalities] = [] for task_res in self.task_results: task_modalities = getattr(task_res, "modalities", []) mods.extend(task_modalities) diff --git a/mteb/results/task_result.py b/mteb/results/task_result.py index b46856e5db..723b924d0c 100644 --- a/mteb/results/task_result.py +++ b/mteb/results/task_result.py @@ -3,9 +3,8 @@ import json import logging import warnings -from argparse import Namespace from collections import defaultdict -from collections.abc import Callable, Iterable +from collections.abc import Callable, Iterable, Mapping from functools import cached_property from importlib.metadata import version from pathlib import Path @@ -17,8 +16,11 @@ from pydantic import BaseModel, field_validator from typing_extensions import Self +from mteb import TaskMetadata from mteb._helpful_enum import HelpfulStrEnum +from mteb.abstasks import AbsTaskClassification from mteb.abstasks.abstask import AbsTask +from mteb.abstasks.task_metadata import TaskDomain from mteb.languages import LanguageScripts from mteb.models.model_meta import ScoringFunction from mteb.types import ( @@ -40,67 +42,59 @@ class Criteria(HelpfulStrEnum): DATASET_REVISION = "dataset_revision" -class ScalaNbClassificationDummy: +class ScalaNbClassificationDummy(AbsTaskClassification): """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( # type: ignore + metadata = TaskMetadata( name="ScalaNbClassification", + description="A dummy", main_score="accuracy", type="Classification", - hf_subsets_to_langscripts={ - "default": ["nob-Latn"], - }, - dataset={"revision": "revision_not_applicable"}, - revision="revision_not_applicable", + eval_langs=["nob-Latn"], + dataset={"path": "not/exists", "revision": "revision_not_applicable"}, ) -class ScalaNnClassificationDummy: +class 
ScalaNnClassificationDummy(AbsTaskClassification): """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( # type: ignore + metadata = TaskMetadata( name="ScalaNnClassification", + description="A dummy", main_score="accuracy", type="Classification", - hf_subsets_to_langscripts={ - "default": ["nno-Latn"], - }, - dataset={"revision": "revision_not_applicable"}, - revision="revision_not_applicable", + eval_langs=["nob-Latn"], + dataset={"path": "not/exists", "revision": "revision_not_applicable"}, ) -class ScalaDaClassificationDummy: +class ScalaDaClassificationDummy(AbsTaskClassification): """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( # type: ignore + metadata = TaskMetadata( name="ScalaDaClassification", + description="A dummy", main_score="accuracy", type="Classification", - hf_subsets_to_langscripts={ - "default": ["dan-Latn"], - }, - dataset={"revision": "revision_not_applicable"}, - revision="revision_not_applicable", + eval_langs=["dan-Latn"], + dataset={"path": "not/exists", "revision": "revision_not_applicable"}, ) -class ScalaSvClassificationDummy: +class ScalaSvClassificationDummy(AbsTaskClassification): """A dummy task for loading historic results from before v1.11.0""" - metadata = Namespace( # type: ignore + metadata = TaskMetadata( name="ScalaSvClassification", + description="A dummy", main_score="accuracy", type="Classification", - hf_subsets_to_langscripts={ - "default": ["swe-Latn"], - }, - dataset={"revision": "revision_not_applicable"}, - revision="revision_not_applicable", + eval_langs=["swe-Latn"], + dataset={"path": "not/exists", "revision": "revision_not_applicable"}, ) -outdated_tasks = { +outdated_tasks: dict[str, type[AbsTask]] = { "ScalaNbClassification": ScalaNbClassificationDummy, "ScalaNnClassification": ScalaNnClassificationDummy, "ScalaDaClassification": ScalaDaClassificationDummy, @@ -167,10 +161,10 @@ class TaskResult(BaseModel): def from_task_results( 
cls, task: AbsTask | type[AbsTask], - scores: dict[SplitName, dict[HFSubset, ScoresDict]], + scores: dict[SplitName, Mapping[HFSubset, ScoresDict]], evaluation_time: float, kg_co2_emissions: float | None = None, - ) -> Self: + ) -> TaskResult: """Create a TaskResult from the task and scores. Args: @@ -247,12 +241,12 @@ def task(self) -> AbsTask: return get_task(self.task_name) @property - def domains(self) -> list[str]: + def domains(self) -> list[TaskDomain]: """Get the domains of the task.""" doms = self.task.metadata.domains if doms is None: doms = [] - return doms # type: ignore + return doms @property def task_type(self) -> str: @@ -308,7 +302,7 @@ def _round_scores(self, scores: dict[SplitName, list[ScoresDict]], n: int) -> No if isinstance(v, dict): self._round_scores(v, n) elif isinstance(v, float): - value[i] = round(v, n) + value[i] = round(v, n) # type: ignore[call-overload] elif isinstance(value, float): scores[key] = round(value, n) @@ -326,7 +320,7 @@ def to_disk(self, path: Path) -> None: json.dump(json_obj, f, indent=2) @classmethod - def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self: # type: ignore + def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult: """Load TaskResult from disk. 
Args: @@ -357,7 +351,7 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> Self: # type ) # assume it is before 1.11.0 if the version is not present try: - obj = cls.model_validate(data) + obj: TaskResult = cls.model_validate(data) except Exception as e: if not pre_1_11_load: raise e @@ -382,6 +376,7 @@ def _fix_pair_classification_scores(cls, obj: TaskResult) -> None: from mteb import get_task task_name = obj.task_name + task: AbsTask | type[AbsTask] if task_name in outdated_tasks: task = outdated_tasks[task_name] else: @@ -394,11 +389,11 @@ def _fix_pair_classification_scores(cls, obj: TaskResult) -> None: for key in list(hf_subset_scores.keys()): if isinstance(hf_subset_scores[key], dict): for k, v in hf_subset_scores[key].items(): - hf_subset_scores[f"{key}_{k}"] = v - hf_subset_scores.pop(key) + hf_subset_scores[f"{key}_{k}"] = v # type: ignore[index] + hf_subset_scores.pop(key) # type: ignore[attr-defined] @classmethod - def _convert_from_before_v1_11_0(cls, data: dict) -> Self: + def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult: from mteb.get_tasks import _TASKS_REGISTRY # in case the task name is not found in the registry, try to find a lower case version @@ -484,7 +479,7 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> Self: scores["test"]["fra-fra"] = scores["test"].pop("fr") result: TaskResult = TaskResult.from_task_results( - task, # type: ignore + task, scores, evaluation_time, kg_co2_emissions=None, @@ -535,7 +530,7 @@ def get_score( def _get_score_fast( self, splits: Iterable[str] | None = None, - languages: str | None = None, + languages: list[ISOLanguage | ISOLanguageScript] | None = None, subsets: Iterable[str] | None = None, ) -> float: """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified. 
@@ -584,7 +579,7 @@ def _get_score_fast( return val_sum / n_val @classmethod - def from_validated(cls, **data) -> Self: + def from_validated(cls, **data) -> TaskResult: """Create a TaskResult from validated data. Returns: @@ -595,13 +590,13 @@ def from_validated(cls, **data) -> Self: def __repr__(self) -> str: return f"TaskResult(task_name={self.task_name}, scores=...)" - def only_main_score(self) -> Self: + def only_main_score(self) -> TaskResult: """Return a new TaskResult object with only the main score. Returns: A new TaskResult object with only the main score. """ - new_scores = {} + new_scores: dict[str, list[Score]] = {} for split in self.scores: new_scores[split] = [] for subset_scores in self.scores[split]: @@ -613,10 +608,9 @@ def only_main_score(self) -> Self: } ) new_res = {**self.to_dict(), "scores": new_scores} - new_res = TaskResult.from_validated(**new_res) - return new_res + return TaskResult.from_validated(**new_res) - def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self: + def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult: """Validate and filter the scores against the task metadata. This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata. @@ -638,7 +632,7 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> Self: splits = task.eval_splits hf_subsets = set(task.hf_subsets) # Convert to set once - new_scores = {} + new_scores: dict[str, list[Score]] = {} seen_splits = set() for split in self.scores: if split not in splits: @@ -739,7 +733,7 @@ def merge( "mteb_version", "dataset_revision", ], - ) -> Self: + ) -> TaskResult: """Merges two TaskResult objects. 
Args: diff --git a/mteb/similarity_functions.py b/mteb/similarity_functions.py index 1624a034d1..cd5f32abb6 100644 --- a/mteb/similarity_functions.py +++ b/mteb/similarity_functions.py @@ -186,7 +186,7 @@ def max_sim(a: Array, b: Array) -> torch.Tensor: b, ) - return scores.max(axis=-1).values.sum(axis=-1) + return scores.max(axis=-1).values.sum(axis=-1) # type: ignore[call-overload] # https://github.com/lightonai/pylate/blob/2d094a724866d6e15701781528368438081c0157/pylate/scores/scores.py#L67C1-L122C38 @@ -217,7 +217,7 @@ def pairwise_max_sim( document_embedding, ) - scores.append(query_document_score.max(axis=-1).values.sum()) + scores.append(query_document_score.max(axis=-1).values.sum()) # type: ignore[call-overload] return torch.stack(scores, dim=0) @@ -317,11 +317,15 @@ def similarity(text_embeddings: Array, input_embeddings: Array) -> Array: Returns: Matrix with similarities """ - text_embeddings = _convert_to_tensor(text_embeddings) - input_embeddings = _convert_to_tensor(input_embeddings) + text_embeddings_tensor = _convert_to_tensor(text_embeddings) + input_embeddings_tensor = _convert_to_tensor(input_embeddings) - text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True) - input_embeddings = input_embeddings / input_embeddings.norm(dim=-1, keepdim=True) - logits = torch.matmul(input_embeddings, text_embeddings.T) + text_embeddings_tensor = text_embeddings_tensor / text_embeddings_tensor.norm( + dim=-1, keepdim=True + ) + input_embeddings_tensor = input_embeddings_tensor / input_embeddings_tensor.norm( + dim=-1, keepdim=True + ) + logits = torch.matmul(input_embeddings_tensor, text_embeddings_tensor.T) probs = (logits * 100).softmax(dim=-1) return probs diff --git a/mteb/tasks/classification/dan/dk_hate_classification.py b/mteb/tasks/classification/dan/dk_hate_classification.py index 4d027e3576..ef0b6b2783 100644 --- a/mteb/tasks/classification/dan/dk_hate_classification.py +++ b/mteb/tasks/classification/dan/dk_hate_classification.py 
@@ -62,7 +62,7 @@ class DKHateClassification(AbsTaskClassification): def dataset_transform(self): # convert label to a 0/1 label - labels = self.dataset["train"]["label"] # type: ignore + labels = self.dataset["train"]["label"] lab2idx = {lab: idx for idx, lab in enumerate(set(labels))} self.dataset = self.dataset.map( lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"] diff --git a/mteb/tasks/classification/est/estonian_valence.py b/mteb/tasks/classification/est/estonian_valence.py index b55ef82e0e..e1c12db718 100644 --- a/mteb/tasks/classification/est/estonian_valence.py +++ b/mteb/tasks/classification/est/estonian_valence.py @@ -45,7 +45,7 @@ def dataset_transform(self): "valence", "label" ) # convert label to a numbers - labels = self.dataset["train"]["label"] # type: ignore + labels = self.dataset["train"]["label"] lab2idx = {lab: idx for idx, lab in enumerate(set(labels))} self.dataset = self.dataset.map( lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"] diff --git a/mteb/tasks/classification/multilingual/scala_classification.py b/mteb/tasks/classification/multilingual/scala_classification.py index bd13823046..0ab16f7127 100644 --- a/mteb/tasks/classification/multilingual/scala_classification.py +++ b/mteb/tasks/classification/multilingual/scala_classification.py @@ -57,7 +57,7 @@ class ScalaClassification(AbsTaskClassification): def dataset_transform(self): for lang in self.dataset.keys(): # convert label to a 0/1 label - labels = self.dataset[lang]["train"]["label"] # type: ignore + labels = self.dataset[lang]["train"]["label"] lab2idx = {lab: idx for idx, lab in enumerate(set(labels))} self.dataset[lang] = self.dataset[lang].map( lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"] diff --git a/mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py b/mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py index 62a34df671..b3a6b7d90b 100644 --- 
a/mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +++ b/mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py @@ -49,7 +49,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset = datasets.DatasetDict({"test": self.dataset["train"]}) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/retrieval/code/code_rag.py b/mteb/tasks/retrieval/code/code_rag.py index a9e291346c..b62ea8bccb 100644 --- a/mteb/tasks/retrieval/code/code_rag.py +++ b/mteb/tasks/retrieval/code/code_rag.py @@ -48,14 +48,14 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval): "path": "code-rag-bench/programming-solutions", "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6", }, - **common_args, # type: ignore + **common_args, ) def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -71,7 +71,7 @@ def dataset_transform(self) -> None: self.queries = {} split = self.metadata.eval_splits[0] - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} @@ -105,14 +105,14 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval): "path": "code-rag-bench/online-tutorials", "revision": "095bb77130082e4690d6c3a031997b03487bf6e2", }, - **common_args, # type: ignore + **common_args, ) def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) 
self.dataset_transform() self.data_loaded = True @@ -128,7 +128,7 @@ def dataset_transform(self) -> None: self.queries = {} split = self.metadata.eval_splits[0] - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} @@ -165,14 +165,14 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval): "path": "code-rag-bench/library-documentation", "revision": "b530d3b5a25087d2074e731b76232db85b9e9107", }, - **common_args, # type: ignore + **common_args, ) def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -188,7 +188,7 @@ def dataset_transform(self) -> None: self.queries = {} split = self.metadata.eval_splits[0] - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} @@ -222,14 +222,14 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval): "path": "code-rag-bench/stackoverflow-posts", "revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4", }, - **common_args, # type: ignore + **common_args, ) def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -245,7 +245,7 @@ def dataset_transform(self) -> None: self.queries = {} split = self.metadata.eval_splits[0] - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} diff --git a/mteb/tasks/retrieval/dan/dan_fever_retrieval.py 
b/mteb/tasks/retrieval/dan/dan_fever_retrieval.py index c651e60f77..bbc6edd380 100644 --- a/mteb/tasks/retrieval/dan/dan_fever_retrieval.py +++ b/mteb/tasks/retrieval/dan/dan_fever_retrieval.py @@ -51,7 +51,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True diff --git a/mteb/tasks/retrieval/dan/tv2_nordretrieval.py b/mteb/tasks/retrieval/dan/tv2_nordretrieval.py index 98273e109d..12447cb07b 100644 --- a/mteb/tasks/retrieval/dan/tv2_nordretrieval.py +++ b/mteb/tasks/retrieval/dan/tv2_nordretrieval.py @@ -64,7 +64,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -81,7 +81,7 @@ def dataset_transform(self) -> None: text2id = {} for split in self.dataset: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) ds = ds.select( range(2048) diff --git a/mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py b/mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py index 651e46840f..92fe5feed0 100644 --- a/mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +++ b/mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py @@ -40,7 +40,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -57,7 +57,7 @@ def dataset_transform(self) -> None: text2id = {} for split in self.dataset: - ds: 
datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.map(answers_to_list) self.queries[split] = {} diff --git a/mteb/tasks/retrieval/nob/norquad.py b/mteb/tasks/retrieval/nob/norquad.py index 54d41e8c57..43b6b35c15 100644 --- a/mteb/tasks/retrieval/nob/norquad.py +++ b/mteb/tasks/retrieval/nob/norquad.py @@ -54,7 +54,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -71,7 +71,7 @@ def dataset_transform(self) -> None: text2id = {} for split in self.dataset: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) max_samples = min(1024, len(ds)) ds = ds.select( diff --git a/mteb/tasks/retrieval/nob/snl_retrieval.py b/mteb/tasks/retrieval/nob/snl_retrieval.py index 41322ac5b5..4cfdcc7503 100644 --- a/mteb/tasks/retrieval/nob/snl_retrieval.py +++ b/mteb/tasks/retrieval/nob/snl_retrieval.py @@ -41,7 +41,7 @@ def load_data(self) -> None: """Load dataset from HuggingFace hub""" if self.data_loaded: return - self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset = datasets.load_dataset(**self.metadata.dataset) self.dataset_transform() self.data_loaded = True @@ -58,7 +58,7 @@ def dataset_transform(self) -> None: text2id = {} for split in self.dataset: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) self.queries[split] = {} diff --git a/mteb/tasks/retrieval/tur/tur_hist_quad.py b/mteb/tasks/retrieval/tur/tur_hist_quad.py index cd56138132..a189379ce3 100644 --- a/mteb/tasks/retrieval/tur/tur_hist_quad.py +++ b/mteb/tasks/retrieval/tur/tur_hist_quad.py @@ -59,7 +59,7 @@ def load_data(self, 
**kwargs) -> None: text2id = {} for split in self.metadata.eval_splits: - ds: datasets.Dataset = self.dataset[split] # type: ignore + ds: datasets.Dataset = self.dataset[split] ds = ds.shuffle(seed=42) max_samples = min(1024, len(ds)) ds = ds.select( diff --git a/mteb/types/_result.py b/mteb/types/_result.py index 848bb8e713..edb0f57bfb 100644 --- a/mteb/types/_result.py +++ b/mteb/types/_result.py @@ -1,3 +1,4 @@ +from collections.abc import Mapping from typing import Any, NamedTuple HFSubset = str @@ -8,7 +9,7 @@ Score = Any """A score value, could e.g. be accuracy. Normally it is a float or int, but it can take on any value. Should be json serializable.""" -ScoresDict = dict[str, Score] +ScoresDict = Mapping[str, Score] """A dictionary of scores, typically also include metadata, e.g {'main_score': 0.5, 'accuracy': 0.5, 'f1': 0.6, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}""" diff --git a/mteb/types/statistics.py b/mteb/types/statistics.py index 6be1e50ae9..97737c387c 100644 --- a/mteb/types/statistics.py +++ b/mteb/types/statistics.py @@ -10,8 +10,14 @@ class SplitDescriptiveStatistics(TypedDict): class DescriptiveStatistics(TypedDict, SplitDescriptiveStatistics): - """Class for descriptive statistics for the full task.""" + """Class for descriptive statistics for the full task. 
+ Attributes: + num_samples: Total number of samples + hf_subset_descriptive_stats: HFSubset descriptive statistics (only for multilingual datasets) + """ + + num_samples: int hf_subset_descriptive_stats: NotRequired[dict[HFSubset, SplitDescriptiveStatistics]] @@ -88,9 +94,9 @@ class ScoreStatistics(TypedDict): max_score: Maximum score """ - min_score: int + min_score: int | float avg_score: float - max_score: int + max_score: int | float class TopRankedStatistics(TypedDict): diff --git a/pyproject.toml b/pyproject.toml index a38ec215c9..abddef89d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,7 +124,7 @@ docs = [ "tabulate>=0.9.0", ] typing = [ - "mypy>=1.18.1", + "mypy==1.19.1", "types-cachetools>=6.2.0.20250827", "types-pysocks>=1.7.1.20250828", "types-pyyaml>=6.0.12.20250822", @@ -132,9 +132,22 @@ typing = [ "types-simplejson>=3.20.0.20250822", "types-tqdm>=4.67.0.20250809", "types-tensorflow>=2.18.0.20250809", - # stubs require python >=3.10 - # "pandas-stubs>=2.3.2.250827", - # "scipy-stubs>=1.15.3.0", + "types-pygments>=2.19.0.20251121", + "types-colorama>=0.4.15.20250801", + "types-gevent>=25.9.0.20251102", + "types-networkx>=3.6.1.20251220", + "types-openpyxl>=3.1.5.20250919", + "types-psutil>=7.1.3.20251211", + "types-python-dateutil>=2.9.0.20251115", + "types-pywin32>=311.0.0.20251008", + "types-regex>=2025.11.3.20251106", + "types-setuptools>=80.9.0.20251221", + "types-tabulate>=0.9.0.20241207", + "types-xlrd>=2.0.0.20251020", + "pandas-stubs>=2.3.2.250926", + "scipy-stubs>=1.15.3.0", + "types-defusedxml>=0.7.0.20250822", + "pillow>=12.0.0", ] dev = [ {include-group = "lint"}, @@ -219,6 +232,7 @@ select = [ "PTH", # use pathlib "TID", # tidy-imports "D", # pydocstyle + "PGH", # pygrep-hooks Use specific rule codes when ignoring type issues ] ignore = [ @@ -325,8 +339,10 @@ plugins = ['pydantic.mypy'] # these modules not typed and don't have stubs module = [ "datasets", + "datasets.exceptions", "sklearn", "sklearn.*", + "faiss", ] 
ignore_missing_imports = true diff --git a/tests/mock_models.py b/tests/mock_models.py index 5d1d54bb9b..53ac737381 100644 --- a/tests/mock_models.py +++ b/tests/mock_models.py @@ -102,7 +102,7 @@ def encode( normalize_embeddings: bool = False, **kwargs, ) -> list[Tensor] | np.ndarray | Tensor: - return torch.randn(len(sentences), 10, dtype=torch.bfloat16) # type: ignore + return torch.randn(len(sentences), 10, dtype=torch.bfloat16) class MockSentenceTransformerWrapper(SentenceTransformerEncoderWrapper): diff --git a/tests/mock_tasks.py b/tests/mock_tasks.py index 3cdeb98881..8cbce53ae1 100644 --- a/tests/mock_tasks.py +++ b/tests/mock_tasks.py @@ -112,7 +112,7 @@ def instruction_retrieval_datasplit() -> RetrievalSplitData: class MockClassificationTask(AbsTaskClassification): - classifier = LogisticRegression(n_jobs=1, max_iter=10) # type: ignore + classifier = LogisticRegression(n_jobs=1, max_iter=10) expected_stats = { "test": { @@ -159,7 +159,7 @@ class MockClassificationTask(AbsTaskClassification): type="Classification", name="MockClassificationTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -315,7 +315,7 @@ class MockMultilingualClassificationTask(AbsTaskClassification): type="Classification", name="MockMultilingualClassificationTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -374,7 +374,7 @@ class MockBitextMiningTask(AbsTaskBitextMining): type="BitextMining", name="MockBitextMiningTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -464,7 +464,7 @@ class MockMultilingualBitextMiningTask(AbsTaskBitextMining): type="BitextMining", name="MockMultilingualBitextMiningTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -559,7 +559,7 @@ class 
MockMultilingualParallelBitextMiningTask(AbsTaskBitextMining): type="BitextMining", name="MockMultilingualParallelBitextMiningTask", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = { "eng_Latn-fra_Latn": ["eng-Latn", "fra-Latn"], @@ -612,7 +612,7 @@ class MockClusteringTask(AbsTaskClusteringLegacy): type="Clustering", name="MockClusteringTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -710,7 +710,7 @@ class MockMultilingualClusteringTask(AbsTaskClusteringLegacy): type="Clustering", name="MockMultilingualClusteringTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -771,7 +771,7 @@ class MockClusteringFastTask(AbsTaskClustering): type="Clustering", name="MockClusteringFastTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -870,7 +870,7 @@ class MockMultilingualClusteringFastTask(AbsTaskClustering): type="Clustering", name="MockMultilingualClusteringFastTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -935,7 +935,7 @@ class MockPairClassificationTask(AbsTaskPairClassification): type="PairClassification", name="MockPairClassificationTask", main_score="similarity_ap", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -1054,7 +1054,7 @@ class MockMultilingualPairClassificationTask(AbsTaskPairClassification): type="PairClassification", name="MockMultilingualPairClassificationTask", main_score="similarity_ap", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -1125,7 +1125,7 @@ class MockPairImageClassificationTask(AbsTaskPairClassification): type="PairClassification", name="MockPairImageClassificationTask", main_score="similarity_ap", - 
**general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] @@ -1191,7 +1191,7 @@ class MockSTSTask(AbsTaskSTS): type="STS", name="MockSTSTask", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -1303,7 +1303,7 @@ class MockMultilingualSTSTask(AbsTaskSTS): type="STS", name="MockMultilingualSTSTask", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -1368,7 +1368,7 @@ class MockSummarizationTask(AbsTaskSummarization): type="Summarization", name="MockSummarizationTask", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -1497,7 +1497,7 @@ class MockMultilingualSummarizationTask(AbsTaskSummarization): type="Summarization", name="MockMultilingualSummarizationTask", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -1575,7 +1575,7 @@ class MockRerankingTask(AbsTaskRetrieval): type="Reranking", name="MockRerankingTask", main_score="map_at_1000", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -1694,7 +1694,7 @@ class MockMultilingualRerankingTask(AbsTaskRetrieval): type="Reranking", name="MockMultilingualRerankingTask", main_score="map_at_10", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -1772,7 +1772,7 @@ class MockRetrievalTask(AbsTaskRetrieval): type="Retrieval", name="MockRetrievalTask", main_score="ndcg_at_10", - **dict(general_args | {"eval_splits": ["val", "test"]}), # type: ignore + **dict(general_args | {"eval_splits": ["val", "test"]}), ) def load_data(self) -> None: @@ -1848,7 +1848,7 @@ class MockRetrievalDialogTask(AbsTaskRetrieval): type="Retrieval", name="MockRetrievalDialogTask", main_score="ndcg_at_10", - **dict(general_args | {"eval_splits": ["val", 
"test"]}), # type: ignore + **dict(general_args | {"eval_splits": ["val", "test"]}), ) def load_data(self) -> None: @@ -2057,7 +2057,7 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval): type="Retrieval", name="MockMultilingualRetrievalTask", main_score="ndcg_at_10", - **dict(general_args | {"eval_splits": ["val", "test"]}), # type: ignore + **dict(general_args | {"eval_splits": ["val", "test"]}), ) metadata.eval_langs = multilingual_eval_langs @@ -2118,7 +2118,7 @@ class MockMultilabelClassification(AbsTaskMultilabelClassification): type="MultilabelClassification", name="MockMultilabelClassification", main_score="lrap", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -2271,7 +2271,7 @@ class MockMultilingualMultilabelClassification(AbsTaskMultilabelClassification): type="MultilabelClassification", name="MockMultilingualMultilabelClassification", main_score="lrap", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -2340,7 +2340,7 @@ class MockInstructionRetrieval(AbsTaskRetrieval): type="InstructionRetrieval", name="MockInstructionRetrieval", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -2392,7 +2392,7 @@ class MockInstructionReranking(AbsTaskRetrieval): type="InstructionReranking", name="MockInstructionReranking", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) def load_data(self) -> None: @@ -2495,7 +2495,7 @@ class MockMultilingualInstructionRetrieval(AbsTaskRetrieval): type="InstructionRetrieval", name="MockMultilingualInstructionRetrieval", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -2618,7 +2618,7 @@ class MockMultilingualInstructionReranking(AbsTaskRetrieval): type="InstructionReranking", name="MockMultilingualInstructionReranking", main_score="ndcg_at_10", - **general_args, # type: ignore 
+ **general_args, ) metadata.eval_langs = multilingual_eval_langs @@ -2640,7 +2640,7 @@ class MockAggregatedTask(AbsTaskAggregate): MockRetrievalTask(), MockRerankingTask(), ], - **general_args, # type: ignore + **general_args, ) @@ -2695,7 +2695,7 @@ class MockMultiChoiceTask(AbsTaskRetrieval): type="Any2AnyMultiChoice", name="MockMultiChoice", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "it2i" @@ -2878,7 +2878,7 @@ class MockMultilingualMultiChoiceTask(AbsTaskRetrieval): type="Any2AnyMultiChoice", name="MockMultilingualMultiChoice", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.eval_langs = multilingual_eval_langs metadata.modalities = ["image", "text"] @@ -2970,7 +2970,7 @@ class MockAny2AnyRetrievalI2TTask(AbsTaskRetrieval): type="Any2AnyRetrieval", name="MockAny2AnyRetrievalI2T", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "i2t" @@ -3049,7 +3049,7 @@ class MockAny2AnyRetrievalT2ITask(AbsTaskRetrieval): type="Any2AnyRetrieval", name="MockAny2AnyRetrievalT2I", main_score="ndcg_at_10", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "t2i" @@ -3140,7 +3140,7 @@ class MockImageClassificationTask(AbsTaskClassification): type="ImageClassification", name="MockImageClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2c" @@ -3316,7 +3316,7 @@ class MockMultilingualImageClassificationTask(AbsTaskClassification): type="ImageClassification", name="MockMultilingualImageClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2c" @@ -3383,7 +3383,7 @@ class MockImageClusteringTask(AbsTaskClusteringLegacy): 
type="ImageClustering", name="MockImageClustering", main_score="nmi", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] input_column_name = "image" @@ -3439,7 +3439,7 @@ class MockImageClusteringFastTask(AbsTaskClustering): type="ImageClustering", name="MockImageClusteringFastTask", main_score="v_measure", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] input_column_name = "image" @@ -3529,7 +3529,7 @@ class MockImageMultilabelClassificationTask(AbsTaskMultilabelClassification): type="ImageMultilabelClassification", name="MockImageMultilabelClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2c" @@ -3735,7 +3735,7 @@ class MockMultilingualImageMultilabelClassificationTask( type="ImageMultilabelClassification", name="MockMultilingualImageMultilabelClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.eval_langs = multilingual_eval_langs @@ -3802,7 +3802,7 @@ class MockImageTextPairClassificationTask(AbsTaskImageTextPairClassification): type="Compositionality", name="MockImageTextPairClassification", main_score="text_acc", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "i2t" @@ -3898,7 +3898,7 @@ class MockMultilingualImageTextPairClassificationTask( type="Compositionality", name="MockMultilingualImageTextPairClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "i2t" @@ -3965,7 +3965,7 @@ class MockVisualSTSTask(AbsTaskSTS): type="VisualSTS(eng)", name="MockVisualSTS", main_score="cosine_spearman", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2i" @@ -4031,7 +4031,7 @@ class 
MockZeroShotClassificationTask(AbsTaskZeroShotClassification): type="ZeroShotClassification", name="MockZeroShotClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image", "text"] metadata.category = "i2t" @@ -4095,7 +4095,7 @@ class MockTextZeroShotClassificationTask(AbsTaskZeroShotClassification): type="ZeroShotClassification", name="MockTextZeroShotClassification", main_score="accuracy", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["text"] metadata.category = "t2t" @@ -4155,7 +4155,7 @@ class MockRegressionTask(AbsTaskRegression): type="Regression", name="MockRegressionTask", main_score="kendalltau", - **general_args, # type: ignore + **general_args, ) def load_data(self, **kwargs): @@ -4221,7 +4221,7 @@ class MockImageRegressionTask(AbsTaskRegression): type="Regression", name="MockRegressionTask", main_score="kendalltau", - **general_args, # type: ignore + **general_args, ) metadata.modalities = ["image"] metadata.category = "i2c" diff --git a/tests/test_abstasks/test_task_metadata.py b/tests/test_abstasks/test_task_metadata.py index 4af8e7d443..a05d24eb2b 100644 --- a/tests/test_abstasks/test_task_metadata.py +++ b/tests/test_abstasks/test_task_metadata.py @@ -51,7 +51,7 @@ def test_given_dataset_config_then_it_is_valid(): def test_given_missing_dataset_path_then_it_throws(): with pytest.raises(ValueError): - TaskMetadata( # type: ignore + TaskMetadata( name="MyTask", description="testing", reference=None, diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index fd1653a395..2a60e0df74 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -66,7 +66,7 @@ def test_evaluate_with_cache( path = cache.get_task_result_path( task.metadata.name, results.model_name.replace("/", "__"), - results.model_revision, # type: ignore + results.model_revision, ) model_meta_path = path.parent / "model_meta.json" assert path.exists() and path.is_file(), "cache file 
should exist" diff --git a/tests/test_filter_tasks.py b/tests/test_filter_tasks.py index 8a3860cd31..19b2ac6ce3 100644 --- a/tests/test_filter_tasks.py +++ b/tests/test_filter_tasks.py @@ -32,7 +32,7 @@ def test_filter_tasks( languages: list[str], script: list[str], domains: list[TaskDomain], - task_types: list[TaskType] | None, # type: ignore + task_types: list[TaskType] | None, ): """Tests that get_tasks filters tasks correctly. This could in principle be combined with the following tests, but they have been kept separate to reduce the grid size. @@ -67,7 +67,7 @@ def test_filter_tasks_superseded( all_tasks: list[AbsTask], languages: list[str], domains: list[TaskDomain], - task_types: list[TaskType] | None, # type: ignore + task_types: list[TaskType] | None, exclude_superseded_datasets: bool, ): tasks = filter_tasks( diff --git a/tests/test_get_tasks.py b/tests/test_get_tasks.py index cf7ed0ad17..a1583415f9 100644 --- a/tests/test_get_tasks.py +++ b/tests/test_get_tasks.py @@ -58,7 +58,7 @@ def test_get_tasks_filtering(): @pytest.mark.parametrize("modalities", [["text"], ["image"], None]) def test_mteb_mteb_tasks( script: list[str], - task_types: list[TaskType] | None, # type: ignore + task_types: list[TaskType] | None, modalities: list[Modalities] | None, ): tasks = mteb.get_tasks(script=script, task_types=task_types, modalities=modalities) diff --git a/tests/test_tasks/test_task_quality.py b/tests/test_tasks/test_task_quality.py index ba9f086432..90180e8b28 100644 --- a/tests/test_tasks/test_task_quality.py +++ b/tests/test_tasks/test_task_quality.py @@ -281,7 +281,7 @@ def _split_quality( ) -> list[str]: errors = [] - num_samples = split_stats["num_samples"] # type: ignore + num_samples = split_stats["num_samples"] text_stats = split_stats.get("text_statistics", None) if text_stats: text_stats = cast(TextStatistics, text_stats)