diff --git a/docs/create_tasks_table.py b/docs/create_tasks_table.py index ac16f0313d..6ee4bee7ab 100644 --- a/docs/create_tasks_table.py +++ b/docs/create_tasks_table.py @@ -79,7 +79,7 @@ def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str: if lang in PROGRAMMING_LANGS: lang = "code" if table_dict.get(lang) is None: - table_dict[lang] = {k: 0 for k in sorted(get_args(TASK_TYPE))} + table_dict[lang] = dict.fromkeys(sorted(get_args(TASK_TYPE)), 0) table_dict[lang][task.metadata.type] += 1 ## Wrangle for polars diff --git a/mteb/abstasks/stratification.py b/mteb/abstasks/stratification.py index b7e93b7d2e..9df608fa8a 100644 --- a/mteb/abstasks/stratification.py +++ b/mteb/abstasks/stratification.py @@ -216,7 +216,7 @@ def _prepare_stratification(self, y: np.ndarray) -> tuple: [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)] ) rows = sp.lil_matrix(y).rows - rows_used = {i: False for i in range(self.n_samples)} + rows_used = dict.fromkeys(range(self.n_samples), False) all_combinations = [] per_row_combinations = [[] for i in range(self.n_samples)] samples_with_combination = {} diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index cdf497e5a6..69034b1741 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -261,7 +261,7 @@ def search_cross_encoder( logging.info( f"previous_results is None. Using all the documents to rerank: {len(corpus)}" ) - q_results = {doc_id: 0.0 for doc_id in corpus.keys()} + q_results = dict.fromkeys(corpus.keys(), 0.0) else: q_results = self.previous_results[qid] # take the top-k only diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index fbc01496e8..5286680dc9 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -258,7 +258,7 @@ def apply_styling( joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = joint_table.style.format( { - **{column: "{:.2f}" for column in score_columns}, + **dict.fromkeys(score_columns, "{:.2f}"), "Rank (Borda)": "{:.0f}", }, na_rep="", diff --git a/mteb/task_aggregation.py b/mteb/task_aggregation.py index e5ce47a4d0..d380333e12 100644 --- a/mteb/task_aggregation.py +++ b/mteb/task_aggregation.py @@ -109,8 +109,7 @@ def borda_count( results = results.to_legacy_dict() n_candidates = sum(len(revs) for revs in results.values()) candidate_scores = { - model: {revision: 0.0 for revision in revisions} - for model, revisions in results.items() + model: dict.fromkeys(revisions, 0.0) for model, revisions in results.items() } tasks = defaultdict(list) # {task_name: [(model, revision, score), ...]} diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py index b67c2c0262..9ae6f30c00 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py @@ -24,9 +24,9 @@ def _load_wit_data( path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None ): - corpus = {lang: {split: None for split in splits} for lang in langs} - queries = {lang: {split: None for split in splits} for lang in langs} - relevant_docs = {lang: {split: None for split in splits} for lang in langs} + corpus = {lang: dict.fromkeys(splits) for lang in langs} + queries = {lang: dict.fromkeys(splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(splits) for lang in langs} split = "test" diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py index 0e08df2b9d..c5d373568a 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py @@ -21,9 +21,9 @@ def _load_xflickrco_data( path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None ): - corpus = {lang: {split: None for split in splits} for lang in langs} - queries = {lang: {split: None for split in splits} for lang in langs} - relevant_docs = {lang: {split: None for split in splits} for lang in langs} + corpus = {lang: dict.fromkeys(splits) for lang in langs} + queries = {lang: dict.fromkeys(splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(splits) for lang in langs} split = "test" diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py index 17136c8e3a..d05758b6d6 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py @@ -49,9 +49,9 @@ def _load_xm3600_data( path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None ): - corpus = {lang: {split: None for split in splits} for lang in langs} - queries = {lang: {split: None for split in splits} for lang in langs} - relevant_docs = {lang: {split: None for split in splits} for lang in langs} + corpus = {lang: dict.fromkeys(splits) for lang in langs} + queries = {lang: dict.fromkeys(splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(splits) for lang in langs} split = "test" diff --git a/mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py b/mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py index 198d1bc1b5..ae9ec81117 100644 --- a/mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py +++ b/mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py @@ -87,9 +87,7 @@ def dataset_transform(self) -> None: answer_id = str(text2id[a]) answer_ids.append(answer_id) - self.relevant_docs[split][query_id] = { - answer_id: 1 for answer_id in answer_ids - } + self.relevant_docs[split][query_id] = dict.fromkeys(answer_ids, 1) def answers_to_list(example: dict) -> dict: diff --git a/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py b/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py index 380f30dcdf..189641f271 100644 --- a/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py @@ -74,7 +74,8 @@ def load_data(self, **kwargs): self.corpus = {self._EVAL_SPLIT: {row["_id"]: row for row in corpus_rows}} self.relevant_docs = { self._EVAL_SPLIT: { - row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows + row["_id"]: dict.fromkeys(row["text"].split(" "), 1) + for row in qrels_rows } } diff --git a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py index 9b280aaecf..0083fcff9e 100644 --- a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py @@ -84,7 +84,7 @@ def load_data(self, **kwargs): existing_docs=all_docs, ) corpus.update(neg_docs) - relevant_docs[q_id] = {k: 1 for k in pos_docs} + relevant_docs[q_id] = dict.fromkeys(pos_docs, 1) corpus = { key: doc.get("title", "") + " " + doc["text"] for key, doc in corpus.items() } diff --git a/mteb/tasks/Retrieval/eng/BrightRetrieval.py b/mteb/tasks/Retrieval/eng/BrightRetrieval.py index 35b5b2e0bb..4b91c24729 100644 --- a/mteb/tasks/Retrieval/eng/BrightRetrieval.py +++ b/mteb/tasks/Retrieval/eng/BrightRetrieval.py @@ -37,11 +37,9 @@ def load_bright_data( cache_dir: str | None = None, revision: str | None = None, ): - corpus = {domain: {split: None for split in eval_splits} for domain in DOMAINS} - queries = {domain: {split: None for split in eval_splits} for domain in DOMAINS} - relevant_docs = { - domain: {split: None for split in eval_splits} for domain in DOMAINS - } + corpus = {domain: dict.fromkeys(eval_splits) for domain in DOMAINS} + queries = {domain: dict.fromkeys(eval_splits) for domain in DOMAINS} + relevant_docs = {domain: dict.fromkeys(eval_splits) for domain in DOMAINS} for domain in domains: domain_corpus = datasets.load_dataset( diff --git a/mteb/tasks/Retrieval/multilingual/CUREv1Retrieval.py b/mteb/tasks/Retrieval/multilingual/CUREv1Retrieval.py index 6e97786a77..1492123e0c 100644 --- a/mteb/tasks/Retrieval/multilingual/CUREv1Retrieval.py +++ b/mteb/tasks/Retrieval/multilingual/CUREv1Retrieval.py @@ -120,15 +120,9 @@ def load_data(self, **kwargs): cache_dir = kwargs.get("cache_dir", None) # Iterate over splits and languages - corpus = { - language: {split: None for split in eval_splits} for language in languages - } - queries = { - language: {split: None for split in eval_splits} for language in languages - } - relevant_docs = { - language: {split: None for split in eval_splits} for language in languages - } + corpus = {language: dict.fromkeys(eval_splits) for language in languages} + queries = {language: dict.fromkeys(eval_splits) for language in languages} + relevant_docs = {language: dict.fromkeys(eval_splits) for language in languages} for split in eval_splits: # Since this is a cross-lingual dataset, the corpus and the relevant documents do not depend on the language split_corpus = self._load_corpus(split=split, cache_dir=cache_dir) diff --git a/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py b/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py index d32d4d4cbd..2942593b28 100644 --- a/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py @@ -39,9 +39,9 @@ def _load_miracl_data( revision: str | None = None, trust_remote_code: bool = False, ): - corpus = {lang: {split: None for split in splits} for lang in langs} - queries = {lang: {split: None for split in splits} for lang in langs} - relevant_docs = {lang: {split: None for split in splits} for lang in langs} + corpus = {lang: dict.fromkeys(splits) for lang in langs} + queries = {lang: dict.fromkeys(splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(splits) for lang in langs} split = _EVAL_SPLIT @@ -170,9 +170,9 @@ def _load_miracl_data_hard_negatives( revision: str | None = None, trust_remote_code: bool = False, ) -> tuple: - corpus = {lang: {split: None for split in splits} for lang in langs} - queries = {lang: {split: None for split in splits} for lang in langs} - relevant_docs = {lang: {split: None for split in splits} for lang in langs} + corpus = {lang: dict.fromkeys(splits) for lang in langs} + queries = {lang: dict.fromkeys(splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(splits) for lang in langs} split = _EVAL_SPLIT diff --git a/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py b/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py index e143dac611..17943d14ac 100644 --- a/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py @@ -32,9 +32,9 @@ def load_mldr_data( cache_dir: str = None, revision: str = None, ): - corpus = {lang: {split: None for split in eval_splits} for lang in langs} - queries = {lang: {split: None for split in eval_splits} for lang in langs} - relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs} + corpus = {lang: dict.fromkeys(eval_splits) for lang in langs} + queries = {lang: dict.fromkeys(eval_splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs} for lang in langs: lang_corpus = datasets.load_dataset( diff --git a/mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py b/mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py index 865473c0dc..efb55a0ea2 100644 --- a/mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py +++ b/mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py @@ -24,9 +24,9 @@ def load_neuclir_data( cache_dir: str | None = None, revision: str | None = None, ): - corpus = {lang: {split: None for split in eval_splits} for lang in langs} - queries = {lang: {split: None for split in eval_splits} for lang in langs} - relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs} + corpus = {lang: dict.fromkeys(eval_splits) for lang in langs} + queries = {lang: dict.fromkeys(eval_splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs} for lang in langs: lang_corpus = datasets.load_dataset( @@ -112,9 +112,9 @@ def load_neuclir_data_hard_negatives( revision: str | None = None, ): split = "test" - corpus = {lang: {split: None for split in eval_splits} for lang in langs} - queries = {lang: {split: None for split in eval_splits} for lang in langs} - relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs} + corpus = {lang: dict.fromkeys(eval_splits) for lang in langs} + queries = {lang: dict.fromkeys(eval_splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs} for lang in langs: corpus_identifier = f"corpus-{lang}" diff --git a/mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py b/mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py index f28198b474..7786d95f35 100644 --- a/mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py +++ b/mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py @@ -24,9 +24,9 @@ def load_neuclir_data( cache_dir: str | None = None, revision: str | None = None, ): - corpus = {lang: {split: None for split in eval_splits} for lang in langs} - queries = {lang: {split: None for split in eval_splits} for lang in langs} - relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs} + corpus = {lang: dict.fromkeys(eval_splits) for lang in langs} + queries = {lang: dict.fromkeys(eval_splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs} for lang in langs: lang_corpus = datasets.load_dataset( @@ -113,9 +113,9 @@ def load_neuclir_data_hard_negatives( revision: str | None = None, ): split = "test" - corpus = {lang: {split: None for split in eval_splits} for lang in langs} - queries = {lang: {split: None for split in eval_splits} for lang in langs} - relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs} + corpus = {lang: dict.fromkeys(eval_splits) for lang in langs} + queries = {lang: dict.fromkeys(eval_splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs} for lang in langs: corpus_identifier = f"corpus-{lang}" diff --git a/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py b/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py index 64e5646396..635de330b7 100644 --- a/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py @@ -65,9 +65,9 @@ def _load_webfaq_data( path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None ): - corpus = {lang: {split: None for split in splits} for lang in langs} - queries = {lang: {split: None for split in splits} for lang in langs} - relevant_docs = {lang: {split: None for split in splits} for lang in langs} + corpus = {lang: dict.fromkeys(splits) for lang in langs} + queries = {lang: dict.fromkeys(splits) for lang in langs} + relevant_docs = {lang: dict.fromkeys(splits) for lang in langs} split = _EVAL_SPLIT diff --git a/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py b/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py index f630009419..4dee4bad61 100644 --- a/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py @@ -54,7 +54,7 @@ def _load_xmarket_data( corpus[lang][split] = {row["_id"]: row for row in corpus_rows} queries[lang][split] = {row["_id"]: row["text"] for row in query_rows} relevant_docs[lang][split] = { - row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows + row["_id"]: dict.fromkeys(row["text"].split(" "), 1) for row in qrels_rows } corpus = datasets.DatasetDict(corpus) diff --git a/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py b/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py index 79e0f59d01..7685bc0bf7 100644 --- a/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py +++ b/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py @@ -83,7 +83,8 @@ def load_data(self, **kwargs): self.corpus = {"test": {row["_id"]: row for row in corpus_rows}} self.relevant_docs = { "test": { - row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows + row["_id"]: dict.fromkeys(row["text"].split(" "), 1) + for row in qrels_rows } } diff --git a/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py b/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py index f22739e2a1..31ada1167c 100644 --- a/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py +++ b/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py @@ -80,7 +80,8 @@ def load_data(self, **kwargs): self.corpus = {"test": {row["_id"]: row for row in corpus_rows}} self.relevant_docs = { "test": { - row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows + row["_id"]: dict.fromkeys(row["text"].split(" "), 1) + for row in qrels_rows } } diff --git a/pyproject.toml b/pyproject.toml index 89c221d1d0..e2cf0da12c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ mteb = "mteb.cli:main" [project.optional-dependencies] image = ["torchvision>0.2.1"] dev = [ -"ruff==0.9.7", # locked so we don't get PRs which fail only due to a lint update +"ruff==0.11.13", # locked so we don't get PRs which fail only due to a lint update "pytest>=8.3.4", "pytest-xdist>=3.6.1", "pytest-coverage>=0.0", diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index ffc892c44b..d584a98852 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -55,7 +55,7 @@ def test_validate_task_to_prompt_name(task_name: str | mteb.AbsTask): else: task_names = [task_name] - model_prompts = {task_name: "prompt_name" for task_name in task_names} + model_prompts = dict.fromkeys(task_names, "prompt_name") model_prompts |= {task_name + "-query": "prompt_name" for task_name in task_names} model_prompts |= {task_name + "-passage": "prompt_name" for task_name in task_names} model_prompts |= {