12 changes: 6 additions & 6 deletions mteb/abstasks/AbsTaskClusteringFast.py
@@ -320,9 +320,9 @@ def convert_to_fast(
 
            # check that it is the same distribution
            row_label_set = set(lab)
-           assert row_label_set.issubset(
-               all_labels_set
-           ), "The clusters are not sampled from the same distribution as they have different labels."
+           assert row_label_set.issubset(all_labels_set), (
+               "The clusters are not sampled from the same distribution as they have different labels."
+           )
 
            for l, s in zip(lab, sents):
                if s not in sent_set:
@@ -353,6 +353,6 @@ def check_label_distribution(ds: DatasetDict) -> None:
 
        # check that it is the same distribution
        row_label_set = set(lab)
-       assert row_label_set.issubset(
-           all_labels_set
-       ), "The clusters are not sampled from the same distribution as they have different labels."
+       assert row_label_set.issubset(all_labels_set), (
+           "The clusters are not sampled from the same distribution as they have different labels."
+       )
6 changes: 3 additions & 3 deletions mteb/abstasks/AbsTaskInstructionRetrieval.py
@@ -379,9 +379,9 @@ def load_data(self, **kwargs):
                doc["id"]: {"title": doc["title"], "text": doc["text"]}
                for doc in corpus
            }
-           assert (
-               len(top_ranked) == len(queries)
-           ), f"Top ranked not loaded properly! Expected {len(self.queries)} but got {len(self.top_ranked)}."
+           assert len(top_ranked) == len(queries), (
+               f"Top ranked not loaded properly! Expected {len(self.queries)} but got {len(self.top_ranked)}."
+           )
 
            (
                self.corpus[split],
6 changes: 3 additions & 3 deletions mteb/evaluation/MTEB.py
@@ -71,9 +71,9 @@ def __init__(
        if isinstance(tasks[0], Benchmark):
            self.benchmarks = tasks
            self._tasks = self._tasks = list(chain.from_iterable(tasks))  # type: ignore
-           assert (
-               task_types is None and task_categories is None
-           ), "Cannot specify both `tasks` and `task_types`/`task_categories`"
+           assert task_types is None and task_categories is None, (
+               "Cannot specify both `tasks` and `task_types`/`task_categories`"
+           )
        else:
            self._task_types = task_types
            self._task_categories = task_categories
6 changes: 3 additions & 3 deletions mteb/models/rerankers_custom.py
@@ -85,9 +85,9 @@ def predict(self, input_to_rerank, **kwargs):
        assert len(queries) == len(passages)
        query_passage_tuples = list(zip(queries, passages))
        scores = self.model.compute_score(query_passage_tuples, normalize=True)
-       assert len(scores) == len(
-           queries
-       ), f"Expected {len(queries)} scores, got {len(scores)}"
+       assert len(scores) == len(queries), (
+           f"Expected {len(queries)} scores, got {len(scores)}"
+       )
        return scores
 
 
12 changes: 6 additions & 6 deletions mteb/tasks/BitextMining/vie/VieMedEVBitextMining.py
@@ -54,19 +54,19 @@ def dataset_transform(self):
        # Pairs are in two halves
        en_sentences = all_texts[:mid_index]
        vie_sentences = all_texts[mid_index:]
-       assert len(en_sentences) == len(
-           vie_sentences
-       ), "The split does not result in equal halves."
+       assert len(en_sentences) == len(vie_sentences), (
+           "The split does not result in equal halves."
+       )
 
        # Downsample
        indices = list(range(len(en_sentences)))
        random.shuffle(indices)
        sample_indices = indices[:TEST_SAMPLES]
        en_sentences = [en_sentences[i] for i in sample_indices]
        vie_sentences = [vie_sentences[i] for i in sample_indices]
-       assert (
-           len(en_sentences) == len(vie_sentences) == TEST_SAMPLES
-       ), f"Exceeded {TEST_SAMPLES} samples for 'test' split."
+       assert len(en_sentences) == len(vie_sentences) == TEST_SAMPLES, (
+           f"Exceeded {TEST_SAMPLES} samples for 'test' split."
+       )
 
        # Return dataset
        ds["test"] = datasets.Dataset.from_dict(
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -54,7 +54,7 @@ mteb = "mteb.cli:main"
 
 [project.optional-dependencies]
 dev = [
-    "ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update
+    "ruff==0.9.7", # locked so we don't get PRs which fail only due to a lint update
     "pytest>=8.3.4",
     "pytest-xdist>=3.6.1",
    "pytest-coverage>=0.0",
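This version bump is what drives every other hunk in the diff: the ruff 0.9 formatter keeps an assert condition on one line and wraps a long message in parentheses, where 0.6 instead parenthesized and split the condition. A minimal illustrative sketch of the two layouts (example code only, not taken from the repository); re-running `ruff format` under the pinned version should reproduce the layout seen in the remaining files.

```python
# Illustrative example of the formatter change, not code from the mteb repository.
values = {"answer": 42}

# Layout produced by the older formatter (ruff 0.6.x): the condition is
# parenthesized and split across lines, with the message trailing the ")".
assert (
    isinstance(values, dict) and len(values) > 0
), "values must be a non-empty dict"

# Layout produced by the newer formatter (ruff 0.9.x): the condition stays on
# one line and the message is wrapped in parentheses instead.
assert isinstance(values, dict) and len(values) > 0, (
    "values must be a non-empty dict"
)
```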
6 changes: 3 additions & 3 deletions scripts/running_model/check_results.py
@@ -174,9 +174,9 @@ def normalize_results(results):
 # [t.task_name for t in mteb_results['GritLM/GritLM-7B']["13f00a0e36500c80ce12870ea513846a066004af"] if t.task_name == "SemRel24STS"]
 # it is there
 
-assert [
-    len(revisions.keys()) == 1 for model, revisions in mteb_results.items()
-], "Some models have more than one revision"
+assert [len(revisions.keys()) == 1 for model, revisions in mteb_results.items()], (
+    "Some models have more than one revision"
+)
 
 results_df = results_to_dataframe(mteb_results)
 
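One note on the reformatted assertion above (pre-existing behavior, not changed by this PR): asserting a list comprehension only checks that the list is non-empty, so the "more than one revision" message can never fire. A small sketch of the difference, using hypothetical placeholder data:

```python
# Hypothetical stand-in for mteb_results: model name -> {revision: results}.
mteb_results = {"model-a": {"rev-1": []}, "model-b": {"rev-1": [], "rev-2": []}}

# The list-comprehension form builds [True, False]; any non-empty list is
# truthy, so this assertion passes even though model-b has two revisions.
assert [len(revisions.keys()) == 1 for model, revisions in mteb_results.items()]

# all(...) expresses the intended per-model check; here it correctly detects
# that model-b has more than one revision, so the negated assert passes.
assert not all(len(revisions) == 1 for revisions in mteb_results.values())
```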
2 changes: 1 addition & 1 deletion scripts/task_selection/create_main_results_table.ipynb
@@ -62,7 +62,7 @@
     " results = results.sort_values(\"Borda Count\", ascending=False)\n",
     " # borda str: 1 ({borda count}) 2 ({borda count}) 3 ({borda count}) ...\n",
     " results[\"Borda str\"] = [\n",
-    " f\"{i+1} ({int(borda_count)})\"\n",
+    " f\"{i + 1} ({int(borda_count)})\"\n",
     " for i, borda_count in enumerate(results[\"Borda Count\"].to_list())\n",
     " ]\n",
     "\n",
36 changes: 18 additions & 18 deletions tests/test_TaskMetadata.py
@@ -516,17 +516,17 @@ def test_disallow_trust_remote_code_in_new_datasets():
        "SwednClusteringS2S",
    ]
 
-   assert (
-       135 == len(exceptions)
-   ), "The number of exceptions has changed. Please do not add new datasets to this list."
+   assert 135 == len(exceptions), (
+       "The number of exceptions has changed. Please do not add new datasets to this list."
+   )
 
    exceptions = []
 
    for task in get_tasks():
        if task.metadata.dataset.get("trust_remote_code", False):
-           assert (
-               task.metadata.name not in exceptions
-           ), f"Dataset {task.metadata.name} should not trust remote code"
+           assert task.metadata.name not in exceptions, (
+               f"Dataset {task.metadata.name} should not trust remote code"
+           )
 
 
 def test_empy_descriptive_stat_in_new_datasets():
@@ -1088,26 +1088,26 @@ def test_empy_descriptive_stat_in_new_datasets():
        "SummEvalFrSummarization.v2",
    ]
 
-   assert (
-       553 == len(exceptions)
-   ), "The number of exceptions has changed. Please do not add new datasets to this list."
+   assert 553 == len(exceptions), (
+       "The number of exceptions has changed. Please do not add new datasets to this list."
+   )
 
    exceptions = []
 
    for task in get_tasks():
        if task.metadata.descriptive_stats is None:
-           assert (
-               task.metadata.name not in exceptions
-           ), f"Dataset {task.metadata.name} should have descriptive stats"
+           assert task.metadata.name not in exceptions, (
+               f"Dataset {task.metadata.name} should have descriptive stats"
+           )
 
 
 @pytest.mark.parametrize("task", get_tasks())
 def test_eval_langs_correctly_specified(task: AbsTask):
    if task.is_multilingual:
-       assert isinstance(
-           task.metadata.eval_langs, dict
-       ), f"{task.metadata.name} should have eval_langs as a dict"
+       assert isinstance(task.metadata.eval_langs, dict), (
+           f"{task.metadata.name} should have eval_langs as a dict"
+       )
    else:
-       assert isinstance(
-           task.metadata.eval_langs, list
-       ), f"{task.metadata.name} should have eval_langs as a list"
+       assert isinstance(task.metadata.eval_langs, list), (
+           f"{task.metadata.name} should have eval_langs as a list"
+       )
36 changes: 18 additions & 18 deletions tests/test_cli.py
@@ -17,18 +17,18 @@ def test_available_tasks():
    command = f"{sys.executable} -m mteb available_tasks"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    assert result.returncode == 0, "Command failed"
-   assert (
-       "Banking77Classification" in result.stdout
-   ), "Sample task Banking77Classification task not found in available tasks"
+   assert "Banking77Classification" in result.stdout, (
+       "Sample task Banking77Classification task not found in available tasks"
+   )
 
 
 def test_available_benchmarks():
    command = f"{sys.executable} -m mteb available_benchmarks"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    assert result.returncode == 0, "Command failed"
-   assert (
-       "MTEB(eng, v1)" in result.stdout
-   ), "Sample benchmark MTEB(eng, v1) task not found in available benchmarks"
+   assert "MTEB(eng, v1)" in result.stdout, (
+       "Sample benchmark MTEB(eng, v1) task not found in available benchmarks"
+   )
 
 
 run_task_fixures = [
@@ -75,12 +75,12 @@ def test_run_task(
        f"tests/results/test_model/{model_name_as_path}/{model_revision}"
    )
    assert results_path.exists(), "Output folder not created"
-   assert "model_meta.json" in [
-       f.name for f in list(results_path.glob("*.json"))
-   ], "model_meta.json not found in output folder"
-   assert f"{task_name}.json" in [
-       f.name for f in list(results_path.glob("*.json"))
-   ], f"{task_name} not found in output folder"
+   assert "model_meta.json" in [f.name for f in list(results_path.glob("*.json"))], (
+       "model_meta.json not found in output folder"
+   )
+   assert f"{task_name}.json" in [f.name for f in list(results_path.glob("*.json"))], (
+       f"{task_name} not found in output folder"
+   )
 
 
 def test_create_meta():
@@ -117,9 +117,9 @@ def test_create_meta():
    for key in frontmatter_gold:
        assert key in frontmatter, f"Key {key} not found in output"
 
-       assert (
-           frontmatter[key] == frontmatter_gold[key]
-       ), f"Value for {key} does not match"
+       assert frontmatter[key] == frontmatter_gold[key], (
+           f"Value for {key} does not match"
+       )
 
    # ensure that the command line interface works as well
    command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --overwrite"
@@ -178,9 +178,9 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name:
    for key in frontmatter_gold:
        assert key in frontmatter, f"Key {key} not found in output"
 
-       assert (
-           frontmatter[key] == frontmatter_gold[key]
-       ), f"Value for {key} does not match"
+       assert frontmatter[key] == frontmatter_gold[key], (
+           f"Value for {key} does not match"
+       )
    assert readme_output == gold_readme
    # ensure that the command line interface works as well
    command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite"
6 changes: 3 additions & 3 deletions tests/test_tasks/test_all_abstasks.py
@@ -117,6 +117,6 @@ def test_superseded_dataset_exists():
    tasks = mteb.get_tasks(exclude_superseded=False)
    for task in tasks:
        if task.superseded_by:
-           assert (
-               task.superseded_by in TASKS_REGISTRY
-           ), f"{task} is superseded by {task.superseded_by} but {task.superseded_by} is not in the TASKS_REGISTRY"
+           assert task.superseded_by in TASKS_REGISTRY, (
+               f"{task} is superseded by {task.superseded_by} but {task.superseded_by} is not in the TASKS_REGISTRY"
+           )