embeddings-benchmark · KennethEnevoldsen · Sep 9, 2024 · Sep 6, 2024 · Sep 6, 2024 · Sep 6, 2024
diff --git a/mteb/__main__.py b/mteb/__main__.py
@@ -0,0 +1,7 @@
+"""Entry point that lets the command line interface run as ``python -m mteb``."""
+
+from __future__ import annotations
+
+from mteb.cli import main
+
+main()
diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -5,6 +5,7 @@
 import logging
 import os
 from collections import defaultdict
+from pathlib import Path
 from typing import Any
 
 import numpy as np
@@ -42,7 +43,7 @@ def __init__(
         model: EncoderWithQueryCorpusEncode,
         encode_kwargs: dict[str, Any] = {},
         corpus_chunk_size: int = 50000,
-        previous_results: str | None = None,
+        previous_results: str | Path | None = None,
         **kwargs: Any,
     ):
         # Model is class that provides encode_corpus() and encode_queries()
@@ -62,7 +63,10 @@ def __init__(
             "dot": "Dot Product",
         }
         self.corpus_chunk_size = corpus_chunk_size
-        self.previous_results = previous_results
+        if isinstance(previous_results, Path):
+            self.previous_results = str(previous_results)
+        else:
+            self.previous_results = previous_results
         self.batch_size = encode_kwargs.get("batch_size")
         self.show_progress_bar = encode_kwargs.get("show_progress_bar")
         self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
@@ -31,14 +31,24 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
                 "Reducing embedding size available only for text-embedding-3-* models"
             )
 
-        return self._to_numpy(
-            self._client.embeddings.create(
-                input=sentences,
+        max_batch_size = 2048
+        sublists = [
+            sentences[i : i + max_batch_size]
+            for i in range(0, len(sentences), max_batch_size)
+        ]
+
+        all_embeddings = []
+
+        for sublist in sublists:
+            response = self._client.embeddings.create(
+                input=sublist,
                 model=self._model_name,
                 encoding_format="float",
                 dimensions=self._embed_dim or NotGiven(),
             )
-        )
+            all_embeddings.extend(self._to_numpy(response))
+
+        return np.array(all_embeddings)
 
     def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
         return self.encode(queries, **kwargs)

diff --git a/pyproject.toml b/pyproject.toml
@@ -52,7 +52,8 @@ homepage = "https://github.com/embeddings-benchmark/mteb"
 mteb = "mteb.cli:main"
 
 [project.optional-dependencies]
-dev = ["ruff>=0.6.0", "pytest", "pytest-xdist", "pytest-coverage"]
+dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update
+"pytest", "pytest-xdist", "pytest-coverage"]
 codecarbon = ["codecarbon"]
 speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"]
 
@@ -97,10 +98,8 @@ select = [
     "D",  # formatting for docs
     "UP", # upgrade to latest syntax if possible
     "FA", # Future annotations
-    "C4", # cleaner comprehensions 
-    "ISC",
+    "C4", # cleaner comprehensions
 ]
-unfixable = ["ISC001"]  
 
 
 ignore = ["E501",   # line too long 

diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py
@@ -29,18 +29,14 @@
 @pytest.mark.parametrize("tasks", [MOCK_TASK_TEST_GRID])
 @pytest.mark.parametrize("model", [MockNumpyEncoder()])
 def test_mulitple_mteb_tasks(
-    tasks: list[mteb.AbsTask], model: mteb.Encoder, monkeypatch
+    tasks: list[mteb.AbsTask], model: mteb.Encoder, tmp_path: Path
 ):
     """Test that multiple tasks can be run"""
     eval = mteb.MTEB(tasks=tasks)
-    output_folder = "tests/results"
-    eval.run(model, output_folder=output_folder, overwrite_results=True)
+    eval.run(model, output_folder=str(tmp_path), overwrite_results=True)
 
-    tasks_dict = {task.metadata.name: task for task in tasks}
-    monkeypatch.setattr(
-        mteb, "get_task", lambda task_name, **kwargs: tasks_dict[task_name]
-    )
-    generate_readme(Path(output_folder))
+    # ensure that we can generate a readme from the output folder
+    generate_readme(tmp_path)
 
 
 @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID)

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import subprocess
+import sys
 from argparse import Namespace
 from pathlib import Path
 
@@ -13,7 +14,7 @@
 
 
 def test_available_tasks():
-    command = "mteb available_tasks"
+    command = f"{sys.executable} -m mteb available_tasks"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
     assert (
@@ -111,7 +112,7 @@ def test_create_meta():
         ), f"Value for {key} does not match"
 
     # ensure that the command line interface works as well
-    command = f"mteb create_meta --results_folder {results} --output_path {output_path} --overwrite"
+    command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --overwrite"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
 
@@ -172,13 +173,13 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name:
         ), f"Value for {key} does not match"
     assert readme_output == gold_readme
     # ensure that the command line interface works as well
-    command = f"mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite"
+    command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
 
 
 def test_save_predictions():
-    command = "mteb run -m all-MiniLM-L6-v2 -t NFCorpus --output_folder tests/results --save_predictions"
+    command = f"{sys.executable} -m mteb run -m all-MiniLM-L6-v2 -t NFCorpus --output_folder tests/results --save_predictions"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
     test_folder = Path(__file__).parent

diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py
@@ -2,7 +2,7 @@
 
 import json
 import logging
-import os
+from pathlib import Path
 
 from sentence_transformers import CrossEncoder, SentenceTransformer
 
@@ -11,7 +11,7 @@
 logging.basicConfig(level=logging.INFO)
 
 
-def test_mteb_rerank():
+def test_mteb_rerank(tmp_path: Path):
     # Test that reranking works
     # unfortunately, we need all the query ids to pretend to have this
     scifact_keys = [
@@ -323,7 +323,8 @@ def test_mteb_rerank():
         ]
     )
     # create fake first stage results
-    with open("tmp.json", "w") as f:
+    tmp_file = tmp_path / "tmp.json"
+    with open(tmp_file, "w") as f:
         f.write(
             json.dumps(
                 {
@@ -344,10 +345,10 @@ def test_mteb_rerank():
         overwrite_results=True,
         eval_splits=["test"],
         top_k=2,
-        previous_results="tmp.json",
+        previous_results=tmp_file,
         save_predictions=True,
     )
-    os.remove("tmp.json")
+    tmp_file.unlink()
 
     # read in the results
     with open("tests/results/SciFact_default_predictions.json") as f: