From 9e579d9bf6912801c580e4693e34034ccb448404 Mon Sep 17 00:00:00 2001
From: HSILA <a.shiraee@gmail.com>
Date: Fri, 6 Sep 2024 10:34:10 -0400
Subject: [PATCH 1/7] fix: OpenAI BadRequestError by limiting input dimensions
 to 2048 elements (#1201)

Fix OpenAI BadRequestError by splitting the input into batches of at most 2048 elements

- Ensure each batch of 'sentences' passed to the OpenAI API in a single request does not exceed 2048 elements
- Reference: OpenAI's Embedding API documentation on input limits

Co-authored-by: Ali Shiraee <ShiraeA@basfad.basf.net>
---
 mteb/models/openai_models.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 03de44ba3c..a901a1e688 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -31,14 +31,22 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
                 "Reducing embedding size available only for text-embedding-3-* models"
             )
 
-        return self._to_numpy(
-            self._client.embeddings.create(
-                input=sentences,
+        max_batch_size = 2048
+        sublists = [sentences[i:i + max_batch_size]
+                    for i in range(0, len(sentences), max_batch_size)]
+
+        all_embeddings = []
+
+        for sublist in sublists:
+            response = self._client.embeddings.create(
+                input=sublist,
                 model=self._model_name,
                 encoding_format="float",
                 dimensions=self._embed_dim or NotGiven(),
             )
-        )
+            all_embeddings.extend(self._to_numpy(response))
+
+        return np.array(all_embeddings)
 
     def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
         return self.encode(queries, **kwargs)

From 6c44c3e8a15dcd853c1af88222fda50efe361de8 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Fri, 6 Sep 2024 17:04:09 +0200
Subject: [PATCH 2/7] fix ruff formatting

---
 pyproject.toml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 57f8cce332..202cd0d685 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,8 @@ homepage = "https://github.com/embeddings-benchmark/mteb"
 mteb = "mteb.cli:main"
 
 [project.optional-dependencies]
-dev = ["ruff>=0.6.0", "pytest", "pytest-xdist", "pytest-coverage"]
+dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update
+"pytest", "pytest-xdist", "pytest-coverage"]
 codecarbon = ["codecarbon"]
 speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"]
 
@@ -97,10 +98,8 @@ select = [
     "D",  # formatting for docs
     "UP", # upgrade to latest syntax if possible
     "FA", # Future annotations
-    "C4", # cleaner comprehensions 
-    "ISC",
+    "C4", # cleaner comprehensions
 ]
-unfixable = ["ISC001"]  
 
 
 ignore = ["E501",   # line too long 

From fa83cfd62e9fe8898320c164109bf44c49779db2 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Fri, 6 Sep 2024 17:21:48 +0200
Subject: [PATCH 3/7] Added minor test fixes to ensure reproducibility across
 systems

---
 mteb/__main__.py                       |  5 +++++
 tests/test_benchmark/test_benchmark.py | 13 ++++---------
 tests/test_cli.py                      |  9 +++++----
 3 files changed, 14 insertions(+), 13 deletions(-)
 create mode 100644 mteb/__main__.py

diff --git a/mteb/__main__.py b/mteb/__main__.py
new file mode 100644
index 0000000000..709f6d4345
--- /dev/null
+++ b/mteb/__main__.py
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+from mteb.cli import main
+
+main()
diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py
index 9b86a2bd94..d3fe16e471 100644
--- a/tests/test_benchmark/test_benchmark.py
+++ b/tests/test_benchmark/test_benchmark.py
@@ -29,18 +29,13 @@
 @pytest.mark.parametrize("tasks", [MOCK_TASK_TEST_GRID])
 @pytest.mark.parametrize("model", [MockNumpyEncoder()])
 def test_mulitple_mteb_tasks(
-    tasks: list[mteb.AbsTask], model: mteb.Encoder, monkeypatch
-):
+    tasks: list[mteb.AbsTask], model: mteb.Encoder, tmp_path: Path):
     """Test that multiple tasks can be run"""
     eval = mteb.MTEB(tasks=tasks)
-    output_folder = "tests/results"
-    eval.run(model, output_folder=output_folder, overwrite_results=True)
+    eval.run(model, output_folder=str(tmp_path), overwrite_results=True)
 
-    tasks_dict = {task.metadata.name: task for task in tasks}
-    monkeypatch.setattr(
-        mteb, "get_task", lambda task_name, **kwargs: tasks_dict[task_name]
-    )
-    generate_readme(Path(output_folder))
+    # ensure that we can generate a readme from the output folder
+    generate_readme(tmp_path)
 
 
 @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 1d4f8c5ded..ee3677c88e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import subprocess
+import sys
 from argparse import Namespace
 from pathlib import Path
 
@@ -13,7 +14,7 @@
 
 
 def test_available_tasks():
-    command = "mteb available_tasks"
+    command = "{sys.executable} -m mteb available_tasks"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
     assert (
@@ -111,7 +112,7 @@ def test_create_meta():
         ), f"Value for {key} does not match"
 
     # ensure that the command line interface works as well
-    command = f"mteb create_meta --results_folder {results} --output_path {output_path} --overwrite"
+    command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --overwrite"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
 
@@ -172,13 +173,13 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name:
         ), f"Value for {key} does not match"
     assert readme_output == gold_readme
     # ensure that the command line interface works as well
-    command = f"mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite"
+    command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
 
 
 def test_save_predictions():
-    command = "mteb run -m all-MiniLM-L6-v2 -t NFCorpus --output_folder tests/results --save_predictions"
+    command = f"{sys.executable} -m mteb run -m all-MiniLM-L6-v2 -t NFCorpus --output_folder tests/results --save_predictions"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
     test_folder = Path(__file__).parent

From 15adc7d4f52d54ce0a0671cf436e8ace05ed3fcd Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Fri, 6 Sep 2024 17:38:48 +0200
Subject: [PATCH 4/7] Ensure that tmp.json is not created within repo when
 running tests

---
 mteb/evaluation/evaluators/RetrievalEvaluator.py |  5 +++--
 tests/test_cli.py                                |  2 +-
 tests/test_tasks/test_mteb_rerank.py             | 11 ++++++-----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py
index 2ea70cb9bd..1a8add4c24 100644
--- a/mteb/evaluation/evaluators/RetrievalEvaluator.py
+++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -5,6 +5,7 @@
 import logging
 import os
 from collections import defaultdict
+from pathlib import Path
 from typing import Any
 
 import numpy as np
@@ -42,7 +43,7 @@ def __init__(
         model: EncoderWithQueryCorpusEncode,
         encode_kwargs: dict[str, Any] = {},
         corpus_chunk_size: int = 50000,
-        previous_results: str | None = None,
+        previous_results: str | Path | None = None,
         **kwargs: Any,
     ):
         # Model is class that provides encode_corpus() and encode_queries()
@@ -62,7 +63,7 @@ def __init__(
             "dot": "Dot Product",
         }
         self.corpus_chunk_size = corpus_chunk_size
-        self.previous_results = previous_results
+        self.previous_results = str(previous_results)
         self.batch_size = encode_kwargs.get("batch_size")
         self.show_progress_bar = encode_kwargs.get("show_progress_bar")
         self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index ee3677c88e..fdcd1b014a 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -14,7 +14,7 @@
 
 
 def test_available_tasks():
-    command = "{sys.executable} -m mteb available_tasks"
+    command = f"{sys.executable} -m mteb available_tasks"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     assert result.returncode == 0, "Command failed"
     assert (
diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py
index 6920769694..78ad03c8fa 100644
--- a/tests/test_tasks/test_mteb_rerank.py
+++ b/tests/test_tasks/test_mteb_rerank.py
@@ -2,7 +2,7 @@
 
 import json
 import logging
-import os
+from pathlib import Path
 
 from sentence_transformers import CrossEncoder, SentenceTransformer
 
@@ -11,7 +11,7 @@
 logging.basicConfig(level=logging.INFO)
 
 
-def test_mteb_rerank():
+def test_mteb_rerank(tmp_path: Path):
     # Test that reranking works
     # unfortunately, we need all the query ids to pretend to have this
     scifact_keys = [
@@ -323,7 +323,8 @@ def test_mteb_rerank():
         ]
     )
     # create fake first stage results
-    with open("tmp.json", "w") as f:
+    tmp_file = tmp_path / "tmp.json"
+    with open(tmp_file, "w") as f:
         f.write(
             json.dumps(
                 {
@@ -344,10 +345,10 @@ def test_mteb_rerank():
         overwrite_results=True,
         eval_splits=["test"],
         top_k=2,
-        previous_results="tmp.json",
+        previous_results=tmp_file,
         save_predictions=True,
     )
-    os.remove("tmp.json")
+    tmp_file.unlink()
 
     # read in the results
     with open("tests/results/SciFact_default_predictions.json") as f:

From cc7231bdf97c7021e1a346750a32caa276b60f11 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Fri, 6 Sep 2024 17:50:57 +0200
Subject: [PATCH 5/7] format

---
 mteb/models/openai_models.py           | 6 ++++--
 tests/test_benchmark/test_benchmark.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index a901a1e688..4e6faf3fbc 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -32,8 +32,10 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
             )
 
         max_batch_size = 2048
-        sublists = [sentences[i:i + max_batch_size]
-                    for i in range(0, len(sentences), max_batch_size)]
+        sublists = [
+            sentences[i : i + max_batch_size]
+            for i in range(0, len(sentences), max_batch_size)
+        ]
 
         all_embeddings = []
 
diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py
index d3fe16e471..3d32d923bc 100644
--- a/tests/test_benchmark/test_benchmark.py
+++ b/tests/test_benchmark/test_benchmark.py
@@ -29,7 +29,8 @@
 @pytest.mark.parametrize("tasks", [MOCK_TASK_TEST_GRID])
 @pytest.mark.parametrize("model", [MockNumpyEncoder()])
 def test_mulitple_mteb_tasks(
-    tasks: list[mteb.AbsTask], model: mteb.Encoder, tmp_path: Path):
+    tasks: list[mteb.AbsTask], model: mteb.Encoder, tmp_path: Path
+):
     """Test that multiple tasks can be run"""
     eval = mteb.MTEB(tasks=tasks)
     eval.run(model, output_folder=str(tmp_path), overwrite_results=True)

From 0208fc64e279ce71fd69ef42f870d61d979372d5 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Fri, 6 Sep 2024 18:41:45 +0200
Subject: [PATCH 6/7] fixes path issues

---
 mteb/evaluation/evaluators/RetrievalEvaluator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py
index 1a8add4c24..b0d7960312 100644
--- a/mteb/evaluation/evaluators/RetrievalEvaluator.py
+++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -63,7 +63,10 @@ def __init__(
             "dot": "Dot Product",
         }
         self.corpus_chunk_size = corpus_chunk_size
-        self.previous_results = str(previous_results)
+        if isinstance(previous_results, Path):
+            self.previous_results = str(previous_results)
+        else:
+            self.previous_results = previous_results
         self.batch_size = encode_kwargs.get("batch_size")
         self.show_progress_bar = encode_kwargs.get("show_progress_bar")
         self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False)

From 85c1a1bdc8bbc6157f7ad6f61f6f839cc111ec26 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Mon, 9 Sep 2024 13:28:26 +0200
Subject: [PATCH 7/7] Rerun CI