From 9e579d9bf6912801c580e4693e34034ccb448404 Mon Sep 17 00:00:00 2001 From: HSILA Date: Fri, 6 Sep 2024 10:34:10 -0400 Subject: [PATCH 1/7] fix: OpenAI BadRequestError by limiting input dimensions to 2048 elements (#1201) Fix OpenAI BadRequestError by limiting input dimensions to 2048 elements - Ensure the 'sentences' list passed to OpenAI API does not exceed 2048 elements - Reference: OpenAI's Embedding API documentation on input limits Co-authored-by: Ali Shiraee --- mteb/models/openai_models.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 03de44ba3c..a901a1e688 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -31,14 +31,22 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: "Reducing embedding size available only for text-embedding-3-* models" ) - return self._to_numpy( - self._client.embeddings.create( - input=sentences, + max_batch_size = 2048 + sublists = [sentences[i:i + max_batch_size] + for i in range(0, len(sentences), max_batch_size)] + + all_embeddings = [] + + for sublist in sublists: + response = self._client.embeddings.create( + input=sublist, model=self._model_name, encoding_format="float", dimensions=self._embed_dim or NotGiven(), ) - ) + all_embeddings.extend(self._to_numpy(response)) + + return np.array(all_embeddings) def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: return self.encode(queries, **kwargs) From 6c44c3e8a15dcd853c1af88222fda50efe361de8 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 6 Sep 2024 17:04:09 +0200 Subject: [PATCH 2/7] fix ruff formatting --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 57f8cce332..202cd0d685 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,8 @@ homepage = "https://github.com/embeddings-benchmark/mteb" mteb = 
"mteb.cli:main" [project.optional-dependencies] -dev = ["ruff>=0.6.0", "pytest", "pytest-xdist", "pytest-coverage"] +dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update +"pytest", "pytest-xdist", "pytest-coverage"] codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] @@ -97,10 +98,8 @@ select = [ "D", # formatting for docs "UP", # upgrade to latest syntax if possible "FA", # Future annotations - "C4", # cleaner comprehensions - "ISC", + "C4", # cleaner comprehensions ] -unfixable = ["ISC001"] ignore = ["E501", # line too long From fa83cfd62e9fe8898320c164109bf44c49779db2 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 6 Sep 2024 17:21:48 +0200 Subject: [PATCH 3/7] Added minor test fixes to ensure reproducibility across systems --- mteb/__main__.py | 5 +++++ tests/test_benchmark/test_benchmark.py | 13 ++++--------- tests/test_cli.py | 9 +++++---- 3 files changed, 14 insertions(+), 13 deletions(-) create mode 100644 mteb/__main__.py diff --git a/mteb/__main__.py b/mteb/__main__.py new file mode 100644 index 0000000000..709f6d4345 --- /dev/null +++ b/mteb/__main__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from mteb.cli import main + +main() diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 9b86a2bd94..d3fe16e471 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -29,18 +29,13 @@ @pytest.mark.parametrize("tasks", [MOCK_TASK_TEST_GRID]) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) def test_mulitple_mteb_tasks( - tasks: list[mteb.AbsTask], model: mteb.Encoder, monkeypatch -): + tasks: list[mteb.AbsTask], model: mteb.Encoder, tmp_path: Path): """Test that multiple tasks can be run""" eval = mteb.MTEB(tasks=tasks) - output_folder = "tests/results" - eval.run(model, output_folder=output_folder, overwrite_results=True) + eval.run(model, output_folder=str(tmp_path), 
overwrite_results=True) - tasks_dict = {task.metadata.name: task for task in tasks} - monkeypatch.setattr( - mteb, "get_task", lambda task_name, **kwargs: tasks_dict[task_name] - ) - generate_readme(Path(output_folder)) + # ensure that we can generate a readme from the output folder + generate_readme(tmp_path) @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID) diff --git a/tests/test_cli.py b/tests/test_cli.py index 1d4f8c5ded..ee3677c88e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,6 +3,7 @@ from __future__ import annotations import subprocess +import sys from argparse import Namespace from pathlib import Path @@ -13,7 +14,7 @@ def test_available_tasks(): - command = "mteb available_tasks" + command = "{sys.executable} -m mteb available_tasks" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" assert ( @@ -111,7 +112,7 @@ def test_create_meta(): ), f"Value for {key} does not match" # ensure that the command line interface works as well - command = f"mteb create_meta --results_folder {results} --output_path {output_path} --overwrite" + command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --overwrite" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" @@ -172,13 +173,13 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name: ), f"Value for {key} does not match" assert readme_output == gold_readme # ensure that the command line interface works as well - command = f"mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite" + command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert 
result.returncode == 0, "Command failed" def test_save_predictions(): - command = "mteb run -m all-MiniLM-L6-v2 -t NFCorpus --output_folder tests/results --save_predictions" + command = f"{sys.executable} -m mteb run -m all-MiniLM-L6-v2 -t NFCorpus --output_folder tests/results --save_predictions" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" test_folder = Path(__file__).parent From 15adc7d4f52d54ce0a0671cf436e8ace05ed3fcd Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 6 Sep 2024 17:38:48 +0200 Subject: [PATCH 4/7] Ensure that tmp.json is not created within repo when running tests --- mteb/evaluation/evaluators/RetrievalEvaluator.py | 5 +++-- tests/test_cli.py | 2 +- tests/test_tasks/test_mteb_rerank.py | 11 ++++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 2ea70cb9bd..1a8add4c24 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -5,6 +5,7 @@ import logging import os from collections import defaultdict +from pathlib import Path from typing import Any import numpy as np @@ -42,7 +43,7 @@ def __init__( model: EncoderWithQueryCorpusEncode, encode_kwargs: dict[str, Any] = {}, corpus_chunk_size: int = 50000, - previous_results: str | None = None, + previous_results: str | Path | None = None, **kwargs: Any, ): # Model is class that provides encode_corpus() and encode_queries() @@ -62,7 +63,7 @@ def __init__( "dot": "Dot Product", } self.corpus_chunk_size = corpus_chunk_size - self.previous_results = previous_results + self.previous_results = str(previous_results) self.batch_size = encode_kwargs.get("batch_size") self.show_progress_bar = encode_kwargs.get("show_progress_bar") self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False) diff --git a/tests/test_cli.py 
b/tests/test_cli.py index ee3677c88e..fdcd1b014a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -14,7 +14,7 @@ def test_available_tasks(): - command = "{sys.executable} -m mteb available_tasks" + command = f"{sys.executable} -m mteb available_tasks" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" assert ( diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 6920769694..78ad03c8fa 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -2,7 +2,7 @@ import json import logging -import os +from pathlib import Path from sentence_transformers import CrossEncoder, SentenceTransformer @@ -11,7 +11,7 @@ logging.basicConfig(level=logging.INFO) -def test_mteb_rerank(): +def test_mteb_rerank(tmp_path: Path): # Test that reranking works # unfortunately, we need all the query ids to pretend to have this scifact_keys = [ @@ -323,7 +323,8 @@ def test_mteb_rerank(): ] ) # create fake first stage results - with open("tmp.json", "w") as f: + tmp_file = tmp_path / "tmp.json" + with open(tmp_file, "w") as f: f.write( json.dumps( { @@ -344,10 +345,10 @@ def test_mteb_rerank(): overwrite_results=True, eval_splits=["test"], top_k=2, - previous_results="tmp.json", + previous_results=tmp_file, save_predictions=True, ) - os.remove("tmp.json") + tmp_file.unlink() # read in the results with open("tests/results/SciFact_default_predictions.json") as f: From cc7231bdf97c7021e1a346750a32caa276b60f11 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 6 Sep 2024 17:50:57 +0200 Subject: [PATCH 5/7] format --- mteb/models/openai_models.py | 6 ++++-- tests/test_benchmark/test_benchmark.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index a901a1e688..4e6faf3fbc 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ 
-32,8 +32,10 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: ) max_batch_size = 2048 - sublists = [sentences[i:i + max_batch_size] - for i in range(0, len(sentences), max_batch_size)] + sublists = [ + sentences[i : i + max_batch_size] + for i in range(0, len(sentences), max_batch_size) + ] all_embeddings = [] diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index d3fe16e471..3d32d923bc 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -29,7 +29,8 @@ @pytest.mark.parametrize("tasks", [MOCK_TASK_TEST_GRID]) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) def test_mulitple_mteb_tasks( - tasks: list[mteb.AbsTask], model: mteb.Encoder, tmp_path: Path): + tasks: list[mteb.AbsTask], model: mteb.Encoder, tmp_path: Path +): """Test that multiple tasks can be run""" eval = mteb.MTEB(tasks=tasks) eval.run(model, output_folder=str(tmp_path), overwrite_results=True) From 0208fc64e279ce71fd69ef42f870d61d979372d5 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 6 Sep 2024 18:41:45 +0200 Subject: [PATCH 6/7] fixes path issues --- mteb/evaluation/evaluators/RetrievalEvaluator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 1a8add4c24..b0d7960312 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -63,7 +63,10 @@ def __init__( "dot": "Dot Product", } self.corpus_chunk_size = corpus_chunk_size - self.previous_results = str(previous_results) + if isinstance(previous_results, Path): + self.previous_results = str(previous_results) + else: + self.previous_results = previous_results self.batch_size = encode_kwargs.get("batch_size") self.show_progress_bar = encode_kwargs.get("show_progress_bar") self.save_corpus_embeddings = 
kwargs.get("save_corpus_embeddings", False) From 85c1a1bdc8bbc6157f7ad6f61f6f839cc111ec26 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 9 Sep 2024 13:28:26 +0200 Subject: [PATCH 7/7] Rerun CI