Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions mteb/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from __future__ import annotations

from mteb.cli import main

main()  # lets `python -m mteb` run the same CLI entry point as the `mteb` script
8 changes: 6 additions & 2 deletions mteb/evaluation/evaluators/RetrievalEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Any

import numpy as np
Expand Down Expand Up @@ -42,7 +43,7 @@ def __init__(
model: EncoderWithQueryCorpusEncode,
encode_kwargs: dict[str, Any] = {},
corpus_chunk_size: int = 50000,
previous_results: str | None = None,
previous_results: str | Path | None = None,
**kwargs: Any,
):
# Model is a class that provides encode_corpus() and encode_queries()
Expand All @@ -62,7 +63,10 @@ def __init__(
"dot": "Dot Product",
}
self.corpus_chunk_size = corpus_chunk_size
self.previous_results = previous_results
if isinstance(previous_results, Path):
self.previous_results = str(previous_results)
else:
self.previous_results = previous_results
self.batch_size = encode_kwargs.get("batch_size")
self.show_progress_bar = encode_kwargs.get("show_progress_bar")
self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False)
Expand Down
18 changes: 14 additions & 4 deletions mteb/models/openai_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,24 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
"Reducing embedding size available only for text-embedding-3-* models"
)

return self._to_numpy(
self._client.embeddings.create(
input=sentences,
max_batch_size = 2048
sublists = [
sentences[i : i + max_batch_size]
for i in range(0, len(sentences), max_batch_size)
]

all_embeddings = []

for sublist in sublists:
response = self._client.embeddings.create(
input=sublist,
model=self._model_name,
encoding_format="float",
dimensions=self._embed_dim or NotGiven(),
)
)
all_embeddings.extend(self._to_numpy(response))

return np.array(all_embeddings)

def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
return self.encode(queries, **kwargs)
Expand Down
7 changes: 3 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ homepage = "https://github.com/embeddings-benchmark/mteb"
mteb = "mteb.cli:main"

[project.optional-dependencies]
dev = ["ruff>=0.6.0", "pytest", "pytest-xdist", "pytest-coverage"]
dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update
"pytest", "pytest-xdist", "pytest-coverage"]
codecarbon = ["codecarbon"]
speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"]

Expand Down Expand Up @@ -97,10 +98,8 @@ select = [
"D", # formatting for docs
"UP", # upgrade to latest syntax if possible
"FA", # Future annotations
"C4", # cleaner comprehensions
"ISC",
"C4", # cleaner comprehensions
]
unfixable = ["ISC001"]


ignore = ["E501", # line too long
Expand Down
12 changes: 4 additions & 8 deletions tests/test_benchmark/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,14 @@
@pytest.mark.parametrize("tasks", [MOCK_TASK_TEST_GRID])
@pytest.mark.parametrize("model", [MockNumpyEncoder()])
def test_mulitple_mteb_tasks(
tasks: list[mteb.AbsTask], model: mteb.Encoder, monkeypatch
tasks: list[mteb.AbsTask], model: mteb.Encoder, tmp_path: Path
):
"""Test that multiple tasks can be run"""
eval = mteb.MTEB(tasks=tasks)
output_folder = "tests/results"
eval.run(model, output_folder=output_folder, overwrite_results=True)
eval.run(model, output_folder=str(tmp_path), overwrite_results=True)

tasks_dict = {task.metadata.name: task for task in tasks}
monkeypatch.setattr(
mteb, "get_task", lambda task_name, **kwargs: tasks_dict[task_name]
)
generate_readme(Path(output_folder))
# ensure that we can generate a readme from the output folder
generate_readme(tmp_path)


@pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID)
Expand Down
9 changes: 5 additions & 4 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import subprocess
import sys
from argparse import Namespace
from pathlib import Path

Expand All @@ -13,7 +14,7 @@


def test_available_tasks():
command = "mteb available_tasks"
command = f"{sys.executable} -m mteb available_tasks"
result = subprocess.run(command, shell=True, capture_output=True, text=True)
assert result.returncode == 0, "Command failed"
assert (
Expand Down Expand Up @@ -111,7 +112,7 @@ def test_create_meta():
), f"Value for {key} does not match"

# ensure that the command line interface works as well
command = f"mteb create_meta --results_folder {results} --output_path {output_path} --overwrite"
command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --overwrite"
result = subprocess.run(command, shell=True, capture_output=True, text=True)
assert result.returncode == 0, "Command failed"

Expand Down Expand Up @@ -172,13 +173,13 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name:
), f"Value for {key} does not match"
assert readme_output == gold_readme
# ensure that the command line interface works as well
command = f"mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite"
command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite"
result = subprocess.run(command, shell=True, capture_output=True, text=True)
assert result.returncode == 0, "Command failed"


def test_save_predictions():
command = "mteb run -m all-MiniLM-L6-v2 -t NFCorpus --output_folder tests/results --save_predictions"
command = f"{sys.executable} -m mteb run -m all-MiniLM-L6-v2 -t NFCorpus --output_folder tests/results --save_predictions"
result = subprocess.run(command, shell=True, capture_output=True, text=True)
assert result.returncode == 0, "Command failed"
test_folder = Path(__file__).parent
Expand Down
11 changes: 6 additions & 5 deletions tests/test_tasks/test_mteb_rerank.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
import logging
import os
from pathlib import Path

from sentence_transformers import CrossEncoder, SentenceTransformer

Expand All @@ -11,7 +11,7 @@
logging.basicConfig(level=logging.INFO)


def test_mteb_rerank():
def test_mteb_rerank(tmp_path: Path):
# Test that reranking works
# unfortunately, we need all the query ids to pretend to have this
scifact_keys = [
Expand Down Expand Up @@ -323,7 +323,8 @@ def test_mteb_rerank():
]
)
# create fake first stage results
with open("tmp.json", "w") as f:
tmp_file = tmp_path / "tmp.json"
with open(tmp_file, "w") as f:
f.write(
json.dumps(
{
Expand All @@ -344,10 +345,10 @@ def test_mteb_rerank():
overwrite_results=True,
eval_splits=["test"],
top_k=2,
previous_results="tmp.json",
previous_results=tmp_file,
save_predictions=True,
)
os.remove("tmp.json")
tmp_file.unlink()

# read in the results
with open("tests/results/SciFact_default_predictions.json") as f:
Expand Down