diff --git a/Makefile b/Makefile index 3c1f737fee..463b66548d 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,12 @@ install: @echo "--- 🚀 Installing project dependencies ---" - pip install -e ".[dev,docs]" + pip install -e ".[dev,docs,image]" pre-commit install install-for-tests: @echo "--- 🚀 Installing project dependencies for test ---" @echo "This ensures that the project is not installed in editable mode" - pip install ".[dev,speedtask,bm25s,pylate]" + pip install ".[dev,speedtask,bm25s,pylate,image]" lint: @echo "--- 🧹 Running linters ---" diff --git a/README.md b/README.md index dc83dd1b46..ebcc3e1a26 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@

- +

@@ -36,9 +36,11 @@ pip install mteb ``` + ## Example Usage -* Using a Python script: + +### Using a script ```python import mteb @@ -53,42 +55,10 @@ evaluation = mteb.MTEB(tasks=tasks) results = evaluation.run(model, output_folder=f"results/{model_name}") ``` -
- Running SentenceTransformer model with prompts - -Prompts can be passed to the SentenceTransformer model using the `prompts` parameter. The following code shows how to use prompts with SentenceTransformer: - -```python -from sentence_transformers import SentenceTransformer - - -model = SentenceTransformer("average_word_embeddings_komninos", prompts={"query": "Query:", "passage": "Passage:"}) -evaluation = mteb.MTEB(tasks=tasks) -``` - -In prompts the key can be: -1. Prompt types (`passage`, `query`) - they will be used in reranking and retrieval tasks -2. Task type - these prompts will be used in all tasks of the given type - 1. `BitextMining` - 2. `Classification` - 3. `MultilabelClassification` - 4. `Clustering` - 5. `PairClassification` - 6. `Reranking` - 7. `Retrieval` - 8. `STS` - 9. `Summarization` - 10. `InstructionRetrieval` - 11. `InstructionReranking` -3. Pair of task type and prompt type like `Retrival-query` - these prompts will be used in all classification tasks -4. Task name - these prompts will be used in the specific task -5. Pair of task name and prompt type like `NFCorpus-query` - these prompts will be used in the specific task -
- -* Using CLI +### Using the CLI ```bash -mteb available_tasks +mteb available_tasks # list _all_ available tasks mteb run -m sentence-transformers/all-MiniLM-L6-v2 \ -t Banking77Classification \ @@ -97,427 +67,52 @@ mteb run -m sentence-transformers/all-MiniLM-L6-v2 \ # if nothing is specified default to saving the results in the results/{model_name} folder ``` -* Using multiple GPUs in parallel can be done by just having a custom encode function that distributes the inputs to multiple GPUs like e.g. [here](https://github.com/microsoft/unilm/blob/b60c741f746877293bb85eed6806736fc8fa0ffd/e5/mteb_eval.py#L60) or [here](https://github.com/ContextualAI/gritlm/blob/09d8630f0c95ac6a456354bcb6f964d7b9b6a609/gritlm/gritlm.py#L75). +Note that using multiple GPUs in parallel can be done by just having a custom encode function that distributes the inputs to multiple GPUs like e.g. [here](https://github.com/microsoft/unilm/blob/b60c741f746877293bb85eed6806736fc8fa0ffd/e5/mteb_eval.py#L60) or [here](https://github.com/ContextualAI/gritlm/blob/09d8630f0c95ac6a456354bcb6f964d7b9b6a609/gritlm/gritlm.py#L75). See [custom models](docs/usage/usage.md#using-a-custom-model) for more information. ## Usage Documentation -Click on each section below to see the details. - -
- -
- Task selection - -### Task selection - -Tasks can be selected by providing the list of datasets, but also - -* by their task (e.g. "Clustering" or "Classification") - -```python -tasks = mteb.get_tasks(task_types=["Clustering", "Retrieval"]) # Only select clustering and retrieval tasks -``` - -* by their categories e.g. "s2s" (sentence to sentence) or "p2p" (paragraph to paragraph) - -```python -tasks = mteb.get_tasks(categories=["s2s", "p2p"]) # Only select sentence2sentence and paragraph2paragraph datasets -``` - -* by their languages - -```python -tasks = mteb.get_tasks(languages=["eng", "deu"]) # Only select datasets which contain "eng" or "deu" (iso 639-3 codes) -``` - -You can also specify which languages to load for multilingual/cross-lingual tasks like below: - -```python -import mteb - -tasks = [ - mteb.get_task("AmazonReviewsClassification", languages = ["eng", "fra"]), - mteb.get_task("BUCCBitextMining", languages = ["deu"]), # all subsets containing "deu" -] - -# or you can select specific huggingface subsets like this: -from mteb.tasks import AmazonReviewsClassification, BUCCBitextMining - -evaluation = mteb.MTEB(tasks=[ - AmazonReviewsClassification(hf_subsets=["en", "fr"]) # Only load "en" and "fr" subsets of Amazon Reviews - BUCCBitextMining(hf_subsets=["de-en"]), # Only load "de-en" subset of BUCC -]) -# for an example of a HF subset see "Subset" in the dataset viewer at: https://huggingface.co/datasets/mteb/bucc-bitext-mining -``` - -* by their modalities - -```python -tasks = mteb.get_tasks(modalities=["text", "image"]) # Only select tasks with text or image modalities -``` - - You can also specify exclusive modality filtering to only get tasks with exactly the requested modalities (default behavior with exclusive_modality_filter=False): -```python -# Get tasks with text modality, this will also include tasks having both text and image modalities -tasks = mteb.get_tasks(modalities=["text"], exclusive_modality_filter=False) - -# Get tasks that have ONLY text modality (no image or other modalities) -tasks = mteb.get_tasks(modalities=["text"], exclusive_modality_filter=True) -``` - -
- -
- Running a benchmark - -### Running a Benchmark - -`mteb` comes with a set of predefined benchmarks. These can be fetched using `get_benchmark` and run in a similar fashion to other sets of tasks. -For instance to select the 56 English datasets that form the "Overall MTEB English leaderboard": - -```python -import mteb -benchmark = mteb.get_benchmark("MTEB(eng, v1)") -evaluation = mteb.MTEB(tasks=benchmark) -``` - -The benchmark specified not only a list of tasks, but also what splits and language to run on. To get an overview of all available benchmarks simply run: - -```python -import mteb -benchmarks = mteb.get_benchmarks() -``` - -Generally we use the naming scheme for benchmarks `MTEB(*)`, where the "*" denotes the target of the benchmark. In the case of a language, we use the three-letter language code. For large groups of languages, we use the group notation, e.g., `MTEB(Scandinavian, v1)` for Scandinavian languages. External benchmarks implemented in MTEB like `CoIR` use their original name. When using a benchmark from MTEB please cite `mteb` along with the citations of the benchmark which you can access using: - -```python -benchmark.citation -``` - -
- -
- Passing in `encode` arguments - - -### Passing in `encode` arguments - -To pass in arguments to the model's `encode` function, you can use the encode keyword arguments (`encode_kwargs`): - -```python -evaluation.run(model, encode_kwargs={"batch_size": 32}) -``` -
- - -
- Selecting evaluation split - -### Selecting evaluation split -You can evaluate only on `test` splits of all tasks by doing the following: - -```python -evaluation.run(model, eval_splits=["test"]) -``` - -Note that the public leaderboard uses the test splits for all datasets except MSMARCO, where the "dev" split is used. - -
- - -
- Selecting evaluation subset - -### Selecting evaluation subset -You can evaluate only on selected subsets. For example, if you want to evaluate only the `subset_name_to_run` subset of all tasks, do the following: - -```python -evaluation.run(model, eval_subsets=["subset_name_to_run"]) -``` - -Monolingual tasks have `default` subset, other tasks have subsets that are specific to the dataset. - -
- -
- Using a custom model - - -### Using a custom model - -Models should implement the following interface, implementing an `encode` function taking as inputs a list of sentences, and returning a list of embeddings (embeddings can be `np.array`, `torch.tensor`, etc.). For inspiration, you can look at the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts) used for running diverse models via SLURM scripts for the paper. - -```python -import mteb -from mteb.encoder_interface import PromptType -import numpy as np - - -class CustomModel: - def encode( - self, - sentences: list[str], - task_name: str, - prompt_type: PromptType | None = None, - **kwargs, - ) -> np.ndarray: - """Encodes the given sentences using the encoder. - - Args: - sentences: The sentences to encode. - task_name: The name of the task. - prompt_type: The prompt type to use. - **kwargs: Additional arguments to pass to the encoder. - - Returns: - The encoded sentences. - """ - pass - -model = CustomModel() -tasks = mteb.get_tasks(tasks=["Banking77Classification"]) -evaluation = mteb.MTEB(tasks=tasks) -evaluation.run(model) -``` - -
- -
- Evaluating on a custom dataset - - -### Evaluating on a custom dataset - -To evaluate on a custom task, you can run the following code on your custom task. See [how to add a new task](docs/adding_a_dataset.md), for how to create a new task in MTEB. - -```python -from mteb import MTEB -from mteb.abstasks.AbsTaskReranking import AbsTaskReranking -from sentence_transformers import SentenceTransformer - - -class MyCustomTask(AbsTaskReranking): - ... - -model = SentenceTransformer("average_word_embeddings_komninos") -evaluation = MTEB(tasks=[MyCustomTask()]) -evaluation.run(model) -``` - -
- -
- Using a cross encoder for reranking - - -### Using a cross encoder for reranking - -To use a cross encoder for reranking, you can directly use a CrossEncoder from SentenceTransformers. The following code shows a two-stage run with the second stage reading results saved from the first stage. - -```python -from mteb import MTEB -import mteb -from sentence_transformers import CrossEncoder, SentenceTransformer - -cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2") -dual_encoder = SentenceTransformer("all-MiniLM-L6-v2") - -tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"]) - -subset = "default" # subset name used in the NFCorpus dataset -eval_splits = ["test"] - -evaluation = MTEB(tasks=tasks) -evaluation.run( - dual_encoder, - eval_splits=eval_splits, - save_predictions=True, - output_folder="results/stage1", -) -evaluation.run( - cross_encoder, - eval_splits=eval_splits, - top_k=5, - save_predictions=True, - output_folder="results/stage2", - previous_results=f"results/stage1/NFCorpus_{subset}_predictions.json", -) -``` - -
- -
- Late Interaction (ColBERT) - -### Using Late Interaction models for retrieval - -```python -from mteb import MTEB -import mteb - - -colbert = mteb.get_model("colbert-ir/colbertv2.0") -tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"]) - -eval_splits = ["test"] - -evaluation = MTEB(tasks=tasks) - -evaluation.run( - colbert, - eval_splits=eval_splits, - corpus_chunk_size=500, -) -``` -This implementation employs the MaxSim operation to compute the similarity between sentences. While MaxSim provides high-quality results, it processes a larger number of embeddings, potentially leading to increased resource usage. To manage resource consumption, consider lowering the `corpus_chunk_size` parameter. - - -
- -
- Saving retrieval task predictions - -### Saving retrieval task predictions - -To save the predictions from a retrieval task, add the `--save_predictions` flag in the CLI or set `save_predictions=True` in the run method. The filename will be in the "{task_name}_{subset}_predictions.json" format. - -Python: -```python -from mteb import MTEB -import mteb -from sentence_transformers import SentenceTransformer - -model = SentenceTransformer("all-MiniLM-L6-v2") - -tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"]) - -evaluation = MTEB(tasks=tasks) -evaluation.run( - model, - eval_splits=["test"], - save_predictions=True, - output_folder="results", -) -``` - -CLI: -```bash -mteb run -t NFCorpus -m all-MiniLM-L6-v2 --output_folder results --save_predictions -``` - -
- -
- Fetching result from the results repository - -### Fetching results from the results repository - -Multiple models have already been run on tasks available within MTEB. These results are available results [repository](https://github.com/embeddings-benchmark/results). - -To make the results more easily accessible, we have designed custom functionality for retrieving from the repository. For instance, if you are selecting the best model for your French and English retrieval task on legal documents you could fetch the relevant tasks and create a dataframe of the results using the following code: - -```python -import mteb -from mteb.task_selection import results_to_dataframe - -tasks = mteb.get_tasks( - task_types=["Retrieval"], languages=["eng", "fra"], domains=["Legal"] -) - -model_names = [ - "GritLM/GritLM-7B", - "intfloat/multilingual-e5-small", - "intfloat/multilingual-e5-base", - "intfloat/multilingual-e5-large", -] -models = [mteb.get_model_meta(name) for name in model_names] - -results = mteb.load_results(models=models, tasks=tasks) - -df = results_to_dataframe(results) -``` - -
- - -
- Annotate Contamination in the training data of a model - -### Annotate Contamination - -have your found contamination in the training data of a model? Please let us know, either by opening an issue or ideally by submitting a PR -annotatig the training datasets of the model: - -```py -model_w_contamination = ModelMeta( - name = "model-with-contamination" - ... - training_datasets: {"ArguAna": # name of dataset within MTEB - ["test"]} # the splits that have been trained on - ... -) -``` - - -
- -
- Running the leaderboard locally - - -### Running the Leaderboard - -It is possible to completely deploy the leaderboard locally or self-host it. This can e.g. be relevant for companies that might want to -integrate build their own benchmarks or integrate custom tasks into existing benchmarks. - -Running the leaderboard is quite easy. Simply run: -```py -python -m mteb.leaderboard.app -``` - -The leaderboard requires gradio install, which can be installed using `pip install mteb[gradio]` and requires python >3.10. - -
- -
- Caching Embeddings To Re-Use Them - - -### Caching Embeddings To Re-Use Them - -There are times you may want to cache the embeddings so you can re-use them. This may be true if you have multiple query sets for the same corpus (e.g. Wikipedia) or are doing some optimization over the queries (e.g. prompting, other experiments). You can setup a cache by using a simple wrapper, which will save the cache per task in the `cache_embeddings/{task_name}` folder: - -```python -# define your task and model above as normal -... -# wrap the model with the cache wrapper -from mteb.models.cache_wrapper import CachedEmbeddingWrapper -model_with_cached_emb = CachedEmbeddingWrapper(model, cache_path='path_to_cache_dir') -# run as normal -evaluation.run(model, ...) -``` - -
- -
- - -## Documentation - -| Documentation | | +The following table links to the main sections in the usage documentation. + +| Section | | +| ------- |- | +| **General** | | +| [Evaluating a Model](docs/usage/usage.md#evaluating-a-model) | How to evaluate a model | +| [Evaluating on different Modalities](docs/usage/usage.md#evaluating-on-different-modalities) | How to evaluate image and image-text tasks | +| **Selecting Tasks** | | +| [Selecting a benchmark](docs/usage/usage.md#selecting-a-benchmark) | How to select a benchmark | +| [Task selection](docs/usage/usage.md#task-selection) | How to select and filter tasks | +| [Selecting Split and Subsets](docs/usage/usage.md#selecting-evaluation-split-or-subsets) | How to select evaluation splits or subsets | +| [Using a Custom Task](docs/usage/usage.md#using-a-custom-task) | How to evaluate on a custom task | +| **Selecting a Model** | | +| [Using a Pre-defined Model](docs/usage/usage.md#using-a-pre-defined-model) | How to run a pre-defined model | +| [Using a SentenceTransformer Model](docs/usage/usage.md#using-a-sentence-transformer-model) | How to run a model loaded using sentence-transformers | +| [Using a Custom Model](docs/usage/usage.md#using-a-custom-model) | How to run and implement a custom model | +| **Running Evaluation** | | +| [Passing Arguments to the model](docs/usage/usage.md#passing-in-encode-arguments) | How to pass `encode` arguments to the model | +| [Running Cross Encoders](docs/usage/usage.md#running-cross-encoders-on-reranking) | How to run cross encoders for reranking | +| [Running Late Interaction (ColBERT)](docs/usage/usage.md#using-late-interaction-models) | How to run late interaction models | +| [Saving Retrieval Predictions](docs/usage/usage.md#saving-retrieval-task-predictions) | How to save predictions for later analysis | +| [Caching Embeddings](docs/usage/usage.md#caching-embeddings-to-re-use-them) | How to cache and re-use embeddings | +| **Leaderboard** | | +| [Running the Leaderboard Locally](docs/usage/usage.md#running-the-leaderboard-locally) | How to run the leaderboard locally | +| [Report Data Contamination](docs/usage/usage.md#annotate-contamination) | How to report data contamination for a model | +| [Fetching Results from the Leaderboard](docs/usage/usage.md#fetching-results-from-the-leaderboard) | How to fetch the raw results from the leaderboard | + + +## Overview + +| Overview | | |--------------------------------|-------------------------------------------------------------------------------------| +| 📈 [Leaderboard] | The interactive leaderboard of the benchmark | | 📋 [Tasks] | Overview of available tasks | | 📐 [Benchmarks] | Overview of available benchmarks | +| **Contributing** | | | 🤖 [Adding a model] | Information related to how to submit a model to MTEB and to the leaderboard | -| 👩‍🔬 [Reproducible workflows] | Information related to how to reproduce and create reproducible workflows with MTEB | +| 👩‍🔬 [Reproducible workflows] | Information related to how to create reproducible workflows with MTEB | | 👩‍💻 [Adding a dataset] | How to add a new task/dataset to MTEB | | 👩‍💻 [Adding a benchmark] | How to add a new benchmark to MTEB and to the leaderboard | | 🤝 [Contributing] | How to contribute to MTEB and set it up for development | -| 🌐 [MMTEB] | An open-source effort to extend MTEB to cover a broad set of languages | -| 🖼️ [MIEB] | Extension of MTEB to image embeddings | [Tasks]: docs/tasks.md [Benchmarks]:
docs/benchmarks.md @@ -526,27 +121,51 @@ evaluation.run(model, ...) [Adding a dataset]: docs/adding_a_dataset.md [Adding a benchmark]: docs/adding_a_benchmark.md [Leaderboard]: https://huggingface.co/spaces/mteb/leaderboard -[MMTEB]: docs/mmteb/readme.md -[MIEB]: docs/mieb.md [Reproducible workflows]: docs/reproducible_workflow.md ## Citing -MTEB was introduced in "[MTEB: Massive Text Embedding Benchmark](https://aclanthology.org/2023.eacl-main.148/)", feel free to cite: +MTEB was introduced in "[MTEB: Massive Text Embedding Benchmark](https://arxiv.org/abs/2210.07316)", and heavily expanded in "[MMTEB: Massive Multilingual Text Embedding Benchmark](https://arxiv.org/abs/2502.13595)". When using `mteb` we recommend that you cite both articles. + +
+ Bibtex Citation (click to unfold) + ```bibtex +@article{enevoldsen2025mmtebmassivemultilingualtext, + title={MMTEB: Massive Multilingual Text Embedding Benchmark}, + author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2502.13595}, + year={2025}, + url={https://arxiv.org/abs/2502.13595}, + doi = {10.48550/arXiv.2502.13595}, +} + @article{muennighoff2022mteb, - doi = {10.48550/ARXIV.2210.07316}, - url = {https://arxiv.org/abs/2210.07316}, author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils}, title = {MTEB: Massive Text Embedding Benchmark}, publisher = {arXiv}, journal={arXiv preprint arXiv:2210.07316}, year = {2022} + url = {https://arxiv.org/abs/2210.07316}, + doi = {10.48550/ARXIV.2210.07316}, } ``` +
+ +If you use any of the specific benchmarks, we also recommend that you cite their authors. + +```py +benchmark = mteb.get_benchmark("MTEB(eng, v2)") +benchmark.citation # get the citation for a specific benchmark + +# you can also create a table of the tasks for the appendix using: +benchmark.tasks.to_latex() +``` -You may also want to read and cite the amazing work that has extended MTEB & integrated new datasets: +Some of these amazing publications include: - Shitao Xiao, Zheng Liu, Peitian Zhang, Niklas Muennighoff. "[C-Pack: Packaged Resources To Advance General Chinese Embedding](https://arxiv.org/abs/2309.07597)" arXiv 2023 - Michael Günther, Jackmin Ong, Isabelle Mohr, Alaeddine Abdessalem, Tanguy Abel, Mohammad Kalim Akram, Susana Guzman, Georgios Mastrapas, Saba Sturua, Bo Wang, Maximilian Werk, Nan Wang, Han Xiao. "[Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long Documents](https://arxiv.org/abs/2310.19923)" arXiv 2023 - Silvan Wehrli, Bert Arnrich, Christopher Irrgang. "[German Text Embedding Clustering Benchmark](https://arxiv.org/abs/2401.02709)" arXiv 2024 @@ -554,5 +173,3 @@ You may also want to read and cite the amazing work that has extended MTEB & int - Dawei Zhu, Liang Wang, Nan Yang, Yifan Song, Wenhao Wu, Furu Wei, Sujian Li. "[LongEmbed: Extending Embedding Models for Long Context Retrieval](https://arxiv.org/abs/2404.12096)" arXiv 2024 - Kenneth Enevoldsen, Márton Kardos, Niklas Muennighoff, Kristoffer Laigaard Nielbo. "[The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding](https://arxiv.org/abs/2406.02396)" arXiv 2024 - Ali Shiraee Kasmaee, Mohammad Khodadad, Mohammad Arshi Saloot, Nick Sherck, Stephen Dokas, Hamidreza Mahyar, Soheila Samiee. "[ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance & Efficiency on a Specific Domain](https://arxiv.org/abs/2412.00532)" arXiv 2024 - -For works that have used MTEB for benchmarking, you can find them on the [leaderboard](https://huggingface.co/spaces/mteb/leaderboard). diff --git a/docs/mieb.md b/docs/mieb/readme.md similarity index 69% rename from docs/mieb.md rename to docs/mieb/readme.md index e059926137..af23c8573e 100644 --- a/docs/mieb.md +++ b/docs/mieb/readme.md @@ -1,3 +1,6 @@ +**NOTE**: This collaboration has been finalized and the paper will be released soon. This document remains for documentation purposes. + + # Welcome to MIEB! 👋 The Massive Image Embedding Benchmark (MIEB) is an image extension of [MTEB](https://arxiv.org/abs/2210.07316) to cover embedding tasks for image-text tasks. @@ -34,9 +37,9 @@ class OpenCLIPWrapper: See also [adding a model](adding_a_model.md) for reference. ### X Evaluator -With the model, [ZeroshotClassificationEvaluator](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py) is implemented here. This defines how the model are used to do zero-shot classification and get back results on desired metrics. +With the model, [ZeroShotClassificationEvaluator](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/evaluation/evaluators/Image/ZeroShotClassificationEvaluator.py) is implemented here. This defines how the model is used to perform zero-shot classification and return results on the desired metrics. ```python -class ZeroshotClassificationEvaluator(Evaluator): +class ZeroShotClassificationEvaluator(Evaluator): def __init__(self, ...): ...
def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}): @@ -45,17 +48,17 @@ class ZeroshotClassificationEvaluator(Evaluator): ``` ### AbsTask X -With the evaluator, [AbsTaskZeroshotClassification](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/abstasks/Image/AbsTaskZeroshotClassification.py) is defined, operating on the dataset, calling the defined Evaluator, and gives out results. +With the evaluator, [AbsTaskZeroShotClassification](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/abstasks/Image/AbsTaskZeroShotClassification.py) is defined; it operates on the dataset, calls the defined Evaluator, and returns the results. ```python -class AbsTaskZeroshotClassification(AbsTask): +class AbsTaskZeroShotClassification(AbsTask): ... ``` ### Dataset class -With all these, we can then define the dataset. [CIFAR10](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py) is implemented like this, subclassing `AbsTaskZeroshotClassification`, and overwrite the `get_candidate_labels` function, which gives `["a photo of {label_name}"]` to be used in the evaluator. +With all these, we can then define the dataset. [CIFAR10](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py) is implemented like this, subclassing `AbsTaskZeroShotClassification` and overriding the `get_candidate_labels` function, which returns `["a photo of {label_name}"]` to be used in the evaluator. ```python -class CIFAR10ZeroShotClassification(AbsTaskZeroshotClassification): +class CIFAR10ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata(...) def get_candidate_labels(self) -> list[str]: @@ -76,41 +79,4 @@ evaluation = mteb.MTEB(tasks=tasks) results = evaluation.run(model) ``` -By default, results will be under `results/laion__CLIP-ViT-L-14-laion2B-s32B-b82K/REVISION/CIFAR10ZeroShot.json`. Sometimes metrics can be a bit different than what the original paper claimed. This might be due to the resolution/layout difference of images in the remake of the dataset. - - -## Specific Model running Instructions - -Some models require some specific steps before running. Those are collected here. -
- Vista - - ## set up VISTA - - ``` - git clone https://github.com/FlagOpen/FlagEmbedding.git - cd FlagEmbedding/research/visual_bge - pip install -e . - pip install torchvision timm einops ftfy - ``` - back to the root folder of mteb; download the vision tower for bge-base - ``` - cd .. - wget https://huggingface.co/BAAI/bge-visualized/resolve/main/Visualized_base_en_v1.5.pth?download=true - ``` - rename it to `visualized_base_en_V1.5.pth` - ``` - mv Visualized_base_en_v1.5.pth?download=true visualized_base_en_V1.5.pth - ``` - download the vision tower for bge-m3 - ``` - wget https://huggingface.co/BAAI/bge-visualized/resolve/main/Visualized_m3.pth?download=true - ``` - rename it to `visualized_m3.pth` - ``` - mv Visualized_m3.pth?download=true visualized_m3.pth - ``` - - -
+By default, results will be under `results/laion__CLIP-ViT-L-14-laion2B-s32B-b82K/REVISION/CIFAR10ZeroShot.json`. Sometimes metrics can be a bit different from what the original paper claimed. This might be due to resolution/layout differences of the images in the remake of the dataset. diff --git a/docs/mmteb/readme.md b/docs/mmteb/readme.md index 56b7a0bef4..ef5768b0be 100644 --- a/docs/mmteb/readme.md +++ b/docs/mmteb/readme.md @@ -1,3 +1,6 @@ + +**NOTE**: This open collaboration has been finalized and the [paper](https://arxiv.org/abs/2502.13595) released. This document remains for documentation purposes. + # Welcome to MMTEB! 👋 The Massive Multilingual Text Embedding Benchmark (MMTEB) is a community-led extension of [MTEB](https://arxiv.org/abs/2210.07316) to cover embedding tasks for a massive number of languages. diff --git a/docs/tasks.md b/docs/tasks.md index d03ee14c2f..f5dcc916a5 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -67,6 +67,7 @@ The following tables give you an overview of the tasks in MTEB. | [BlurbsClusteringS2S.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | s2s | [Fiction, Written] | None | None | | [BornholmBitextMining](https://aclanthology.org/W19-6138/) | ['dan'] | BitextMining | s2s | [Fiction, Social, Web, Written] | {'test': 500} | {'test': {'num_samples': 500, 'number_of_characters': 44361, 'unique_pairs': 500, 'min_sentence1_length': 1, 'average_sentence1_length': 49.83, 'max_sentence1_length': 555, 'unique_sentence1': 497, 'min_sentence2_length': 5, 'average_sentence2_length': 38.89, 'max_sentence2_length': 453, 'unique_sentence2': 491}} | | [BrazilianToxicTweetsClassification](https://paperswithcode.com/dataset/told-br) (Joao Augusto Leite and Diego F. Silva and Kalina Bontcheva and Carolina Scarton, 2020) | ['por'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | +| [BrightLongRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [BrightRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [BuiltBenchClusteringP2P](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Clustering | p2p | [Engineering, Written] | None | None | | [BuiltBenchClusteringS2S](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Clustering | s2s | [Engineering, Written] | None | None | @@ -190,6 +191,7 @@ The following tables give you an overview of the tasks in MTEB.
| [ClimateFEVER-NL](https://huggingface.co/datasets/clips/beir-nl-climate-fever) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [ClimateFEVER.v2](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Academic, Written] | None | None | | [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [ClusTREC-Covid](https://github.com/katzurik/Knowledge_Navigator/tree/main/Benchmarks/CLUSTREC%20COVID) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 4568} | {'test': {'num_samples': 4568, 'number_of_characters': 2977845, 'min_text_length': 14, 'average_text_length': 651.89, 'max_text_length': 8364, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 100, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 100}, 'coronavirus response to weather changes': {'count': 100}, 'coronavirus immunity': {'count': 78}, 'how do people die from the coronavirus': {'count': 100}, 'animal models of COVID-19': {'count': 100}, 'coronavirus test rapid testing': {'count': 100}, 'serological tests for coronavirus': {'count': 100}, 'coronavirus under reporting': {'count': 100}, 'coronavirus in Canada': {'count': 92}, 'coronavirus social distancing impact': {'count': 100}, 'coronavirus hospital rationing': {'count': 100}, 'coronavirus quarantine': {'count': 100}, 'how does coronavirus spread': {'count': 100}, 'coronavirus super spreaders': {'count': 98}, 'coronavirus outside body': {'count': 34}, 'how long does coronavirus survive on surfaces': {'count': 74}, 'coronavirus clinical trials': {'count': 100}, 'masks prevent coronavirus': {'count': 100}, 'what alcohol sanitizer kills coronavirus': {'count': 64}, 'coronavirus and ACE inhibitors': {'count': 100}, 'coronavirus mortality': {'count': 100}, 'coronavirus heart impacts': {'count': 100}, 'coronavirus hypertension': {'count': 74}, 'coronavirus diabetes': {'count': 100}, 'coronavirus biomarkers': {'count': 100}, 'coronavirus early symptoms': {'count': 100}, 'coronavirus asymptomatic': {'count': 100}, 'coronavirus hydroxychloroquine': {'count': 100}, 'coronavirus drug repurposing': {'count': 100}, 'coronavirus remdesivir': {'count': 100}, 'difference between coronavirus and flu': {'count': 100}, 'coronavirus subtypes': {'count': 6}, 'coronavirus vaccine candidates': {'count': 36}, 'coronavirus recovery': {'count': 100}, 'coronavirus public datasets': {'count': 100}, 'SARS-CoV-2 spike structure': {'count': 100}, 'SARS-CoV-2 phylogenetic analysis': {'count': 100}, 'COVID inflammatory response': {'count': 100}, 'COVID-19 cytokine storm': {'count': 100}, 'coronavirus mutations': {'count': 100}, 'COVID-19 in African-Americans': {'count': 100}, 'Vitamin D and COVID-19': {'count': 100}, 'violence during pandemic': {'count': 100}, 'impact of masks on coronavirus transmission': {'count': 100}, 'coronavirus mental health impact': {'count': 100}, 'dexamethasone coronavirus': {'count': 92}, 'COVID-19 outcomes in children': {'count': 100}, 'school reopening coronavirus': {'count': 100}, 'post-infection COVID-19 immunity': {'count': 88}, 'mRNA vaccine coronavirus': {'count': 32}}, 'hf_subset_descriptive_stats': {'title and abstract': {'num_samples': 2284, 'number_of_characters': 2755462, 'min_text_length': 14, 'average_text_length': 1206.42, 'max_text_length': 8364, 
'min_labels_per_text': 3, 'average_labels_per_text': 1.0, 'max_labels_per_text': 50, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 50}, 'coronavirus response to weather changes': {'count': 50}, 'coronavirus immunity': {'count': 39}, 'how do people die from the coronavirus': {'count': 50}, 'animal models of COVID-19': {'count': 50}, 'coronavirus test rapid testing': {'count': 50}, 'serological tests for coronavirus': {'count': 50}, 'coronavirus under reporting': {'count': 50}, 'coronavirus in Canada': {'count': 46}, 'coronavirus social distancing impact': {'count': 50}, 'coronavirus hospital rationing': {'count': 50}, 'coronavirus quarantine': {'count': 50}, 'how does coronavirus spread': {'count': 50}, 'coronavirus super spreaders': {'count': 49}, 'coronavirus outside body': {'count': 17}, 'how long does coronavirus survive on surfaces': {'count': 37}, 'coronavirus clinical trials': {'count': 50}, 'masks prevent coronavirus': {'count': 50}, 'what alcohol sanitizer kills coronavirus': {'count': 32}, 'coronavirus and ACE inhibitors': {'count': 50}, 'coronavirus mortality': {'count': 50}, 'coronavirus heart impacts': {'count': 50}, 'coronavirus hypertension': {'count': 37}, 'coronavirus diabetes': {'count': 50}, 'coronavirus biomarkers': {'count': 50}, 'coronavirus early symptoms': {'count': 50}, 'coronavirus asymptomatic': {'count': 50}, 'coronavirus hydroxychloroquine': {'count': 50}, 'coronavirus drug repurposing': {'count': 50}, 'coronavirus remdesivir': {'count': 50}, 'difference between coronavirus and flu': {'count': 50}, 'coronavirus subtypes': {'count': 3}, 'coronavirus vaccine candidates': {'count': 18}, 'coronavirus recovery': {'count': 50}, 'coronavirus public datasets': {'count': 50}, 'SARS-CoV-2 spike structure': {'count': 50}, 'SARS-CoV-2 phylogenetic analysis': {'count': 50}, 'COVID inflammatory response': {'count': 50}, 'COVID-19 cytokine storm': {'count': 50}, 'coronavirus mutations': {'count': 50}, 'COVID-19 in African-Americans': {'count': 50}, 'Vitamin D and COVID-19': {'count': 50}, 'violence during pandemic': {'count': 50}, 'impact of masks on coronavirus transmission': {'count': 50}, 'coronavirus mental health impact': {'count': 50}, 'dexamethasone coronavirus': {'count': 46}, 'COVID-19 outcomes in children': {'count': 50}, 'school reopening coronavirus': {'count': 50}, 'post-infection COVID-19 immunity': {'count': 44}, 'mRNA vaccine coronavirus': {'count': 16}}}, 'title': {'num_samples': 2284, 'number_of_characters': 222383, 'min_text_length': 14, 'average_text_length': 97.37, 'max_text_length': 348, 'min_labels_per_text': 3, 'average_labels_per_text': 1.0, 'max_labels_per_text': 50, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 50}, 'coronavirus response to weather changes': {'count': 50}, 'coronavirus immunity': {'count': 39}, 'how do people die from the coronavirus': {'count': 50}, 'animal models of COVID-19': {'count': 50}, 'coronavirus test rapid testing': {'count': 50}, 'serological tests for coronavirus': {'count': 50}, 'coronavirus under reporting': {'count': 50}, 'coronavirus in Canada': {'count': 46}, 'coronavirus social distancing impact': {'count': 50}, 'coronavirus hospital rationing': {'count': 50}, 'coronavirus quarantine': {'count': 50}, 'how does coronavirus spread': {'count': 50}, 'coronavirus super spreaders': {'count': 49}, 'coronavirus outside body': {'count': 17}, 'how long does coronavirus survive on surfaces': {'count': 37}, 'coronavirus clinical trials': {'count': 50}, 'masks prevent coronavirus': {'count': 
50}, 'what alcohol sanitizer kills coronavirus': {'count': 32}, 'coronavirus and ACE inhibitors': {'count': 50}, 'coronavirus mortality': {'count': 50}, 'coronavirus heart impacts': {'count': 50}, 'coronavirus hypertension': {'count': 37}, 'coronavirus diabetes': {'count': 50}, 'coronavirus biomarkers': {'count': 50}, 'coronavirus early symptoms': {'count': 50}, 'coronavirus asymptomatic': {'count': 50}, 'coronavirus hydroxychloroquine': {'count': 50}, 'coronavirus drug repurposing': {'count': 50}, 'coronavirus remdesivir': {'count': 50}, 'difference between coronavirus and flu': {'count': 50}, 'coronavirus subtypes': {'count': 3}, 'coronavirus vaccine candidates': {'count': 18}, 'coronavirus recovery': {'count': 50}, 'coronavirus public datasets': {'count': 50}, 'SARS-CoV-2 spike structure': {'count': 50}, 'SARS-CoV-2 phylogenetic analysis': {'count': 50}, 'COVID inflammatory response': {'count': 50}, 'COVID-19 cytokine storm': {'count': 50}, 'coronavirus mutations': {'count': 50}, 'COVID-19 in African-Americans': {'count': 50}, 'Vitamin D and COVID-19': {'count': 50}, 'violence during pandemic': {'count': 50}, 'impact of masks on coronavirus transmission': {'count': 50}, 'coronavirus mental health impact': {'count': 50}, 'dexamethasone coronavirus': {'count': 46}, 'COVID-19 outcomes in children': {'count': 50}, 'school reopening coronavirus': {'count': 50}, 'post-infection COVID-19 immunity': {'count': 44}, 'mRNA vaccine coronavirus': {'count': 16}}}}}} | | [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | | [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | ['cmn'] | PairClassification | s2s | | None | None | | [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 26000} | {'train': {'number_of_characters': 935841, 'num_samples': 26000, 'num_queries': 13000, 'num_documents': 13000, 'min_document_length': 18, 'average_document_length': 70.99, 'max_document_length': 2532, 'unique_documents': 13000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 13000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 70519, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 69.52, 'max_document_length': 1811, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 57880, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 56.88, 'max_document_length': 601, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'typescript': {'number_of_characters': 61092, 'num_samples': 2000, 'num_queries': 1000, 
'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 60.09, 'max_document_length': 659, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 71797, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 70.8, 'max_document_length': 1529, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 67900, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 66.9, 'max_document_length': 751, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 63984, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 23, 'average_document_length': 62.98, 'max_document_length': 807, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 62927, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 61.93, 'max_document_length': 766, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c': {'number_of_characters': 98588, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 97.59, 'max_document_length': 1672, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c++': {'number_of_characters': 115480, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 114.48, 'max_document_length': 1856, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'rust': {'number_of_characters': 68503, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 67.5, 'max_document_length': 2532, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 1000}, 'swift': {'number_of_characters': 58279, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 57.28, 'max_document_length': 727, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'scala': {'number_of_characters': 65833, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 64.83, 'max_document_length': 685, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'shell': {'number_of_characters': 73059, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 72.06, 'max_document_length': 813, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | @@ -1186,7 +1188,7 @@ The following tables give you an overview of the tasks in MTEB. | ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | | emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 0 | 3 | 55 | 19 | 160 | 20 | 7 | 10 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 111 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 494 | +| eng | English | Indo-European | 0 | 3 | 55 | 19 | 160 | 21 | 7 | 10 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 112 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 496 | | enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1959,7 +1961,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 0 | 55 | 55 | 1492 | 836 | 313 | 7 | 10 | 22 | 5 | 0 | 3 | 28 | 91 | 56 | 585 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | +| Total | None | None | None | 0 | 55 | 55 | 1492 | 836 | 314 | 7 | 10 | 22 | 5 | 0 | 3 | 28 | 91 | 56 | 586 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | diff --git a/docs/usage/usage.md b/docs/usage/usage.md new file mode 100644 index 0000000000..cba88f21ea --- /dev/null +++ b/docs/usage/usage.md @@ -0,0 +1,510 @@ +# Usage + +This usage documentation first introduces a simple example of how to evaluate a model in MTEB. +It then goes into more detail on defining a model, selecting tasks and running the evaluation. Each section contains subsections pertaining to +these steps. + + +## Evaluating a Model + +Evaluating a model on MTEB follows a three-step approach: 1) defining the model, 2) selecting the tasks and 3) running the evaluation. + +```python +import mteb + +# Specify the model that we want to evaluate +model = ... + +# specify what you want to evaluate it on +tasks = mteb.get_tasks(tasks=["{task1}", "{task2}"]) + +# run the evaluation +evaluation = mteb.MTEB(tasks=tasks) +results = evaluation.run(model) +``` + +For instance, if we want to run [`"sentence-transformers/all-MiniLM-L6-v2"`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) on +`"Banking77Classification"` we can do so using the following code: + +```python +import mteb +from sentence_transformers import SentenceTransformer + +model_name = "sentence-transformers/all-MiniLM-L6-v2" + +# load the model directly using sentence-transformers +model = SentenceTransformer(model_name) +# or load it using MTEB +model = mteb.get_model(model_name) # will default to SentenceTransformer(model_name) if the model is not implemented in MTEB + +# select the desired tasks and evaluate +tasks = mteb.get_tasks(tasks=["Banking77Classification"]) +evaluation = mteb.MTEB(tasks=tasks) +results = evaluation.run(model) +``` + + +### Evaluating on Different Modalities +MTEB is not limited to text: it also allows you to evaluate image and image-text embeddings. + +> [!NOTE] +> Running MTEB on images requires you to install the optional dependencies using `pip install mteb[image]` + +To evaluate image embeddings you can follow the same approach as for any other task in `mteb`. 
Simply ensure that the task contains the "image" modality: + +```python +tasks = mteb.get_tasks(modalities=["image"]) # Only select tasks with image modalities +task = tasks[0] + +print(task.metadata.modalities) +# ['text', 'image'] +``` + +However, we recommend starting with one of the predefined benchmarks: + +```python +import mteb +benchmark = mteb.get_benchmark("MIEB(eng)") +evaluation = mteb.MTEB(tasks=benchmark) + +model = mteb.get_model("{model-of-choice}") +evaluation.run(model) +``` + +You can also specify exclusive modality filtering to only get tasks with exactly the requested modalities (default behavior with `exclusive_modality_filter=False`): +```python +# Get tasks with image modality, this will also include tasks having both text and image modalities +tasks = mteb.get_tasks(modalities=["image"], exclusive_modality_filter=False) + +# Get tasks that have ONLY image modality +tasks = mteb.get_tasks(modalities=["image"], exclusive_modality_filter=True) +``` + + + + + + +## Defining a Model + +### Using a Pre-defined Model + +MTEB comes with an implementation of many popular models and APIs. These can be loaded using `mteb.get_model_meta`: + +```python +model_name = "intfloat/multilingual-e5-small" +meta = mteb.get_model_meta(model_name) +model = meta.load_model() +# or directly using +model = mteb.get_model(model_name) +``` + +You can get an overview of the models available in `mteb` as follows: + +```py +model_metas = mteb.get_model_metas() + +# You can e.g. use the model metas to find all openai models +openai_models = [meta for meta in model_metas if "openai" in meta.name] +``` +> [!TIP] +> Some models require additional dependencies to run on MTEB. An example of such models is the OpenAI APIs. +> These dependencies can be installed using `pip install mteb[openai]` + +### Using a Sentence Transformer Model + +MTEB is made to be compatible with sentence-transformers, and thus you can readily evaluate any model that can be loaded via sentence-transformers +on `MTEB`: + +```python +import mteb +from sentence_transformers import SentenceTransformer + +model = SentenceTransformer("sentence-transformers/LaBSE") + +# select the desired tasks and evaluate +tasks = mteb.get_tasks(tasks=["Banking77Classification"]) +evaluation = mteb.MTEB(tasks=tasks) +results = evaluation.run(model) +``` + +However, we do recommend checking whether mteb includes an implementation of the model before using sentence-transformers directly, since some models (e.g. the [multilingual e5 models](https://huggingface.co/collections/intfloat/multilingual-e5-text-embeddings-67b2b8bb9bff40dec9fb3534)) require a prompt, and not specifying it may reduce performance. + +> [!NOTE] +> If you want to evaluate a cross encoder on a reranking task, see the section on [running cross encoders for reranking](#running-cross-encoders-on-reranking) + +### Using a Custom Model + +It is also possible to implement your own custom model in MTEB as long as it adheres to the [encoder interface](https://github.com/embeddings-benchmark/mteb/blob/main/mteb/encoder_interface.py#L21). + +This entails implementing an `encode` function taking as inputs a list of sentences, and returning a list of embeddings (embeddings can be `np.array`, `torch.tensor`, etc.). + +```python +import mteb +from mteb.encoder_interface import PromptType +import numpy as np + + +class CustomModel: + def encode( + self, + sentences: list[str], + task_name: str, + prompt_type: PromptType | None = None, + **kwargs, + ) -> np.ndarray: + """Encodes the given sentences using the encoder. + + Args: + sentences: The sentences to encode. 
+ task_name: The name of the task. + prompt_type: The prompt type to use. + **kwargs: Additional arguments to pass to the encoder. + + Returns: + The encoded sentences. + """ + pass + + +# evaluating the model: +model = CustomModel() +tasks = mteb.get_tasks(tasks=["Banking77Classification"]) +evaluation = mteb.MTEB(tasks=tasks) +evaluation.run(model) +``` + +If you want to submit your implementation to be included in the leaderboard, see the section on [submitting a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md). + +## Selecting Tasks + +This section describes how to select benchmarks and tasks to evaluate, including selecting specific subsets or splits to run. + +### Selecting a Benchmark + +`mteb` comes with a set of predefined benchmarks. These can be fetched using `mteb.get_benchmark` and run in a similar fashion to other sets of tasks. +For instance, to select the 56 English datasets that form the English leaderboard: + +```python +import mteb +benchmark = mteb.get_benchmark("MTEB(eng, v2)") +evaluation = mteb.MTEB(tasks=benchmark) +``` + +A benchmark specifies not only a list of tasks, but also which splits and languages to run on. + +To get an overview of all available benchmarks simply run: + +```python +import mteb +benchmarks = mteb.get_benchmarks() +``` + +> [!NOTE] +> Generally we use the naming scheme for benchmarks `MTEB(*)`, where the "*" denotes the target of the benchmark. +> In the case of a language, we use the three-letter language code. +> For large groups of languages, we use the group notation, e.g., `MTEB(Scandinavian, v1)` for Scandinavian languages. +> External benchmarks implemented in MTEB like `CoIR` use their original name. + +When using a benchmark from MTEB please cite `mteb` along with the citations of the benchmark which you can access using: + +```python +benchmark.citation +``` + +### Task selection + +`mteb` comes with the utility functions `mteb.get_task` and `mteb.get_tasks` for fetching and analysing the tasks of interest. + +This can be done in multiple ways, e.g.: + +* by the task name +* by their type (e.g. "Clustering" or "Classification") +* by their languages +* by their domains +* by their modalities +* and many more + +```python +# by name +tasks = mteb.get_tasks(tasks=["Banking77Classification"]) +# by type +tasks = mteb.get_tasks(task_types=["Clustering", "Retrieval"]) # Only select clustering and retrieval tasks +# by language +tasks = mteb.get_tasks(languages=["eng", "deu"]) # Only select datasets which contain "eng" or "deu" (iso 639-3 codes) +# by domain +tasks = mteb.get_tasks(domains=["Legal"]) +# by modality +tasks = mteb.get_tasks(modalities=["text", "image"]) # Only select tasks with text or image modalities +# or using multiple +tasks = mteb.get_tasks(languages=["eng", "deu"], script=["Latn"], domains=["Legal"]) +``` + +For more information, see the documentation for `mteb.get_tasks`. + +You can also specify which languages to load for multilingual/cross-lingual tasks like below: + +```python +import mteb + +tasks = [ + mteb.get_task("AmazonReviewsClassification", languages = ["eng", "fra"]), + mteb.get_task("BUCCBitextMining", languages = ["deu"]), # all subsets containing "deu" +] +``` + +### Selecting Evaluation Split or Subsets +A task in `mteb` mirrors the structure of a dataset on Huggingface. It includes splits (e.g. "test") and subsets. 
+
+> [!NOTE]
+> **What is a subset?** A subset on a Huggingface dataset is what you specify after the dataset name, e.g. `datasets.load_dataset("nyu-mll/glue", "cola")`.
+> Often the subset does not need to be defined and is left as "default". The subset is, however, useful, especially for multilingual datasets, to specify the
+> desired language or language pair, e.g. in [`mteb/bucc-bitext-mining`](https://huggingface.co/datasets/mteb/bucc-bitext-mining) we might want to evaluate only on the French-English subset `"fr-en"`.
+
+### Using a Custom Task
+
+To evaluate on a custom task, you can run the following code with your custom task.
+See [how to add a new task](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_dataset.md) for how to create a new task in MTEB.
+
+
+```python
+import mteb
+from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
+
+
+class MyCustomTask(AbsTaskReranking):
+    ...
+
+model = mteb.get_model(...)
+evaluation = mteb.MTEB(tasks=[MyCustomTask()])
+evaluation.run(model)
+```
+
+
+## Running the Evaluation
+
+This section contains documentation related to the runtime of the evaluation: how to pass arguments to the encoder, how to save outputs, and similar.
+
+
+### Introduction to the runner
+
+By default `mteb` will save the results in the `results/{model_name}` folder; however, if you want to save the results in a specific folder, you
+can specify it as follows:
+
+```python
+evaluation = mteb.MTEB(tasks=tasks)
+results = evaluation.run(model, output_folder="my_results_folder")
+```
+
+### Tracking Carbon Emissions
+
+`mteb` allows for easy tracking of carbon emissions (CO2 eq.) using `codecarbon`. You simply need to install `mteb[codecarbon]` and enable CO2 tracking:
+
+```python
+evaluation = mteb.MTEB(tasks=tasks)
+results = evaluation.run(model, co2_tracker=True)
+```
+
+
+### Passing in `encode` arguments
+
+To pass in arguments to the model's `encode` function, you can use the encode keyword arguments (`encode_kwargs`):
+
+```python
+evaluation.run(model, encode_kwargs={"batch_size": 32})
+```
+
+### Running SentenceTransformer model with prompts
+
+Prompts can be passed to the SentenceTransformer model using the `prompts` parameter. The following code shows how to use prompts with SentenceTransformer:
+
+```python
+import mteb
+from sentence_transformers import SentenceTransformer
+
+
+model = SentenceTransformer("average_word_embeddings_komninos", prompts={"query": "Query:", "passage": "Passage:"})
+evaluation = mteb.MTEB(tasks=tasks)
+```
+
+In `prompts` the key can be:
+1. Prompt types (`passage`, `query`) - they will be used in reranking and retrieval tasks
+2. Task type - these prompts will be used in all tasks of the given type
+    1. `BitextMining`
+    2. `Classification`
+    3. `MultilabelClassification`
+    4. `Clustering`
+    5. `PairClassification`
+    6. `Reranking`
+    7. `Retrieval`
+    8. `STS`
+    9. `Summarization`
+    10. `InstructionRetrieval`
+3. Pair of task type and prompt type like `Retrieval-query` - these prompts will be used in all retrieval tasks
+4. Task name - these prompts will be used in the specific task
+5. Pair of task name and prompt type like `NFCorpus-query` - these prompts will be used in the specific task
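+
+For example, a `prompts` dictionary mixing these key types might look as follows (the prompt strings are placeholders chosen purely for illustration):
+
+```python
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer(
+    "average_word_embeddings_komninos",
+    prompts={
+        "query": "Query:",                    # 1. prompt type
+        "Classification": "Classify:",        # 2. task type
+        "Retrieval-query": "Search query:",   # 3. task type + prompt type
+        "NFCorpus": "Medical question:",      # 4. task name
+        "NFCorpus-query": "Represent the medical question:",  # 5. task name + prompt type
+    },
+)
+```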
+
+### Running Cross Encoders on Reranking
+
+To use a cross encoder for reranking, you can directly use a `CrossEncoder` from Sentence Transformers. The following code shows a two-stage run with the second stage reading results saved from the first stage.
+
+```python
+from mteb import MTEB
+import mteb
+from sentence_transformers import CrossEncoder, SentenceTransformer
+
+cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
+dual_encoder = SentenceTransformer("all-MiniLM-L6-v2")
+
+tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"])
+
+subset = "default" # subset name used in the NFCorpus dataset
+eval_splits = ["test"]
+
+evaluation = MTEB(tasks=tasks)
+evaluation.run(
+    dual_encoder,
+    eval_splits=eval_splits,
+    save_predictions=True,
+    output_folder="results/stage1",
+)
+evaluation.run(
+    cross_encoder,
+    eval_splits=eval_splits,
+    top_k=5,
+    save_predictions=True,
+    output_folder="results/stage2",
+    previous_results=f"results/stage1/NFCorpus_{subset}_predictions.json",
+)
+```
+
+
+### Using Late Interaction Models
+
+This section outlines how to use late interaction models for retrieval.
+
+```python
+from mteb import MTEB
+import mteb
+
+
+colbert = mteb.get_model("colbert-ir/colbertv2.0")
+tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"])
+
+eval_splits = ["test"]
+
+evaluation = MTEB(tasks=tasks)
+
+evaluation.run(
+    colbert,
+    eval_splits=eval_splits,
+    corpus_chunk_size=500,
+)
+```
+This implementation employs the MaxSim operation to compute the similarity between sentences. While MaxSim provides high-quality results, it processes a larger number of embeddings, potentially leading to increased resource usage. To manage resource consumption, consider lowering the `corpus_chunk_size` parameter.
+
+
+### Saving retrieval task predictions
+
+To save the predictions from a retrieval task, add the `--save_predictions` flag in the CLI or set `save_predictions=True` in the run method. The filename will be in the "{task_name}_{subset}_predictions.json" format.
+
+Python:
+```python
+from mteb import MTEB
+import mteb
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"])
+
+evaluation = MTEB(tasks=tasks)
+evaluation.run(
+    model,
+    eval_splits=["test"],
+    save_predictions=True,
+    output_folder="results",
+)
+```
+
+CLI:
+```bash
+mteb run -t NFCorpus -m all-MiniLM-L6-v2 --output_folder results --save_predictions
+```
+
+### Caching Embeddings To Re-Use Them
+
+There are times you may want to cache the embeddings so you can re-use them. This may be true if you have multiple query sets for the same corpus (e.g. Wikipedia) or are doing some optimization over the queries (e.g. prompting, other experiments). You can set up a cache by using a simple wrapper, which will save the cache per task in the `cache_embeddings/{task_name}` folder:
+
+```python
+# define your task and model above as normal
+...
+# wrap the model with the cache wrapper
+from mteb.models.cache_wrapper import CachedEmbeddingWrapper
+model_with_cached_emb = CachedEmbeddingWrapper(model, cache_path='path_to_cache_dir')
+# run as normal, passing the wrapped model
+evaluation.run(model_with_cached_emb, ...)
+```
+
+## Leaderboard
+
+This section contains information on how to interact with the leaderboard, including running it locally, analysing the results, annotating contamination and more.
+
+### Fetching results from the Leaderboard
+
+Multiple models have already been run on tasks available within MTEB. These results are available in the results [repository](https://github.com/embeddings-benchmark/results).
+
+To make the results more easily accessible, we have designed custom functionality for retrieving results from the repository. For instance, if you are selecting the best model for your French and English retrieval task on legal documents, you could fetch the relevant tasks and create a dataframe of the results using the following code:
+
+```python
+import mteb
+from mteb.task_selection import results_to_dataframe
+
+tasks = mteb.get_tasks(
+    task_types=["Retrieval"], languages=["eng", "fra"], domains=["Legal"]
+)
+
+model_names = [
+    "GritLM/GritLM-7B",
+    "intfloat/multilingual-e5-small",
+    "intfloat/multilingual-e5-base",
+    "intfloat/multilingual-e5-large",
+]
+models = [mteb.get_model_meta(name) for name in model_names]
+
+results = mteb.load_results(models=models, tasks=tasks)
+
+df = results_to_dataframe(results)
+```
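+
+From here you could, for instance, rank the models by their average score across the selected tasks. A minimal sketch, assuming the dataframe has one numeric column per model (check the actual layout produced by `results_to_dataframe` before relying on it):
+
+```python
+# average score per model, highest first
+mean_scores = df.mean(numeric_only=True).sort_values(ascending=False)
+print(mean_scores)
+```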
+
+### Annotate Contamination
+
+Have you found contamination in the training data of a model? Please let us know, either by opening an issue or ideally by submitting a PR
+annotating the training datasets of the model:
+
+```py
+model_w_contamination = ModelMeta(
+    name="model-with-contamination",
+    ...
+    training_datasets={"ArguAna":  # name of dataset within MTEB
+                       ["test"]},  # the splits that have been trained on
+    ...
+)
+```
+
+
+### Running the Leaderboard Locally
+
+It is possible to deploy the leaderboard locally or self-host it. This can e.g. be relevant for companies that want to
+build their own benchmarks or integrate custom tasks into existing benchmarks.
+
+Running the leaderboard is quite easy. Simply run:
+```bash
+python -m mteb.leaderboard.app
+```
+
+The leaderboard requires gradio, which can be installed using `pip install mteb[gradio]`, and requires Python >3.10.
diff --git a/mteb/__init__.py b/mteb/__init__.py index 80d1650fde..807a730c6e 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -15,7 +15,7 @@ ) from mteb.overview import TASKS_REGISTRY, get_task, get_tasks -from .benchmarks.benchmarks import Benchmark +from .benchmarks.benchmark import Benchmark from .benchmarks.get_benchmark import BENCHMARK_REGISTRY, get_benchmark, get_benchmarks __version__ = version("mteb") # fetch version from install metadata diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index e072ca498c..146819c457 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -152,8 +152,7 @@ def _calculate_metrics_from_split( sentence1 = self.dataset[hf_subset][split][sent_1] sentence2 = self.dataset[hf_subset][split][sent_2] elif compute_overall: - sentence1 = [] - sentence2 = [] + sentence1, sentence2 = [], [] if self.parallel_subsets: for hf_subset in self.metadata.eval_langs: sent_1, sent_2 = hf_subset.split("-") diff --git a/mteb/abstasks/Image/AbsTaskImageClassification.py b/mteb/abstasks/Image/AbsTaskImageClassification.py index 0f2ee4e449..76fcc52625 100644 --- a/mteb/abstasks/Image/AbsTaskImageClassification.py +++ b/mteb/abstasks/Image/AbsTaskImageClassification.py @@ -98,8 +98,7 @@ def _calculate_metrics_from_split( imgs = self.dataset[hf_subset][split][self.image_column_name] labels = self.dataset[hf_subset][split][self.label_column_name] elif compute_overall: - imgs = [] - labels = [] + imgs, labels = [], [] for hf_subset in self.metadata.eval_langs: imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) labels.extend(self.dataset[hf_subset][split][self.label_column_name]) @@ -113,7 +112,7 @@ def _calculate_metrics_from_split( img_widths, img_heights = [], [] for img in imgs: - width, height = img.size + width, height = img.size # type: ignore img_heights.append(height) img_widths.append(width) diff --git a/mteb/abstasks/Image/AbsTaskImageClustering.py b/mteb/abstasks/Image/AbsTaskImageClustering.py index 12cd489a60..efd5a6cab8 100644 --- a/mteb/abstasks/Image/AbsTaskImageClustering.py +++ b/mteb/abstasks/Image/AbsTaskImageClustering.py @@ -73,8 +73,7 @@ def _calculate_metrics_from_split( imgs = self.dataset[hf_subset][split][self.image_column_name] labels = self.dataset[hf_subset][split][self.label_column_name] elif compute_overall: - imgs = [] - labels = [] + imgs, labels = [], [] for hf_subset in self.metadata.eval_langs: imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) labels.extend(self.dataset[hf_subset][split][self.label_column_name]) @@ -88,7 +87,7 @@ def _calculate_metrics_from_split( img_widths, img_heights = [], [] for img in imgs: - width, height = img.size + width, height = img.size # type: ignore img_heights.append(height) img_widths.append(width) diff --git a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py index 6c07535605..bdcc084dd5 100644 --- a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py +++ b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py @@ -132,8 +132,7 @@ def _calculate_metrics_from_split( imgs = self.dataset[hf_subset][split][self.image_column_name] labels = self.dataset[hf_subset][split][self.label_column_name] elif compute_overall: - imgs = [] - labels = [] + imgs, labels = [], [] for hf_subset in self.metadata.eval_langs: imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) 
labels.extend(self.dataset[hf_subset][split][self.label_column_name]) diff --git a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py b/mteb/abstasks/Image/AbsTaskZeroShotClassification.py similarity index 85% rename from mteb/abstasks/Image/AbsTaskZeroshotClassification.py rename to mteb/abstasks/Image/AbsTaskZeroShotClassification.py index 7ef475217c..1dcbb9d1ec 100644 --- a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py +++ b/mteb/abstasks/Image/AbsTaskZeroShotClassification.py @@ -7,15 +7,15 @@ from datasets import Dataset from ...encoder_interface import Encoder -from ...evaluation.evaluators import ZeroshotClassificationEvaluator +from ...evaluation.evaluators import ZeroShotClassificationEvaluator from ..AbsTask import AbsTask, ScoresDict from ..TaskMetadata import DescriptiveStatistics logger = logging.getLogger(__name__) -class ZeroshotClassificationDescriptiveStatistics(DescriptiveStatistics): - """Descriptive statistics for ZeroshotClassification +class ZeroShotClassificationDescriptiveStatistics(DescriptiveStatistics): + """Descriptive statistics for ZeroShotClassification Attributes: num_samples: number of samples in the dataset. @@ -54,8 +54,8 @@ class ZeroshotClassificationDescriptiveStatistics(DescriptiveStatistics): max_label_text_length: int -class AbsTaskZeroshotClassification(AbsTask): - """Abstract class for ZeroshotClassification tasks +class AbsTaskZeroShotClassification(AbsTask): + """Abstract class for ZeroShotClassification tasks The similarity between an images and candidate text prompts, such as this is a dog/this is a cat. self.load_data() must generate a huggingface dataset with a split matching self.metadata.eval_splits, and assign it to self.dataset. It must contain the following columns: @@ -74,13 +74,12 @@ def _add_main_score(self, scores) -> None: def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ) -> ZeroshotClassificationDescriptiveStatistics: + ) -> ZeroShotClassificationDescriptiveStatistics: if hf_subset: imgs = self.dataset[hf_subset][split][self.image_column_name] labels = self.dataset[hf_subset][split][self.label_column_name] elif compute_overall: - imgs = [] - labels = [] + imgs, labels = [], [] for hf_subset in self.metadata.eval_langs: imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) labels.extend(self.dataset[hf_subset][split][self.label_column_name]) @@ -94,14 +93,13 @@ def _calculate_metrics_from_split( img_widths, img_heights = [], [] for img in imgs: - width, height = img.size + width, height = img.size # type: ignore img_heights.append(height) img_widths.append(width) - candidate_labels = self.get_candidate_labels() - candidate_labels_len = [len(c) for c in candidate_labels] + candidate_labels_len = [len(c) for c in self.get_candidate_labels()] - return ZeroshotClassificationDescriptiveStatistics( + return ZeroShotClassificationDescriptiveStatistics( num_samples=num_samples, unique_num_labels=unique_num_labels, min_image_width=min(img_widths), @@ -128,10 +126,9 @@ def _evaluate_subset( **kwargs, ) -> ScoresDict: candidate_labels = self.get_candidate_labels() - evaluator = ZeroshotClassificationEvaluator( + evaluator = ZeroShotClassificationEvaluator( dataset, self.image_column_name, - # dataset[self.image_column_name],#broken into dataset and self.image_column_name dataset[self.label_column_name], candidate_labels, task_name=self.metadata.name, diff --git a/mteb/abstasks/Image/__init__.py b/mteb/abstasks/Image/__init__.py index 
4440615af1..70c453bcef 100644 --- a/mteb/abstasks/Image/__init__.py +++ b/mteb/abstasks/Image/__init__.py @@ -8,10 +8,10 @@ from .AbsTaskImageMultilabelClassification import AbsTaskImageMultilabelClassification from .AbsTaskImageTextPairClassification import AbsTaskImageTextPairClassification from .AbsTaskVisualSTS import AbsTaskVisualSTS -from .AbsTaskZeroshotClassification import AbsTaskZeroshotClassification +from .AbsTaskZeroShotClassification import AbsTaskZeroShotClassification __all__ = [ - "AbsTaskZeroshotClassification", + "AbsTaskZeroShotClassification", "AbsTaskVisualSTS", "AbsTaskImageTextPairClassification", "AbsTaskImageMultilabelClassification", diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py index fb6930c70d..9b79e4d565 100644 --- a/mteb/abstasks/__init__.py +++ b/mteb/abstasks/__init__.py @@ -20,7 +20,7 @@ AbsTaskImageMultilabelClassification, AbsTaskImageTextPairClassification, AbsTaskVisualSTS, - AbsTaskZeroshotClassification, + AbsTaskZeroShotClassification, ) from .TaskMetadata import TaskMetadata @@ -45,5 +45,5 @@ "AbsTaskImageMultilabelClassification", "AbsTaskImageTextPairClassification", "AbsTaskVisualSTS", - "AbsTaskZeroshotClassification", + "AbsTaskZeroShotClassification", ] diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py index ad28869e62..895c48f288 100644 --- a/mteb/benchmarks/__init__.py +++ b/mteb/benchmarks/__init__.py @@ -3,6 +3,8 @@ from mteb.benchmarks.benchmark import Benchmark from mteb.benchmarks.benchmarks import ( BRIGHT, + C_MTEB, + FA_MTEB, LONG_EMBED, MTEB_DEU, MTEB_EN, @@ -51,6 +53,8 @@ "MTEB_EU", "LONG_EMBED", "BRIGHT", + "FA_MTEB", + "C_MTEB", "BENCHMARK_REGISTRY", "get_benchmarks", "get_benchmark", diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 2e03fd0d8b..ea0a9a652b 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -13,6 +13,18 @@ ] # Allows the type to be a string, but ensures that the string is a URL +MMTEB_CITATION = """ +@article{enevoldsen2025mmtebmassivemultilingualtext, + title={MMTEB: Massive Multilingual Text Embedding Benchmark}, + author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and 
Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2502.13595}, + year={2025}, + url={https://arxiv.org/abs/2502.13595}, + doi = {10.48550/arXiv.2502.13595}, +} +""" + MTEB_EN = Benchmark( name="MTEB(eng, v2)", tasks=MTEBTasks( @@ -78,7 +90,7 @@ The original MTEB leaderboard is available under the [MTEB(eng, v1)](http://mteb-leaderboard.hf.space/?benchmark_name=MTEB%28eng%2C+v1%29) tab. """, - citation="", + citation=MMTEB_CITATION, contacts=["KennethEnevoldsen", "Muennighoff"], ) @@ -643,7 +655,7 @@ ), description="A massive code embedding benchmark covering retrieval tasks in a miriad of popular programming languages.", reference=None, - citation=None, + citation=MMTEB_CITATION, ) MTEB_multilingual = Benchmark( @@ -787,7 +799,7 @@ ), description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages.", reference=None, - citation=None, + citation=MMTEB_CITATION, contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -902,7 +914,7 @@ ), description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.", reference=None, - citation=None, + citation=MMTEB_CITATION, contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -1034,7 +1046,7 @@ ), description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.", reference=None, - citation=None, + citation=MMTEB_CITATION, contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -1082,10 +1094,15 @@ }""", ) - BRIGHT_LONG = Benchmark( name="BRIGHT (long)", - tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"]), + tasks=MTEBTasks( + ( + get_task( + "BrightLongRetrieval", + ), + ) + ), description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. 
BRIGHT is the first text retrieval benchmark that requires intensive reasoning to retrieve relevant documents with diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py index 00da1cb1ba..6a12fe4fe1 100644 --- a/mteb/benchmarks/get_benchmark.py +++ b/mteb/benchmarks/get_benchmark.py @@ -5,7 +5,7 @@ import warnings import mteb.benchmarks.benchmarks as benchmark_module -from mteb.benchmarks.benchmarks import ( +from mteb.benchmarks import ( C_MTEB, FA_MTEB, MTEB_DEU, @@ -22,11 +22,12 @@ MTEB_RETRIEVAL_MEDICAL, MTEB_RETRIEVAL_WITH_INSTRUCTIONS, SEB, - Benchmark, MTEB_code, MTEB_multilingual, ) +from .benchmark import Benchmark + logger = logging.getLogger(__name__) BENCHMARK_REGISTRY = { diff --git a/mteb/create_dataloaders.py b/mteb/create_dataloaders.py index 53abe8a598..e4db35bddb 100644 --- a/mteb/create_dataloaders.py +++ b/mteb/create_dataloaders.py @@ -5,7 +5,6 @@ import torch from datasets import Dataset from torch.utils.data import DataLoader, default_collate -from torchvision import transforms from mteb.encoder_interface import BatchedInput, Conversation @@ -183,9 +182,6 @@ def create_dataloader_for_queries_conversation( return torch.utils.data.DataLoader(dataset, **dataloader_kwargs) -DEFAULT_TRANSFORM = transforms.Compose([transforms.PILToTensor()]) - - def transform_image_to_rgb( image: Any, transform: Callable[[Any], Any] | None = None ) -> Any: diff --git a/mteb/descriptive_stats/Clustering/ClusTREC-Covid.json b/mteb/descriptive_stats/Clustering/ClusTREC-Covid.json new file mode 100644 index 0000000000..194f0939b7 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/ClusTREC-Covid.json @@ -0,0 +1,493 @@ +{ + "test": { + "num_samples": 4568, + "number_of_characters": 2977845, + "min_text_length": 14, + "average_text_length": 651.8925131348511, + "max_text_length": 8364, + "min_labels_per_text": 6, + "average_labels_per_text": 1.0, + "max_labels_per_text": 100, + "unique_labels": 50, + "labels": { + "coronavirus origin": { + "count": 100 + }, + "coronavirus response to weather changes": { + "count": 100 + }, + "coronavirus immunity": { + "count": 78 + }, + "how do people die from the coronavirus": { + "count": 100 + }, + "animal models of COVID-19": { + "count": 100 + }, + "coronavirus test rapid testing": { + "count": 100 + }, + "serological tests for coronavirus": { + "count": 100 + }, + "coronavirus under reporting": { + "count": 100 + }, + "coronavirus in Canada": { + "count": 92 + }, + "coronavirus social distancing impact": { + "count": 100 + }, + "coronavirus hospital rationing": { + "count": 100 + }, + "coronavirus quarantine": { + "count": 100 + }, + "how does coronavirus spread": { + "count": 100 + }, + "coronavirus super spreaders": { + "count": 98 + }, + "coronavirus outside body": { + "count": 34 + }, + "how long does coronavirus survive on surfaces": { + "count": 74 + }, + "coronavirus clinical trials": { + "count": 100 + }, + "masks prevent coronavirus": { + "count": 100 + }, + "what alcohol sanitizer kills coronavirus": { + "count": 64 + }, + "coronavirus and ACE inhibitors": { + "count": 100 + }, + "coronavirus mortality": { + "count": 100 + }, + "coronavirus heart impacts": { + "count": 100 + }, + "coronavirus hypertension": { + "count": 74 + }, + "coronavirus diabetes": { + "count": 100 + }, + "coronavirus biomarkers": { + "count": 100 + }, + "coronavirus early symptoms": { + "count": 100 + }, + "coronavirus asymptomatic": { + "count": 100 + }, + "coronavirus hydroxychloroquine": { + "count": 100 + }, + "coronavirus drug repurposing": 
{ + "count": 100 + }, + "coronavirus remdesivir": { + "count": 100 + }, + "difference between coronavirus and flu": { + "count": 100 + }, + "coronavirus subtypes": { + "count": 6 + }, + "coronavirus vaccine candidates": { + "count": 36 + }, + "coronavirus recovery": { + "count": 100 + }, + "coronavirus public datasets": { + "count": 100 + }, + "SARS-CoV-2 spike structure": { + "count": 100 + }, + "SARS-CoV-2 phylogenetic analysis": { + "count": 100 + }, + "COVID inflammatory response": { + "count": 100 + }, + "COVID-19 cytokine storm": { + "count": 100 + }, + "coronavirus mutations": { + "count": 100 + }, + "COVID-19 in African-Americans": { + "count": 100 + }, + "Vitamin D and COVID-19": { + "count": 100 + }, + "violence during pandemic": { + "count": 100 + }, + "impact of masks on coronavirus transmission": { + "count": 100 + }, + "coronavirus mental health impact": { + "count": 100 + }, + "dexamethasone coronavirus": { + "count": 92 + }, + "COVID-19 outcomes in children": { + "count": 100 + }, + "school reopening coronavirus": { + "count": 100 + }, + "post-infection COVID-19 immunity": { + "count": 88 + }, + "mRNA vaccine coronavirus": { + "count": 32 + } + }, + "hf_subset_descriptive_stats": { + "title and abstract": { + "num_samples": 2284, + "number_of_characters": 2755462, + "min_text_length": 14, + "average_text_length": 1206.4194395796849, + "max_text_length": 8364, + "min_labels_per_text": 3, + "average_labels_per_text": 1.0, + "max_labels_per_text": 50, + "unique_labels": 50, + "labels": { + "coronavirus origin": { + "count": 50 + }, + "coronavirus response to weather changes": { + "count": 50 + }, + "coronavirus immunity": { + "count": 39 + }, + "how do people die from the coronavirus": { + "count": 50 + }, + "animal models of COVID-19": { + "count": 50 + }, + "coronavirus test rapid testing": { + "count": 50 + }, + "serological tests for coronavirus": { + "count": 50 + }, + "coronavirus under reporting": { + "count": 50 + }, + "coronavirus in Canada": { + "count": 46 + }, + "coronavirus social distancing impact": { + "count": 50 + }, + "coronavirus hospital rationing": { + "count": 50 + }, + "coronavirus quarantine": { + "count": 50 + }, + "how does coronavirus spread": { + "count": 50 + }, + "coronavirus super spreaders": { + "count": 49 + }, + "coronavirus outside body": { + "count": 17 + }, + "how long does coronavirus survive on surfaces": { + "count": 37 + }, + "coronavirus clinical trials": { + "count": 50 + }, + "masks prevent coronavirus": { + "count": 50 + }, + "what alcohol sanitizer kills coronavirus": { + "count": 32 + }, + "coronavirus and ACE inhibitors": { + "count": 50 + }, + "coronavirus mortality": { + "count": 50 + }, + "coronavirus heart impacts": { + "count": 50 + }, + "coronavirus hypertension": { + "count": 37 + }, + "coronavirus diabetes": { + "count": 50 + }, + "coronavirus biomarkers": { + "count": 50 + }, + "coronavirus early symptoms": { + "count": 50 + }, + "coronavirus asymptomatic": { + "count": 50 + }, + "coronavirus hydroxychloroquine": { + "count": 50 + }, + "coronavirus drug repurposing": { + "count": 50 + }, + "coronavirus remdesivir": { + "count": 50 + }, + "difference between coronavirus and flu": { + "count": 50 + }, + "coronavirus subtypes": { + "count": 3 + }, + "coronavirus vaccine candidates": { + "count": 18 + }, + "coronavirus recovery": { + "count": 50 + }, + "coronavirus public datasets": { + "count": 50 + }, + "SARS-CoV-2 spike structure": { + "count": 50 + }, + "SARS-CoV-2 phylogenetic analysis": { + "count": 50 + }, + "COVID 
inflammatory response": { + "count": 50 + }, + "COVID-19 cytokine storm": { + "count": 50 + }, + "coronavirus mutations": { + "count": 50 + }, + "COVID-19 in African-Americans": { + "count": 50 + }, + "Vitamin D and COVID-19": { + "count": 50 + }, + "violence during pandemic": { + "count": 50 + }, + "impact of masks on coronavirus transmission": { + "count": 50 + }, + "coronavirus mental health impact": { + "count": 50 + }, + "dexamethasone coronavirus": { + "count": 46 + }, + "COVID-19 outcomes in children": { + "count": 50 + }, + "school reopening coronavirus": { + "count": 50 + }, + "post-infection COVID-19 immunity": { + "count": 44 + }, + "mRNA vaccine coronavirus": { + "count": 16 + } + } + }, + "title": { + "num_samples": 2284, + "number_of_characters": 222383, + "min_text_length": 14, + "average_text_length": 97.36558669001751, + "max_text_length": 348, + "min_labels_per_text": 3, + "average_labels_per_text": 1.0, + "max_labels_per_text": 50, + "unique_labels": 50, + "labels": { + "coronavirus origin": { + "count": 50 + }, + "coronavirus response to weather changes": { + "count": 50 + }, + "coronavirus immunity": { + "count": 39 + }, + "how do people die from the coronavirus": { + "count": 50 + }, + "animal models of COVID-19": { + "count": 50 + }, + "coronavirus test rapid testing": { + "count": 50 + }, + "serological tests for coronavirus": { + "count": 50 + }, + "coronavirus under reporting": { + "count": 50 + }, + "coronavirus in Canada": { + "count": 46 + }, + "coronavirus social distancing impact": { + "count": 50 + }, + "coronavirus hospital rationing": { + "count": 50 + }, + "coronavirus quarantine": { + "count": 50 + }, + "how does coronavirus spread": { + "count": 50 + }, + "coronavirus super spreaders": { + "count": 49 + }, + "coronavirus outside body": { + "count": 17 + }, + "how long does coronavirus survive on surfaces": { + "count": 37 + }, + "coronavirus clinical trials": { + "count": 50 + }, + "masks prevent coronavirus": { + "count": 50 + }, + "what alcohol sanitizer kills coronavirus": { + "count": 32 + }, + "coronavirus and ACE inhibitors": { + "count": 50 + }, + "coronavirus mortality": { + "count": 50 + }, + "coronavirus heart impacts": { + "count": 50 + }, + "coronavirus hypertension": { + "count": 37 + }, + "coronavirus diabetes": { + "count": 50 + }, + "coronavirus biomarkers": { + "count": 50 + }, + "coronavirus early symptoms": { + "count": 50 + }, + "coronavirus asymptomatic": { + "count": 50 + }, + "coronavirus hydroxychloroquine": { + "count": 50 + }, + "coronavirus drug repurposing": { + "count": 50 + }, + "coronavirus remdesivir": { + "count": 50 + }, + "difference between coronavirus and flu": { + "count": 50 + }, + "coronavirus subtypes": { + "count": 3 + }, + "coronavirus vaccine candidates": { + "count": 18 + }, + "coronavirus recovery": { + "count": 50 + }, + "coronavirus public datasets": { + "count": 50 + }, + "SARS-CoV-2 spike structure": { + "count": 50 + }, + "SARS-CoV-2 phylogenetic analysis": { + "count": 50 + }, + "COVID inflammatory response": { + "count": 50 + }, + "COVID-19 cytokine storm": { + "count": 50 + }, + "coronavirus mutations": { + "count": 50 + }, + "COVID-19 in African-Americans": { + "count": 50 + }, + "Vitamin D and COVID-19": { + "count": 50 + }, + "violence during pandemic": { + "count": 50 + }, + "impact of masks on coronavirus transmission": { + "count": 50 + }, + "coronavirus mental health impact": { + "count": 50 + }, + "dexamethasone coronavirus": { + "count": 46 + }, + "COVID-19 outcomes in children": { + 
"count": 50 + }, + "school reopening coronavirus": { + "count": 50 + }, + "post-infection COVID-19 immunity": { + "count": 44 + }, + "mRNA vaccine coronavirus": { + "count": 16 + } + } + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/KlueMrcDomainClustering.json b/mteb/descriptive_stats/Clustering/KlueMrcDomainClustering.json new file mode 100644 index 0000000000..669cd2687b --- /dev/null +++ b/mteb/descriptive_stats/Clustering/KlueMrcDomainClustering.json @@ -0,0 +1,34 @@ +{ + "test": { + "num_samples": 1, + "number_of_characters": 794, + "min_text_length": 794, + "average_text_length": 794.0, + "max_text_length": 794, + "unique_texts": 794, + "min_labels_per_text": 12, + "average_labels_per_text": 794.0, + "max_labels_per_text": 251, + "unique_labels": 6, + "labels": { + "4": { + "count": 82 + }, + "3": { + "count": 231 + }, + "2": { + "count": 251 + }, + "0": { + "count": 120 + }, + "5": { + "count": 98 + }, + "1": { + "count": 12 + } + } + } +} diff --git a/mteb/descriptive_stats/Clustering/KlueYnatMrcCategoryClustering.json b/mteb/descriptive_stats/Clustering/KlueYnatMrcCategoryClustering.json new file mode 100644 index 0000000000..01c6e676af --- /dev/null +++ b/mteb/descriptive_stats/Clustering/KlueYnatMrcCategoryClustering.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 1, + "number_of_characters": 904, + "min_text_length": 904, + "average_text_length": 904.0, + "max_text_length": 904, + "unique_texts": 904, + "min_labels_per_text": 99, + "average_labels_per_text": 904.0, + "max_labels_per_text": 240, + "unique_labels": 5, + "labels": { + "3": { + "count": 173 + }, + "2": { + "count": 164 + }, + "1": { + "count": 99 + }, + "0": { + "count": 240 + }, + "5": { + "count": 228 + } + } + } +} diff --git a/mteb/descriptive_stats/Retrieval/BrightLongRetrieval.json b/mteb/descriptive_stats/Retrieval/BrightLongRetrieval.json new file mode 100644 index 0000000000..8427ffe9ea --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/BrightLongRetrieval.json @@ -0,0 +1,265 @@ +{ + "long": { + "num_samples": 6511, + "number_of_characters": 327567114, + "num_documents": 5650, + "min_document_length": 25, + "average_document_length": 57844.780530973454, + "max_document_length": 9182740, + "unique_documents": 5650, + "num_queries": 861, + "min_query_length": 83, + "average_query_length": 864.2322880371661, + "max_query_length": 19341, + "unique_queries": 861, + "none_queries": 0, + "num_relevant_docs": 1679, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.9500580720092915, + "max_relevant_docs_per_query": 12, + "unique_relevant_docs": 920, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "biology": { + "num_samples": 627, + "number_of_characters": 19398768, + "num_documents": 524, + "min_document_length": 142, + "average_document_length": 36917.73854961832, + "max_document_length": 1324203, + "unique_documents": 524, + "num_queries": 103, + "min_query_length": 89, + "average_query_length": 523.0388349514564, + "max_query_length": 2195, + "unique_queries": 103, + "none_queries": 0, + "num_relevant_docs": 134, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.3009708737864079, + "max_relevant_docs_per_query": 4, + 
"unique_relevant_docs": 134, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "earth_science": { + "num_samples": 717, + "number_of_characters": 41705609, + "num_documents": 601, + "min_document_length": 33, + "average_document_length": 69301.6622296173, + "max_document_length": 2627263, + "unique_documents": 601, + "num_queries": 116, + "min_query_length": 83, + "average_query_length": 476.8103448275862, + "max_query_length": 1565, + "unique_queries": 116, + "none_queries": 0, + "num_relevant_docs": 187, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.6120689655172413, + "max_relevant_docs_per_query": 4, + "unique_relevant_docs": 187, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "economics": { + "num_samples": 619, + "number_of_characters": 19994187, + "num_documents": 516, + "min_document_length": 45, + "average_document_length": 38600.78488372093, + "max_document_length": 429509, + "unique_documents": 516, + "num_queries": 103, + "min_query_length": 164, + "average_query_length": 739.6310679611651, + "max_query_length": 2223, + "unique_queries": 103, + "none_queries": 0, + "num_relevant_docs": 109, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.058252427184466, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 109, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "psychology": { + "num_samples": 613, + "number_of_characters": 20490305, + "num_documents": 512, + "min_document_length": 25, + "average_document_length": 39883.3828125, + "max_document_length": 669577, + "unique_documents": 512, + "num_queries": 101, + "min_query_length": 166, + "average_query_length": 693.1980198019802, + "max_query_length": 2334, + "unique_queries": 101, + "none_queries": 0, + "num_relevant_docs": 116, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1485148514851484, + "max_relevant_docs_per_query": 5, + "unique_relevant_docs": 113, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "robotics": { + "num_samples": 609, + "number_of_characters": 18387998, + "num_documents": 508, + "min_document_length": 120, + "average_document_length": 35763.509842519685, + "max_document_length": 3589950, + "unique_documents": 508, + "num_queries": 101, + "min_query_length": 165, + "average_query_length": 2179.5544554455446, + "max_query_length": 19341, + "unique_queries": 101, + "none_queries": 0, + "num_relevant_docs": 106, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 
1.0495049504950495, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 106, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "stackoverflow": { + "num_samples": 1975, + "number_of_characters": 184328188, + "num_documents": 1858, + "min_document_length": 43, + "average_document_length": 99126.43110871906, + "max_document_length": 9182740, + "unique_documents": 1858, + "num_queries": 117, + "min_query_length": 185, + "average_query_length": 1292.982905982906, + "max_query_length": 12432, + "unique_queries": 117, + "none_queries": 0, + "num_relevant_docs": 129, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1025641025641026, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 125, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "sustainable_living": { + "num_samples": 662, + "number_of_characters": 21155433, + "num_documents": 554, + "min_document_length": 32, + "average_document_length": 38053.584837545124, + "max_document_length": 5732347, + "unique_documents": 554, + "num_queries": 108, + "min_query_length": 158, + "average_query_length": 682.8425925925926, + "max_query_length": 2843, + "unique_queries": 108, + "none_queries": 0, + "num_relevant_docs": 129, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.1944444444444444, + "max_relevant_docs_per_query": 5, + "unique_relevant_docs": 129, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "pony": { + "num_samples": 689, + "number_of_characters": 2106626, + "num_documents": 577, + "min_document_length": 54, + "average_document_length": 3575.4956672443673, + "max_document_length": 108909, + "unique_documents": 577, + "num_queries": 112, + "min_query_length": 182, + "average_query_length": 388.9732142857143, + "max_query_length": 946, + "unique_queries": 112, + "none_queries": 0, + "num_relevant_docs": 769, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 6.866071428571429, + "max_relevant_docs_per_query": 12, + "unique_relevant_docs": 17, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + } +} diff --git a/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py b/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py index b91a5fb1bb..5a9234ec80 100644 --- a/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py +++ b/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py @@ -74,7 +74,7 @@ def search( qrels: Dataset, top_k: int, score_function: str, - task_name: str, + task_name: str | None 
= None, return_sorted: bool = False, **kwargs, ) -> dict[str, dict[str, float]]: diff --git a/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py b/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py index 8cf4327130..d84c239c6c 100644 --- a/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py +++ b/mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py @@ -7,7 +7,6 @@ import torch from sklearn.metrics import accuracy_score from sklearn.metrics.pairwise import cosine_similarity -from torchvision import transforms from tqdm import tqdm from mteb.create_dataloaders import ( @@ -19,8 +18,6 @@ logger = logging.getLogger(__name__) -transform = transforms.Compose([transforms.PILToTensor()]) - class Any2TextMultipleChoiceEvaluator(Evaluator): """Evaluate a model based on the similarity of queries (can be interleaved) and candidate answers. @@ -42,7 +39,6 @@ def __init__( label_column_name: str, choices_column_name: str, task_name: str | None = None, - transform=None, limit: int | None = None, **kwargs, ): @@ -55,7 +51,6 @@ def __init__( self.label_column_name = label_column_name self.choices_column_name = choices_column_name self.task_name = task_name - self.transform = transform def __call__( self, diff --git a/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py b/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py index c59520c1cd..6b92664275 100644 --- a/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py +++ b/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py @@ -7,18 +7,16 @@ import torch.nn.functional as F from datasets import Dataset from torch.utils.data import DataLoader -from torchvision import transforms from mteb.create_dataloaders import ( transform_image_to_rgb, ) from mteb.encoder_interface import Encoder, EncoderWithSimilarity from mteb.evaluation.evaluators.Evaluator import Evaluator +from mteb.requires_package import requires_image_dependencies logger = logging.getLogger(__name__) -transform = transforms.Compose([transforms.PILToTensor()]) - class CustomImageDataset(torch.utils.data.Dataset): def __init__( @@ -64,10 +62,14 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + requires_image_dependencies() + from torchvision import transforms + self.dataset = dataset self.images_column_names = images_column_names self.texts_column_names = texts_column_names self.task_name = task_name + self.transform = transforms.Compose([transforms.PILToTensor()]) def __call__( self, @@ -96,7 +98,7 @@ def __call__( else: images = self.dataset[self.images_column_names] - images = [transform(transform_image_to_rgb(img)) for img in images] + images = [self.transform(transform_image_to_rgb(img)) for img in images] texts = [] if isinstance(self.texts_column_names, list): diff --git a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py index 04b0792be5..b9819394b0 100644 --- a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py +++ b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py @@ -10,7 +10,6 @@ paired_euclidean_distances, paired_manhattan_distances, ) -from torchvision import transforms from mteb.create_dataloaders import create_image_dataloader @@ -18,8 +17,6 @@ logger = logging.getLogger(__name__) -transform = transforms.Compose([transforms.PILToTensor()]) - class VisualSTSEvaluator(Evaluator): def __init__( diff --git 
a/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py b/mteb/evaluation/evaluators/Image/ZeroShotClassificationEvaluator.py similarity index 97% rename from mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py rename to mteb/evaluation/evaluators/Image/ZeroShotClassificationEvaluator.py index 65643a4b91..678532cc1a 100644 --- a/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py +++ b/mteb/evaluation/evaluators/Image/ZeroShotClassificationEvaluator.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -class ZeroshotClassificationEvaluator(Evaluator): +class ZeroShotClassificationEvaluator(Evaluator): def __init__( self, dataset: Dataset, diff --git a/mteb/evaluation/evaluators/Image/__init__.py b/mteb/evaluation/evaluators/Image/__init__.py index b49b7c5226..2405efa0a1 100644 --- a/mteb/evaluation/evaluators/Image/__init__.py +++ b/mteb/evaluation/evaluators/Image/__init__.py @@ -11,7 +11,7 @@ from .ClusteringEvaluator import ImageClusteringEvaluator from .ImageTextPairClassificationEvaluator import ImageTextPairClassificationEvaluator from .VisualSTSEvaluator import VisualSTSEvaluator -from .ZeroshotClassificationEvaluator import ZeroshotClassificationEvaluator +from .ZeroShotClassificationEvaluator import ZeroShotClassificationEvaluator __all__ = [ "Any2AnyMultiChoiceEvaluator", @@ -23,5 +23,5 @@ "ImageClusteringEvaluator", "ImageTextPairClassificationEvaluator", "VisualSTSEvaluator", - "ZeroshotClassificationEvaluator", + "ZeroShotClassificationEvaluator", ] diff --git a/mteb/evaluation/evaluators/__init__.py b/mteb/evaluation/evaluators/__init__.py index c92648b5b2..4405751cbd 100644 --- a/mteb/evaluation/evaluators/__init__.py +++ b/mteb/evaluation/evaluators/__init__.py @@ -18,7 +18,7 @@ ImagelogRegClassificationEvaluator, ImageTextPairClassificationEvaluator, VisualSTSEvaluator, - ZeroshotClassificationEvaluator, + ZeroShotClassificationEvaluator, ) from .model_classes import DenseRetrievalExactSearch from .PairClassificationEvaluator import PairClassificationEvaluator @@ -52,5 +52,5 @@ "ImageClusteringEvaluator", "ImageTextPairClassificationEvaluator", "VisualSTSEvaluator", - "ZeroshotClassificationEvaluator", + "ZeroShotClassificationEvaluator", ] diff --git a/mteb/models/Arabic_Triplet_Matryoshka_V2.py b/mteb/models/Arabic_Triplet_Matryoshka_V2.py deleted file mode 100644 index 1ade23ff32..0000000000 --- a/mteb/models/Arabic_Triplet_Matryoshka_V2.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import annotations - -from mteb.model_meta import ModelMeta - -arabic_triplet_matryoshka = ModelMeta( - name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2", - languages=["ara-Arab"], - open_weights=True, - revision="ed357f222f0b6ea6670d2c9b5a1cb93950d34200", - release_date="2024-07-28", - n_parameters=135_000_000, - memory_usage_mb=516, - embed_dim=768, - license="apache-2.0", - max_tokens=768, - reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - public_training_code=None, - adapted_from="aubmindlab/bert-base-arabertv02", - public_training_data="akhooli/arabic-triplets-1m-curated-sims-len", - training_datasets={ - # "akhooli/arabic-triplets-1m-curated-sims-len" - }, -) diff --git a/mteb/models/b1ade_models.py b/mteb/models/b1ade_models.py index 9672747bba..317368f0e7 100644 --- a/mteb/models/b1ade_models.py +++ b/mteb/models/b1ade_models.py @@ -11,7 +11,7 @@ 
b1ade_embed = ModelMeta( loader=sentence_transformers_loader, - name="b1ade-embed", + name="w601sxs/b1ade-embed", languages=["eng-Latn"], revision="3bdac13927fdc888b903db93b2ffdbd90b295a69", open_weights=True, diff --git a/mteb/models/cohere_v.py b/mteb/models/cohere_v.py index c4187e1dc0..023d2d59dc 100644 --- a/mteb/models/cohere_v.py +++ b/mteb/models/cohere_v.py @@ -9,14 +9,11 @@ import numpy as np import torch from torch.utils.data import DataLoader -from torchvision import transforms from tqdm import tqdm from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta, ScoringFunction - -api_key = os.getenv("COHERE_API_KEY") -tensor_to_image = transforms.Compose([transforms.ToPILImage()]) +from mteb.requires_package import requires_image_dependencies def cohere_v_loader(**kwargs): @@ -37,9 +34,14 @@ def __init__( Cohere currently supports 40 images/min, thus time.sleep(1.5) is applied after each image. Remove or adjust this after Cohere API changes capacity. """ + requires_image_dependencies() + from torchvision import transforms + self.model_name = model_name + api_key = os.getenv("COHERE_API_KEY") self.client = cohere.ClientV2(api_key) self.image_format = "JPEG" + self.transform = transforms.Compose([transforms.PILToTensor()]) def get_text_embeddings( self, @@ -76,7 +78,7 @@ def get_image_embeddings( for image in batch: # cohere only supports 1 image per call buffered = io.BytesIO() - image = tensor_to_image(image) + image = self.transform(image) image.save(buffered, format=self.image_format) image_bytes = buffered.getvalue() stringified_buffer = base64.b64encode(image_bytes).decode("utf-8") diff --git a/mteb/models/evaclip_models.py b/mteb/models/evaclip_models.py index a69d26495d..f907f9f26e 100644 --- a/mteb/models/evaclip_models.py +++ b/mteb/models/evaclip_models.py @@ -9,6 +9,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta +from mteb.requires_package import requires_image_dependencies def evaclip_loader(**kwargs): @@ -35,6 +36,8 @@ def __init__( device: str = "cuda" if torch.cuda.is_available() else "cpu", **kwargs: Any, ): + requires_image_dependencies() + self.model_name = model_name self.device = device pretrained = "eva_clip" # or "/path/to/EVA02_CLIP_B_psz16_s8B.pt" diff --git a/mteb/models/jina_clip.py b/mteb/models/jina_clip.py index 75c0a1d77a..8d2b49a8b0 100644 --- a/mteb/models/jina_clip.py +++ b/mteb/models/jina_clip.py @@ -11,6 +11,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_image_dependencies class JinaCLIPModelWrapper(Wrapper): @@ -20,6 +21,8 @@ def __init__( device: str = "cuda" if torch.cuda.is_available() else "cpu", **kwargs: Any, ): + requires_image_dependencies() + self.model_name = model_name self.device = device self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to( diff --git a/mteb/models/llm2clip_models.py b/mteb/models/llm2clip_models.py index 4dbaaea04c..2a83f3ff85 100644 --- a/mteb/models/llm2clip_models.py +++ b/mteb/models/llm2clip_models.py @@ -12,6 +12,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_image_dependencies MODEL2PROCESSOR = { "microsoft/LLM2CLIP-Openai-L-14-336": "openai/clip-vit-large-patch14-336", @@ -36,6 +37,8 @@ def __init__( device: str = 
"cuda" if torch.cuda.is_available() else "cpu", **kwargs: Any, ): + requires_image_dependencies() + if model_name not in MODEL2PROCESSOR: raise Exception( f"This model {model_name} is not in the supported mode list: {list(MODEL2PROCESSOR.keys())}." diff --git a/mteb/models/moco_models.py b/mteb/models/moco_models.py index 1c8680a795..3036f76b5c 100644 --- a/mteb/models/moco_models.py +++ b/mteb/models/moco_models.py @@ -10,6 +10,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_image_dependencies def mocov3_loader(**kwargs): @@ -29,6 +30,8 @@ def __init__( device: str = "cuda" if torch.cuda.is_available() else "cpu", **kwargs: Any, ): + requires_image_dependencies() + self.model_name = model_name self.device = device name = "vit_base_patch16_224" @@ -97,7 +100,6 @@ def encode( prompt_type: PromptType | None = None, **kwargs: Any, ) -> np.ndarray | torch.Tensor: - 0 / 0 if "text" in inputs.dataset.features: raise ValueError( "MOCO models only support image encoding. Text encoding is not supported." diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 94312fb31d..aeccb8692f 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -27,10 +27,20 @@ def __init__( """Wrapper for OpenAIs embedding API. To handle documents larger than 8191 tokens, we truncate the document to the specified sequence length. """ - requires_package(self, "openai", "Openai text embedding") + requires_package( + self, + "openai", + "Openai text embedding", + install_instruction="pip install mteb[openai]", + ) from openai import OpenAI - requires_package(self, "tiktoken", "Tiktoken package") + requires_package( + self, + "tiktoken", + "Tiktoken package", + install_instruction="pip install mteb[openai]", + ) import tiktoken self._client = OpenAI() diff --git a/mteb/models/openclip_models.py b/mteb/models/openclip_models.py index 94214254cd..c2ee5bb2e6 100644 --- a/mteb/models/openclip_models.py +++ b/mteb/models/openclip_models.py @@ -10,6 +10,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_image_dependencies def openclip_loader(**kwargs): @@ -25,6 +26,8 @@ def __init__( device: str = "cuda" if torch.cuda.is_available() else "cpu", **kwargs: Any, ): + requires_image_dependencies() + self.model_name = model_name self.device = device self.model, _, self.img_preprocess = open_clip.create_model_and_transforms( diff --git a/mteb/models/vista_models.py b/mteb/models/vista_models.py index 72159ace8e..e6de5cfd8f 100644 --- a/mteb/models/vista_models.py +++ b/mteb/models/vista_models.py @@ -5,14 +5,11 @@ import numpy as np import torch from torch.utils.data import DataLoader -from torchvision import transforms from tqdm import tqdm from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta - -tensor_to_image = transforms.Compose([transforms.ToPILImage()]) -pil_to_tensor = transforms.Compose([transforms.PILToTensor()]) +from mteb.requires_package import requires_image_dependencies def vista_loader(**kwargs): @@ -24,18 +21,47 @@ def vista_loader(**kwargs): ) class VisualizedBGEWrapper(Visualized_BGE): + """Setting up VISTA + + ``` + git clone https://github.com/FlagOpen/FlagEmbedding.git + cd FlagEmbedding/research/visual_bge + pip install -e . 
+ pip install torchvision timm einops ftfy + ``` + back to the root folder of mteb; download the vision tower for bge-base + ``` + cd .. + wget https://huggingface.co/BAAI/bge-visualized/resolve/main/Visualized_base_en_v1.5.pth?download=true + ``` + rename it to `visualized_base_en_V1.5.pth` + ``` + mv Visualized_base_en_v1.5.pth?download=true visualized_base_en_V1.5.pth + ``` + download the vision tower for bge-m3 + ``` + wget https://huggingface.co/BAAI/bge-visualized/resolve/main/Visualized_m3.pth?download=true + ``` + rename it to `visualized_m3.pth` + ``` + mv Visualized_m3.pth?download=true visualized_m3.pth + ``` + """ + def __init__( self, - model_name_bge: str = None, + model_name_bge: str | None = None, model_weight=None, normlized: bool = True, sentence_pooling_method: str = "cls", negatives_cross_device: bool = False, temperature: float = 0.02, from_pretrained=None, - image_tokens_num: int = None, + image_tokens_num: int | None = None, **kwargs: Any, ): + requires_image_dependencies() + super().__init__( model_name_bge=model_name_bge, model_weight=model_weight, diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index e15b093d01..7a0070edf0 100644 --- a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -11,6 +11,7 @@ from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta +from mteb.requires_package import requires_image_dependencies logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) @@ -27,6 +28,7 @@ def __init__( device: str = "cuda" if torch.cuda.is_available() else "cpu", **kwargs, ): + requires_image_dependencies() try: import flash_attn # noqa from peft import LoraConfig, PeftModel # noqa diff --git a/mteb/models/voyage_v.py b/mteb/models/voyage_v.py index 3d83031d5f..c270d5d363 100644 --- a/mteb/models/voyage_v.py +++ b/mteb/models/voyage_v.py @@ -1,21 +1,17 @@ from __future__ import annotations import logging -import os from typing import Any, Literal import numpy as np import torch from PIL import Image from torch.utils.data import DataLoader -from torchvision import transforms from tqdm import tqdm from mteb.encoder_interface import BatchedInput, PromptType from mteb.model_meta import ModelMeta, ScoringFunction - -api_key = os.getenv("VOYAGE_API_KEY") -tensor_to_image = transforms.Compose([transforms.ToPILImage()]) +from mteb.requires_package import requires_image_dependencies def downsample_image( @@ -37,15 +33,15 @@ def downsample_image( logging.info( f"Downsampling image from {width}x{height} to {new_width}x{new_height}" ) - return image.resize(new_size, Image.LANCZOS) + return image.resize(new_size, Image.LANCZOS) # type: ignore if width > height: if width > 10000: logging.error("Processing extremely wide images.") - return image.resize((10000, height), Image.LANCZOS) + return image.resize((10000, height), Image.LANCZOS) # type: ignore else: if height > 10000: logging.error("Processing extremely high images.") - return image.resize((width, 10000), Image.LANCZOS) + return image.resize((width, 10000), Image.LANCZOS) # type: ignore return image @@ -67,8 +63,12 @@ def __init__( model_name: str, **kwargs: Any, ): + requires_image_dependencies() + from torchvision import transforms + self.model_name = model_name.split("/")[-1] self.vo = voyageai.Client() + self.tensor_to_image = transforms.Compose([transforms.PILToTensor()]) @retry( stop=stop_after_attempt(6), # Stop after 6 attempts @@ -126,7 +126,7 @@ def get_image_embeddings( images, disable=not 
show_progress_bar, desc="Image Encoding" ): batch_images = [ - [downsample_image(tensor_to_image(image))] + [downsample_image(self.tensor_to_image(image))] for image in batch["image"] ] embeddings = self._multimodal_embed( @@ -171,7 +171,7 @@ def encode( inputs, disable=not show_progress_bar, desc="Interleaved Encoding" ): batch_images = [ - downsample_image(tensor_to_image(image)) + downsample_image(self.tensor_to_image(image)) for image in batch["image"] ] batch_texts = batch["text"] diff --git a/mteb/requires_package.py b/mteb/requires_package.py index a91c2ba093..d261acdffb 100644 --- a/mteb/requires_package.py +++ b/mteb/requires_package.py @@ -8,10 +8,25 @@ def _is_package_available(pkg_name: str) -> bool: return package_exists -def requires_package(obj, package_name: str, model_name: str) -> None: +def requires_package( + obj, package_name: str, model_name: str, install_instruction: str | None = None +) -> None: if not _is_package_available(package_name): + install_instruction = ( + f"pip install {package_name}" + if install_instruction is None + else install_instruction + ) name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ raise ImportError( f"{name} requires the `{package_name}` library but it was not found in your environment. " - + f"If you want to load {model_name} models, please `pip install {package_name}` else they will not be available." + + f"If you want to load {model_name} models, please `{install_instruction}` to install the package." + ) + + +def requires_image_dependencies() -> None: + if not _is_package_available("torchvision"): + raise ImportError( + "You are trying to run the image subset of mteb without having installed the required dependencies (`torchvision`). " + + "You can install them using `pip install 'mteb[image]'`."
) diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index 20d29ca49e..01ab24beee 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -30,8 +30,6 @@ from .vie import VieMedEVBitextMining __all__ = [ - "WebFAQBitextMiningQAs", - "WebFAQBitextMiningQuestions", "TbilisiCityHallBitextMining", "VieMedEVBitextMining", "BornholmBitextMining", @@ -49,6 +47,8 @@ "NTREXBitextMining", "IndicGenBenchFloresBitextMining", "NollySentiBitextMining", + "WebFAQBitextMiningQAs", + "WebFAQBitextMiningQuestions", "BUCCBitextMiningFast", "PhincBitextMining", "TatoebaBitextMining", diff --git a/mteb/tasks/BitextMining/multilingual/__init__.py b/mteb/tasks/BitextMining/multilingual/__init__.py index 06838caae7..cb48017ff3 100644 --- a/mteb/tasks/BitextMining/multilingual/__init__.py +++ b/mteb/tasks/BitextMining/multilingual/__init__.py @@ -22,8 +22,6 @@ __all__ = [ "IN22ConvBitextMining", - "WebFAQBitextMiningQuestions", - "WebFAQBitextMiningQAs", "IN22GenBitextMining", "BUCCBitextMining", "LinceMTBitextMining", @@ -32,6 +30,8 @@ "NTREXBitextMining", "IndicGenBenchFloresBitextMining", "NollySentiBitextMining", + "WebFAQBitextMiningQAs", + "WebFAQBitextMiningQuestions", "BUCCBitextMiningFast", "PhincBitextMining", "TatoebaBitextMining", diff --git a/mteb/tasks/Classification/eng/AmazonPolarityClassification.py b/mteb/tasks/Classification/eng/AmazonPolarityClassification.py index 47b68b5e0d..7ed07bddbc 100644 --- a/mteb/tasks/Classification/eng/AmazonPolarityClassification.py +++ b/mteb/tasks/Classification/eng/AmazonPolarityClassification.py @@ -25,8 +25,8 @@ class AmazonPolarityClassification(AbsTaskClassification): ), # Estimated range for the collection of reviews domains=["Reviews", "Written"], task_subtypes=["Sentiment/Hate speech"], - license="not specified", annotations_creators="derived", + license="apache-2.0", dialect=[], sample_creation="found", bibtex_citation="""@article{McAuley2013HiddenFA, diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index d5c2ef042b..ae99b5b2ab 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -58,6 +58,7 @@ HALClusteringS2SFast, ) from .jpn import LivedoorNewsClustering, LivedoorNewsClusteringv2, MewsC16JaClustering +from .kor import KlueMrcDomainClustering, KlueYnatMrcCategoryClustering from .multilingual import ( IndicReviewsClusteringP2P, MasakhaNEWSClusteringP2P, @@ -177,6 +178,8 @@ "LivedoorNewsClustering", "LivedoorNewsClusteringv2", "MewsC16JaClustering", + "KlueYnatMrcCategoryClustering", + "KlueMrcDomainClustering", "WikiClusteringFastP2P", "WikiClusteringP2P", "MLSUMClusteringS2S", diff --git a/mteb/tasks/Clustering/eng/ClusTrecCovid.py b/mteb/tasks/Clustering/eng/ClusTrecCovid.py new file mode 100644 index 0000000000..51fb455cd6 --- /dev/null +++ b/mteb/tasks/Clustering/eng/ClusTrecCovid.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClusteringFast import ( + AbsTaskClusteringFast, +) +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ClusTrecCovid(AbsTaskClusteringFast, MultilingualTask): + metadata = TaskMetadata( + name="ClusTREC-Covid", + description="A Topical Clustering Benchmark for COVID-19 Scientific Research across 50 covid-19 related topics.", + reference="https://github.com/katzurik/Knowledge_Navigator/tree/main/Benchmarks/CLUSTREC%20COVID", + dataset={ + "path": "Uri-ka/ClusTREC-Covid", + 
"revision": "7f3489153b8dad7336a54f63202deb1414c33309", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs={"title and abstract": ["eng-Latn"], "title": ["eng-Latn"]}, + main_score="v_measure", + date=("2020-04-10", "2020-07-16"), + domains=["Academic", "Medical", "Written"], + task_subtypes=["Thematic clustering"], + license="cc-by-sa-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="created", + bibtex_citation="""@inproceedings{katz-etal-2024-knowledge, + title = "Knowledge Navigator: {LLM}-guided Browsing Framework for Exploratory Search in Scientific Literature", + author = "Katz, Uri and + Levy, Mosh and + Goldberg, Yoav", + booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024", + month = nov, + year = "2024", + address = "Miami, Florida, USA", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.findings-emnlp.516", + pages = "8838--8855", + } + """, + prompt="Identify the main category of the covid-19 papers based on the titles and abstracts", + ) diff --git a/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py b/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py index 0b94e671c6..def8cd0c91 100644 --- a/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py +++ b/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py @@ -12,7 +12,7 @@ class KlueMrcDomainClustering(AbsTaskClustering): description="this dataset is a processed and redistributed version of the KLUE-MRC dataset. Domain: Game / Media / Automotive / Finance / Real Estate / Education ", reference="https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_context_domain", type="Clustering", - category="p2p", + category="t2c", modalities=["text"], eval_splits=["test"], eval_langs=["kor-Hang"], @@ -22,13 +22,12 @@ class KlueMrcDomainClustering(AbsTaskClustering): "revision": "a814b5ef0b6814991785f2c31af8e38ef7bb3f0d", }, date=("2016-01-01", "2020-12-31"), - form="Written", domains=["News", "Written"], task_subtypes=[], license="cc-by-sa-4.0", annotations_creators="human-annotated", dialect=[], - text_creation="found", + sample_creation="found", bibtex_citation="""@misc{park2021klue, title={KLUE: Korean Language Understanding Evaluation}, author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, diff --git a/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py b/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py index 86fc961f04..0d2d1ee9f7 100644 --- a/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py +++ b/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py @@ -12,7 +12,7 @@ class KlueYnatMrcCategoryClustering(AbsTaskClustering): description="this dataset is a processed and redistributed version of the KLUE-Ynat & KLUE-MRC dataset. 
News_category: IT/Science, Sports, Media/Culture, Ecomomy/Finance, Real Estate ", reference="https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_ynat_title", type="Clustering", - category="p2p", + category="t2t", modalities=["text"], eval_splits=["test"], eval_langs=["kor-Hang"], @@ -22,13 +22,12 @@ class KlueYnatMrcCategoryClustering(AbsTaskClustering): "revision": "5bbded98f39e3bf6e81e15aa79c6616008519e29", }, date=("2016-01-01", "2020-12-31"), - form="Written", domains=["News", "Written"], task_subtypes=[], license="cc-by-sa-4.0", annotations_creators="human-annotated", dialect=[], - text_creation="found", + sample_creation="found", bibtex_citation="""@misc{park2021klue, title={KLUE: Korean Language Understanding Evaluation}, author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, diff --git a/mteb/tasks/Clustering/kor/__init__.py b/mteb/tasks/Clustering/kor/__init__.py new file mode 100644 index 0000000000..3b7591cb2b --- /dev/null +++ b/mteb/tasks/Clustering/kor/__init__.py @@ -0,0 +1,6 @@ +from __future__ import annotations + +from .KlueMrcDomainClustering import KlueMrcDomainClustering +from .KlueYnatMrcCategoryClustering import KlueYnatMrcCategoryClustering + +__all__ = ["KlueYnatMrcCategoryClustering", "KlueMrcDomainClustering"] diff --git a/mteb/tasks/Image/ZeroShotClassification/__init__.py b/mteb/tasks/Image/ZeroShotClassification/__init__.py new file mode 100644 index 0000000000..db3391ad1a --- /dev/null +++ b/mteb/tasks/Image/ZeroShotClassification/__init__.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from .eng import ( + CLEVR, + BirdsnapZeroShotClassification, + Caltech101ZeroShotClassification, + CIFAR10ZeroShotClassification, + CIFAR100ZeroShotClassification, + CLEVRCount, + Country211ZeroShotClassification, + DTDZeroShotClassification, + EuroSATZeroShotClassification, + FER2013ZeroShotClassification, + FGVCAircraftZeroShotClassification, + Food101ZeroShotClassification, + GTSRBZeroShotClassification, + Imagenet1kZeroShotClassification, + MNISTZeroShotClassification, + OxfordPetsZeroShotClassification, + PatchCamelyonZeroShotClassification, + RenderedSST2, + RESISC45ZeroShotClassification, + SciMMIR, + StanfordCarsZeroShotClassification, + STL10ZeroShotClassification, + SUN397ZeroShotClassification, + UCF101ZeroShotClassification, +) + +__all__ = [ + "MNISTZeroShotClassification", + "CLEVR", + "CLEVRCount", + "SciMMIR", + "PatchCamelyonZeroShotClassification", + "OxfordPetsZeroShotClassification", + "EuroSATZeroShotClassification", + "StanfordCarsZeroShotClassification", + "CIFAR100ZeroShotClassification", + "CIFAR10ZeroShotClassification", + "Country211ZeroShotClassification", + "Food101ZeroShotClassification", + "SUN397ZeroShotClassification", + "GTSRBZeroShotClassification", + "Imagenet1kZeroShotClassification", + "DTDZeroShotClassification", + "RESISC45ZeroShotClassification", + "STL10ZeroShotClassification", + "Caltech101ZeroShotClassification", + "BirdsnapZeroShotClassification", + "RenderedSST2", + "UCF101ZeroShotClassification", + "FER2013ZeroShotClassification", + 
"FGVCAircraftZeroShotClassification", +] diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py b/mteb/tasks/Image/ZeroShotClassification/eng/Birdsnap.py similarity index 91% rename from mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py rename to mteb/tasks/Image/ZeroShotClassification/eng/Birdsnap.py index 3888478b41..599fdeee4d 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Birdsnap.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class BirdsnapZeroshotClassification(AbsTaskZeroshotClassification): +class BirdsnapZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="BirdsnapZeroShot", description="Classifying bird images from 500 species.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py b/mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py similarity index 92% rename from mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py rename to mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py index 4745220a30..ea6127137b 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class CIFAR10ZeroShotClassification(AbsTaskZeroshotClassification): +class CIFAR10ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="CIFAR10ZeroShot", description="Classifying images from 10 classes.", @@ -48,7 +48,7 @@ def get_candidate_labels(self) -> list[str]: ] -class CIFAR100ZeroShotClassification(AbsTaskZeroshotClassification): +class CIFAR100ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="CIFAR100ZeroShot", description="Classifying images from 100 classes.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/CLEVR.py b/mteb/tasks/Image/ZeroShotClassification/eng/CLEVR.py similarity index 94% rename from mteb/tasks/Image/ZeroshotClassification/eng/CLEVR.py rename to mteb/tasks/Image/ZeroShotClassification/eng/CLEVR.py index a8a2d1d8ee..1f1422ccd0 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/CLEVR.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/CLEVR.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class CLEVR(AbsTaskZeroshotClassification): +class CLEVR(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="CLEVRZeroShot", description="CLEVR closest object distance identification task.", @@ -54,7 +54,7 @@ def get_candidate_labels(self) -> list[str]: return [f"{c} shapes." 
for c in labels] -class CLEVRCount(AbsTaskZeroshotClassification): +class CLEVRCount(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="CLEVRCountZeroShot", description="CLEVR count objects task.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py b/mteb/tasks/Image/ZeroShotClassification/eng/Caltech101.py similarity index 92% rename from mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py rename to mteb/tasks/Image/ZeroShotClassification/eng/Caltech101.py index 1e171de11a..b7ba798e21 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Caltech101.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class Caltech101ZeroshotClassification(AbsTaskZeroshotClassification): +class Caltech101ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="Caltech101ZeroShot", description="Classifying images of 101 widely varied objects.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py b/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py similarity index 91% rename from mteb/tasks/Image/ZeroshotClassification/eng/Country211.py rename to mteb/tasks/Image/ZeroShotClassification/eng/Country211.py index f04dab1959..d801312532 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py @@ -2,13 +2,13 @@ import os -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class Country211ZeroshotClassification(AbsTaskZeroshotClassification): +class Country211ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="Country211ZeroShot", description="Classifying images of 211 countries.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py b/mteb/tasks/Image/ZeroShotClassification/eng/DTD.py similarity index 90% rename from mteb/tasks/Image/ZeroshotClassification/eng/DTD.py rename to mteb/tasks/Image/ZeroShotClassification/eng/DTD.py index 9092d5fb4b..eb77b4dee3 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/DTD.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class DTDZeroshotClassification(AbsTaskZeroshotClassification): +class DTDZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="DTDZeroShot", description="Describable Textures Dataset in 47 categories.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py b/mteb/tasks/Image/ZeroShotClassification/eng/EuroSAT.py similarity index 93% rename from mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py rename to mteb/tasks/Image/ZeroShotClassification/eng/EuroSAT.py index 255c436625..fedb9542a3 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py +++ 
b/mteb/tasks/Image/ZeroShotClassification/eng/EuroSAT.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class EuroSATZeroshotClassification(AbsTaskZeroshotClassification): +class EuroSATZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="EuroSATZeroShot", description="Classifying satellite images.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py b/mteb/tasks/Image/ZeroShotClassification/eng/FER2013.py similarity index 90% rename from mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py rename to mteb/tasks/Image/ZeroShotClassification/eng/FER2013.py index 98fb6af823..0b39c3d221 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/FER2013.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class FER2013ZeroshotClassification(AbsTaskZeroshotClassification): +class FER2013ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="FER2013ZeroShot", description="Classifying facial emotions.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py b/mteb/tasks/Image/ZeroShotClassification/eng/FGVCAircraft.py similarity index 91% rename from mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py rename to mteb/tasks/Image/ZeroShotClassification/eng/FGVCAircraft.py index 330d0ebe96..e69b5c73ea 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/FGVCAircraft.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class FGVCAircraftZeroShotClassification(AbsTaskZeroshotClassification): +class FGVCAircraftZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="FGVCAircraftZeroShot", description="Classifying aircraft images from 41 manufacturers and 102 variants.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py b/mteb/tasks/Image/ZeroShotClassification/eng/Food101.py similarity index 90% rename from mteb/tasks/Image/ZeroshotClassification/eng/Food101.py rename to mteb/tasks/Image/ZeroShotClassification/eng/Food101.py index d231054695..2df9c679c5 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Food101.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class Food101ZeroShotClassification(AbsTaskZeroshotClassification): +class Food101ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="Food101ZeroShot", 
description="Classifying food.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py b/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py similarity index 92% rename from mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py rename to mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py index d23aea16b5..3ea73e3055 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py @@ -2,13 +2,13 @@ import os -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class GTSRBZeroshotClassification(AbsTaskZeroshotClassification): +class GTSRBZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="GTSRBZeroShot", description="""The German Traffic Sign Recognition Benchmark (GTSRB) is a multi-class classification dataset for traffic signs. It consists of dataset of more than 50,000 traffic sign images. The dataset comprises 43 classes with unbalanced class frequencies.""", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py b/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py similarity index 91% rename from mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py rename to mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py index 42ad9693c4..8d74a4d378 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py @@ -2,13 +2,13 @@ import os -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class Imagenet1kZeroshotClassification(AbsTaskZeroshotClassification): +class Imagenet1kZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="Imagenet1kZeroShot", description="ImageNet, a large-scale ontology of images built upon the backbone of the WordNet structure.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py b/mteb/tasks/Image/ZeroShotClassification/eng/MNIST.py similarity index 89% rename from mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py rename to mteb/tasks/Image/ZeroShotClassification/eng/MNIST.py index d5ec3b2405..90be5866aa 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/MNIST.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class MNISTZeroshotClassification(AbsTaskZeroshotClassification): +class MNISTZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="MNISTZeroShot", description="Classifying handwritten digits.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py b/mteb/tasks/Image/ZeroShotClassification/eng/OxfordPets.py similarity index 90% rename from mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py rename to mteb/tasks/Image/ZeroShotClassification/eng/OxfordPets.py index d237b4faf4..f244d42f71 100644 --- 
a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/OxfordPets.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class OxfordPetsZeroshotClassification(AbsTaskZeroshotClassification): +class OxfordPetsZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="OxfordPetsZeroShot", description="Classifying animal images.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py b/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py similarity index 94% rename from mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py rename to mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py index 6aab1b87c8..ab5d4ca670 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py @@ -2,13 +2,13 @@ import os -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class PatchCamelyonZeroshotClassification(AbsTaskZeroshotClassification): +class PatchCamelyonZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="PatchCamelyonZeroShot", description="""Histopathology diagnosis classification dataset.""", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py b/mteb/tasks/Image/ZeroShotClassification/eng/RESISC45.py similarity index 92% rename from mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py rename to mteb/tasks/Image/ZeroShotClassification/eng/RESISC45.py index 412f4e5743..b01550b3c3 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/RESISC45.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class RESISC45ZeroshotClassification(AbsTaskZeroshotClassification): +class RESISC45ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="RESISC45ZeroShot", description="Remote Sensing Image Scene Classification by Northwestern Polytechnical University (NWPU).", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/RenderedSST2.py b/mteb/tasks/Image/ZeroShotClassification/eng/RenderedSST2.py similarity index 88% rename from mteb/tasks/Image/ZeroshotClassification/eng/RenderedSST2.py rename to mteb/tasks/Image/ZeroShotClassification/eng/RenderedSST2.py index fa7db8c77f..d51573df7b 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/RenderedSST2.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/RenderedSST2.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class RenderedSST2(AbsTaskZeroshotClassification): 
+class RenderedSST2(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="RenderedSST2", description="RenderedSST2.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py b/mteb/tasks/Image/ZeroShotClassification/eng/STL10.py similarity index 92% rename from mteb/tasks/Image/ZeroshotClassification/eng/STL10.py rename to mteb/tasks/Image/ZeroShotClassification/eng/STL10.py index 853cd673f2..80e576a07a 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/STL10.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class STL10ZeroshotClassification(AbsTaskZeroshotClassification): +class STL10ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="STL10ZeroShot", description="Classifying 96x96 images from 10 classes.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py b/mteb/tasks/Image/ZeroShotClassification/eng/SUN397.py similarity index 92% rename from mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py rename to mteb/tasks/Image/ZeroShotClassification/eng/SUN397.py index f64931bbfd..638fd7ed8a 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/SUN397.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class SUN397ZeroshotClassification(AbsTaskZeroshotClassification): +class SUN397ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="SUN397ZeroShot", description="Large scale scene recognition in 397 categories.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/SciMMIR.py b/mteb/tasks/Image/ZeroShotClassification/eng/SciMMIR.py similarity index 93% rename from mteb/tasks/Image/ZeroshotClassification/eng/SciMMIR.py rename to mteb/tasks/Image/ZeroShotClassification/eng/SciMMIR.py index cd1188bc39..735ecd60e6 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/SciMMIR.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/SciMMIR.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class SciMMIR(AbsTaskZeroshotClassification): +class SciMMIR(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="SciMMIR", description="SciMMIR.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py b/mteb/tasks/Image/ZeroShotClassification/eng/StanfordCars.py similarity index 90% rename from mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py rename to mteb/tasks/Image/ZeroShotClassification/eng/StanfordCars.py index 9e90340146..36a3efb196 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/StanfordCars.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( 
- AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class StanfordCarsZeroshotClassification(AbsTaskZeroshotClassification): +class StanfordCarsZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="StanfordCarsZeroShot", description="Classifying car images from 96 makes.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py b/mteb/tasks/Image/ZeroShotClassification/eng/UCF101.py similarity index 91% rename from mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py rename to mteb/tasks/Image/ZeroShotClassification/eng/UCF101.py index 94ce11b107..63470c0885 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/UCF101.py @@ -1,12 +1,12 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata -class UCF101ZeroshotClassification(AbsTaskZeroshotClassification): +class UCF101ZeroShotClassification(AbsTaskZeroShotClassification): metadata = TaskMetadata( name="UCF101ZeroShot", description="""UCF101 is an action recognition data set of realistic diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/__init__.py b/mteb/tasks/Image/ZeroShotClassification/eng/__init__.py new file mode 100644 index 0000000000..5158a54688 --- /dev/null +++ b/mteb/tasks/Image/ZeroShotClassification/eng/__init__.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from .Birdsnap import BirdsnapZeroShotClassification +from .Caltech101 import Caltech101ZeroShotClassification +from .CIFAR import CIFAR10ZeroShotClassification, CIFAR100ZeroShotClassification +from .CLEVR import CLEVR, CLEVRCount +from .Country211 import Country211ZeroShotClassification +from .DTD import DTDZeroShotClassification +from .EuroSAT import EuroSATZeroShotClassification +from .FER2013 import FER2013ZeroShotClassification +from .FGVCAircraft import FGVCAircraftZeroShotClassification +from .Food101 import Food101ZeroShotClassification +from .GTSRB import GTSRBZeroShotClassification +from .Imagenet1k import Imagenet1kZeroShotClassification +from .MNIST import MNISTZeroShotClassification +from .OxfordPets import OxfordPetsZeroShotClassification +from .PatchCamelyon import PatchCamelyonZeroShotClassification +from .RenderedSST2 import RenderedSST2 +from .RESISC45 import RESISC45ZeroShotClassification +from .SciMMIR import SciMMIR +from .StanfordCars import StanfordCarsZeroShotClassification +from .STL10 import STL10ZeroShotClassification +from .SUN397 import SUN397ZeroShotClassification +from .UCF101 import UCF101ZeroShotClassification + +__all__ = [ + "MNISTZeroShotClassification", + "CLEVR", + "CLEVRCount", + "SciMMIR", + "PatchCamelyonZeroShotClassification", + "OxfordPetsZeroShotClassification", + "EuroSATZeroShotClassification", + "StanfordCarsZeroShotClassification", + "CIFAR100ZeroShotClassification", + "CIFAR10ZeroShotClassification", + "Country211ZeroShotClassification", + "Food101ZeroShotClassification", + "SUN397ZeroShotClassification", + "GTSRBZeroShotClassification", + "Imagenet1kZeroShotClassification", + "DTDZeroShotClassification", + "RESISC45ZeroShotClassification", + "STL10ZeroShotClassification", + "Caltech101ZeroShotClassification", + 
"BirdsnapZeroShotClassification", + "RenderedSST2", + "UCF101ZeroShotClassification", + "FER2013ZeroShotClassification", + "FGVCAircraftZeroShotClassification", +] diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/Country211_labels.txt b/mteb/tasks/Image/ZeroShotClassification/eng/templates/Country211_labels.txt similarity index 100% rename from mteb/tasks/Image/ZeroshotClassification/eng/templates/Country211_labels.txt rename to mteb/tasks/Image/ZeroShotClassification/eng/templates/Country211_labels.txt diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/GTSRB_labels.txt b/mteb/tasks/Image/ZeroShotClassification/eng/templates/GTSRB_labels.txt similarity index 100% rename from mteb/tasks/Image/ZeroshotClassification/eng/templates/GTSRB_labels.txt rename to mteb/tasks/Image/ZeroShotClassification/eng/templates/GTSRB_labels.txt diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/Imagenet1k_labels.txt b/mteb/tasks/Image/ZeroShotClassification/eng/templates/Imagenet1k_labels.txt similarity index 100% rename from mteb/tasks/Image/ZeroshotClassification/eng/templates/Imagenet1k_labels.txt rename to mteb/tasks/Image/ZeroShotClassification/eng/templates/Imagenet1k_labels.txt diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/PatchCamelyon_labels.txt b/mteb/tasks/Image/ZeroShotClassification/eng/templates/PatchCamelyon_labels.txt similarity index 100% rename from mteb/tasks/Image/ZeroshotClassification/eng/templates/PatchCamelyon_labels.txt rename to mteb/tasks/Image/ZeroShotClassification/eng/templates/PatchCamelyon_labels.txt diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/templates/__init__.py b/mteb/tasks/Image/ZeroShotClassification/eng/templates/__init__.py similarity index 100% rename from mteb/tasks/Image/ZeroshotClassification/eng/templates/__init__.py rename to mteb/tasks/Image/ZeroShotClassification/eng/templates/__init__.py diff --git a/mteb/tasks/Image/ZeroshotClassification/__init__.py b/mteb/tasks/Image/ZeroshotClassification/__init__.py deleted file mode 100644 index 5a95aa4b95..0000000000 --- a/mteb/tasks/Image/ZeroshotClassification/__init__.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import annotations - -from .eng import ( - CLEVR, - BirdsnapZeroshotClassification, - Caltech101ZeroshotClassification, - CIFAR10ZeroShotClassification, - CIFAR100ZeroShotClassification, - CLEVRCount, - Country211ZeroshotClassification, - DTDZeroshotClassification, - EuroSATZeroshotClassification, - FER2013ZeroshotClassification, - FGVCAircraftZeroShotClassification, - Food101ZeroShotClassification, - GTSRBZeroshotClassification, - Imagenet1kZeroshotClassification, - MNISTZeroshotClassification, - OxfordPetsZeroshotClassification, - PatchCamelyonZeroshotClassification, - RenderedSST2, - RESISC45ZeroshotClassification, - SciMMIR, - StanfordCarsZeroshotClassification, - STL10ZeroshotClassification, - SUN397ZeroshotClassification, - UCF101ZeroshotClassification, -) - -__all__ = [ - "MNISTZeroshotClassification", - "CLEVR", - "CLEVRCount", - "SciMMIR", - "PatchCamelyonZeroshotClassification", - "OxfordPetsZeroshotClassification", - "EuroSATZeroshotClassification", - "StanfordCarsZeroshotClassification", - "CIFAR100ZeroShotClassification", - "CIFAR10ZeroShotClassification", - "Country211ZeroshotClassification", - "Food101ZeroShotClassification", - "SUN397ZeroshotClassification", - "GTSRBZeroshotClassification", - "Imagenet1kZeroshotClassification", - "DTDZeroshotClassification", - "RESISC45ZeroshotClassification", - 
"STL10ZeroshotClassification", - "Caltech101ZeroshotClassification", - "BirdsnapZeroshotClassification", - "RenderedSST2", - "UCF101ZeroshotClassification", - "FER2013ZeroshotClassification", - "FGVCAircraftZeroShotClassification", -] diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/__init__.py b/mteb/tasks/Image/ZeroshotClassification/eng/__init__.py deleted file mode 100644 index fde9f71f71..0000000000 --- a/mteb/tasks/Image/ZeroshotClassification/eng/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from .Birdsnap import BirdsnapZeroshotClassification -from .Caltech101 import Caltech101ZeroshotClassification -from .CIFAR import CIFAR10ZeroShotClassification, CIFAR100ZeroShotClassification -from .CLEVR import CLEVR, CLEVRCount -from .Country211 import Country211ZeroshotClassification -from .DTD import DTDZeroshotClassification -from .EuroSAT import EuroSATZeroshotClassification -from .FER2013 import FER2013ZeroshotClassification -from .FGVCAircraft import FGVCAircraftZeroShotClassification -from .Food101 import Food101ZeroShotClassification -from .GTSRB import GTSRBZeroshotClassification -from .Imagenet1k import Imagenet1kZeroshotClassification -from .MNIST import MNISTZeroshotClassification -from .OxfordPets import OxfordPetsZeroshotClassification -from .PatchCamelyon import PatchCamelyonZeroshotClassification -from .RenderedSST2 import RenderedSST2 -from .RESISC45 import RESISC45ZeroshotClassification -from .SciMMIR import SciMMIR -from .StanfordCars import StanfordCarsZeroshotClassification -from .STL10 import STL10ZeroshotClassification -from .SUN397 import SUN397ZeroshotClassification -from .UCF101 import UCF101ZeroshotClassification - -__all__ = [ - "MNISTZeroshotClassification", - "CLEVR", - "CLEVRCount", - "SciMMIR", - "PatchCamelyonZeroshotClassification", - "OxfordPetsZeroshotClassification", - "EuroSATZeroshotClassification", - "StanfordCarsZeroshotClassification", - "CIFAR100ZeroShotClassification", - "CIFAR10ZeroShotClassification", - "Country211ZeroshotClassification", - "Food101ZeroShotClassification", - "SUN397ZeroshotClassification", - "GTSRBZeroshotClassification", - "Imagenet1kZeroshotClassification", - "DTDZeroshotClassification", - "RESISC45ZeroshotClassification", - "STL10ZeroshotClassification", - "Caltech101ZeroshotClassification", - "BirdsnapZeroshotClassification", - "RenderedSST2", - "UCF101ZeroshotClassification", - "FER2013ZeroshotClassification", - "FGVCAircraftZeroShotClassification", -] diff --git a/mteb/tasks/Image/__init__.py b/mteb/tasks/Image/__init__.py index f17c2d410e..eed836d54d 100644 --- a/mteb/tasks/Image/__init__.py +++ b/mteb/tasks/Image/__init__.py @@ -129,59 +129,35 @@ STS17MultilingualVisualSTS, STSBenchmarkMultilingualVisualSTS, ) -from .ZeroshotClassification import ( +from .ZeroShotClassification import ( CLEVR, - BirdsnapZeroshotClassification, - Caltech101ZeroshotClassification, + BirdsnapZeroShotClassification, + Caltech101ZeroShotClassification, CIFAR10ZeroShotClassification, CIFAR100ZeroShotClassification, CLEVRCount, - Country211ZeroshotClassification, - DTDZeroshotClassification, - EuroSATZeroshotClassification, - FER2013ZeroshotClassification, + Country211ZeroShotClassification, + DTDZeroShotClassification, + EuroSATZeroShotClassification, + FER2013ZeroShotClassification, FGVCAircraftZeroShotClassification, Food101ZeroShotClassification, - GTSRBZeroshotClassification, - Imagenet1kZeroshotClassification, - MNISTZeroshotClassification, - OxfordPetsZeroshotClassification, - 
PatchCamelyonZeroshotClassification, + GTSRBZeroShotClassification, + Imagenet1kZeroShotClassification, + MNISTZeroShotClassification, + OxfordPetsZeroShotClassification, + PatchCamelyonZeroShotClassification, RenderedSST2, - RESISC45ZeroshotClassification, + RESISC45ZeroShotClassification, SciMMIR, - StanfordCarsZeroshotClassification, - STL10ZeroshotClassification, - SUN397ZeroshotClassification, - UCF101ZeroshotClassification, + StanfordCarsZeroShotClassification, + STL10ZeroShotClassification, + SUN397ZeroShotClassification, + UCF101ZeroShotClassification, ) __all__ = [ "VOC2007Classification", - "MNISTZeroshotClassification", - "CLEVR", - "CLEVRCount", - "SciMMIR", - "PatchCamelyonZeroshotClassification", - "OxfordPetsZeroshotClassification", - "EuroSATZeroshotClassification", - "StanfordCarsZeroshotClassification", - "CIFAR100ZeroShotClassification", - "CIFAR10ZeroShotClassification", - "Country211ZeroshotClassification", - "Food101ZeroShotClassification", - "SUN397ZeroshotClassification", - "GTSRBZeroshotClassification", - "Imagenet1kZeroshotClassification", - "DTDZeroshotClassification", - "RESISC45ZeroshotClassification", - "STL10ZeroshotClassification", - "Caltech101ZeroshotClassification", - "BirdsnapZeroshotClassification", - "RenderedSST2", - "UCF101ZeroshotClassification", - "FER2013ZeroshotClassification", - "FGVCAircraftZeroShotClassification", "STS17MultilingualVisualSTS", "STSBenchmarkMultilingualVisualSTS", "STS13VisualSTS", @@ -296,4 +272,28 @@ "XFlickr30kCoT2IRetrieval", "WITT2IRetrieval", "XM3600T2IRetrieval", + "MNISTZeroShotClassification", + "CLEVR", + "CLEVRCount", + "SciMMIR", + "PatchCamelyonZeroShotClassification", + "OxfordPetsZeroShotClassification", + "EuroSATZeroShotClassification", + "StanfordCarsZeroShotClassification", + "CIFAR100ZeroShotClassification", + "CIFAR10ZeroShotClassification", + "Country211ZeroShotClassification", + "Food101ZeroShotClassification", + "SUN397ZeroShotClassification", + "GTSRBZeroShotClassification", + "Imagenet1kZeroShotClassification", + "DTDZeroShotClassification", + "RESISC45ZeroShotClassification", + "STL10ZeroShotClassification", + "Caltech101ZeroShotClassification", + "BirdsnapZeroShotClassification", + "RenderedSST2", + "UCF101ZeroShotClassification", + "FER2013ZeroShotClassification", + "FGVCAircraftZeroShotClassification", ] diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 4907263808..220b80a3df 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -42,6 +42,7 @@ AlphaNLI, ARCChallenge, ArguAna, + BrightLongRetrieval, BrightRetrieval, BuiltBenchRetrieval, ChemHotpotQARetrieval, @@ -271,7 +272,6 @@ ) __all__ = [ - "WebFAQRetrieval", "CmedqaRetrieval", "CovidRetrieval", "DuRetrieval", @@ -429,6 +429,7 @@ "HellaSwag", "PIQA", "SpartQA", + "BrightLongRetrieval", "BrightRetrieval", "TempReasonL1", "HotpotQA", @@ -490,6 +491,7 @@ "HunSum2AbstractiveRetrieval", "AutoRAGRetrieval", "KoStrategyQA", + "WebFAQRetrieval", "WikipediaRetrievalMultilingual", "MintakaRetrieval", "PublicHealthQARetrieval", diff --git a/mteb/tasks/Retrieval/eng/BrightRetrieval.py b/mteb/tasks/Retrieval/eng/BrightRetrieval.py index 7757b212ab..79fa6a0f07 100644 --- a/mteb/tasks/Retrieval/eng/BrightRetrieval.py +++ b/mteb/tasks/Retrieval/eng/BrightRetrieval.py @@ -28,7 +28,77 @@ DOMAINS_langs = {split: ["eng-Latn"] for split in DOMAINS} -EVAL_SPLITS = ["standard", "long"] +def load_bright_data( + self, + path: str, + domains: list, + eval_splits: list, + cache_dir: str | None = None, 
+ revision: str | None = None, +): + corpus = {domain: {split: None for split in eval_splits} for domain in DOMAINS} + queries = {domain: {split: None for split in eval_splits} for domain in DOMAINS} + relevant_docs = { + domain: {split: None for split in eval_splits} for domain in DOMAINS + } + + for domain in domains: + domain_corpus = datasets.load_dataset( + path, "documents", split=domain, cache_dir=cache_dir, revision=revision + ) + examples = datasets.load_dataset( + path, "examples", split=domain, cache_dir=cache_dir, revision=revision + ) + if domain in DOMAINS_LONG: + domain_corpus_long = datasets.load_dataset( + path, + "long_documents", + split=domain, + cache_dir=cache_dir, + revision=revision, + ) + corpus[domain]["standard"] = { + e["id"]: {"text": e["content"]} for e in domain_corpus + } + if domain in DOMAINS_LONG: + corpus[domain]["long"] = { + e["id"]: {"text": e["content"]} for e in domain_corpus_long + } + queries[domain]["standard"] = queries[domain]["long"] = { + e["id"]: e["query"] for e in examples + } + relevant_docs[domain]["standard"] = {} + relevant_docs[domain]["long"] = {} + + for e in examples: + qid = e["id"] + gold_ids = e["gold_ids"] + gold_ids_long = e["gold_ids_long"] + relevant_docs[domain]["standard"][qid] = defaultdict(dict) + relevant_docs[domain]["long"][qid] = defaultdict(dict) + for gid in gold_ids: + relevant_docs[domain]["standard"][qid].update({gid: 1}) + for gid in gold_ids_long: + relevant_docs[domain]["long"][qid].update({gid: 1}) + + corpus = datasets.DatasetDict(corpus) + queries = datasets.DatasetDict(queries) + relevant_docs = datasets.DatasetDict(relevant_docs) + return corpus, queries, relevant_docs + + +def load_data(self, **kwargs): + if self.data_loaded: + return + + self.corpus, self.queries, self.relevant_docs = self.load_bright_data( + path=self.metadata.dataset["path"], + domains=DOMAINS, + eval_splits=self.metadata.eval_splits, + cache_dir=kwargs.get("cache_dir", None), + revision=self.metadata.dataset["revision"], + ) + self.data_loaded = True class BrightRetrieval(AbsTaskRetrieval): @@ -42,7 +112,7 @@ class BrightRetrieval(AbsTaskRetrieval): description="Bright retrieval dataset.", type="Retrieval", category="t2t", - eval_splits=EVAL_SPLITS, + eval_splits=["standard"], eval_langs=DOMAINS_langs, main_score="ndcg_at_10", date=("2024-03-01", "2024-06-01"), @@ -65,74 +135,21 @@ class BrightRetrieval(AbsTaskRetrieval): } """, ) + load_bright_data = load_bright_data + load_data = load_data - def load_bright_data( - self, - path: str, - domains: list, - eval_splits: list, - cache_dir: str = None, - revision: str = None, - ): - corpus = {domain: {split: None for split in eval_splits} for domain in DOMAINS} - queries = {domain: {split: None for split in eval_splits} for domain in DOMAINS} - relevant_docs = { - domain: {split: None for split in eval_splits} for domain in DOMAINS - } - for domain in domains: - domain_corpus = datasets.load_dataset( - path, "documents", split=domain, cache_dir=cache_dir, revision=revision - ) - examples = datasets.load_dataset( - path, "examples", split=domain, cache_dir=cache_dir, revision=revision - ) - if domain in DOMAINS_LONG: - domain_corpus_long = datasets.load_dataset( - path, - "long_documents", - split=domain, - cache_dir=cache_dir, - revision=revision, - ) - corpus[domain]["standard"] = { - e["id"]: {"text": e["content"]} for e in domain_corpus - } - if domain in DOMAINS_LONG: - corpus[domain]["long"] = { - e["id"]: {"text": e["content"]} for e in domain_corpus_long - } - 
queries[domain]["standard"] = queries[domain]["long"] = { - e["id"]: e["query"] for e in examples - } - relevant_docs[domain]["standard"] = {} - relevant_docs[domain]["long"] = {} - - for e in examples: - qid = e["id"] - gold_ids = e["gold_ids"] - gold_ids_long = e["gold_ids_long"] - relevant_docs[domain]["standard"][qid] = defaultdict(dict) - relevant_docs[domain]["long"][qid] = defaultdict(dict) - for gid in gold_ids: - relevant_docs[domain]["standard"][qid].update({gid: 1}) - for gid in gold_ids_long: - relevant_docs[domain]["long"][qid].update({gid: 1}) - - corpus = datasets.DatasetDict(corpus) - queries = datasets.DatasetDict(queries) - relevant_docs = datasets.DatasetDict(relevant_docs) - return corpus, queries, relevant_docs - - def load_data(self, **kwargs): - if self.data_loaded: - return - - self.corpus, self.queries, self.relevant_docs = self.load_bright_data( - path=self.metadata.dataset["path"], - domains=DOMAINS, - eval_splits=self.metadata.eval_splits, - cache_dir=kwargs.get("cache_dir", None), - revision=self.metadata.dataset["revision"], - ) - self.data_loaded = True +long_metadata = BrightRetrieval.metadata.model_copy() +long_metadata.eval_splits = ["long"] +long_metadata.description = "Bright retrieval dataset with long documents." +long_metadata.name = "BrightLongRetrieval" + +dom_langs_long = {split: ["eng-Latn"] for split in DOMAINS_LONG} +long_metadata.eval_langs = dom_langs_long + + +class BrightLongRetrieval(AbsTaskRetrieval): + metadata = long_metadata + + load_bright_data = load_bright_data + load_data = load_data diff --git a/mteb/tasks/Retrieval/eng/__init__.py b/mteb/tasks/Retrieval/eng/__init__.py index 4a6f24d09d..0379f27773 100644 --- a/mteb/tasks/Retrieval/eng/__init__.py +++ b/mteb/tasks/Retrieval/eng/__init__.py @@ -5,7 +5,7 @@ from .AlphaNLIRetrieval import AlphaNLI from .ARCChallengeRetrieval import ARCChallenge from .ArguAnaRetrieval import ArguAna -from .BrightRetrieval import BrightRetrieval +from .BrightRetrieval import BrightLongRetrieval, BrightRetrieval from .BuiltBenchRetrieval import BuiltBenchRetrieval from .ChemHotpotQARetrieval import ChemHotpotQARetrieval from .ChemNQRetrieval import ChemNQRetrieval @@ -156,6 +156,7 @@ "HellaSwag", "PIQA", "SpartQA", + "BrightLongRetrieval", "BrightRetrieval", "TempReasonL1", "HotpotQA", diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index f8ff67b94b..d09cc73a64 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -38,6 +38,8 @@ TatoebaBitextMining, TbilisiCityHallBitextMining, VieMedEVBitextMining, + WebFAQBitextMiningQAs, + WebFAQBitextMiningQuestions, ) from .Classification import ( AJGT, @@ -379,6 +381,8 @@ HALClusteringS2SFast, HamshahriClustring, IndicReviewsClusteringP2P, + KlueMrcDomainClustering, + KlueYnatMrcCategoryClustering, LivedoorNewsClustering, LivedoorNewsClusteringv2, MasakhaNEWSClusteringP2P, @@ -445,13 +449,13 @@ AROVisualAttribution, AROVisualRelation, BirdsnapClassification, - BirdsnapZeroshotClassification, + BirdsnapZeroShotClassification, BLINKIT2IMultiChoice, BLINKIT2IRetrieval, BLINKIT2TMultiChoice, BLINKIT2TRetrieval, Caltech101Classification, - Caltech101ZeroshotClassification, + Caltech101ZeroShotClassification, CIFAR10Classification, CIFAR10Clustering, CIFAR10ZeroShotClassification, @@ -461,22 +465,22 @@ CIRRIT2IRetrieval, CLEVRCount, Country211Classification, - Country211ZeroshotClassification, + Country211ZeroShotClassification, CVBenchCount, CVBenchDepth, CVBenchDistance, CVBenchRelation, DTDClassification, - DTDZeroshotClassification, + 
DTDZeroShotClassification, EDIST2ITRetrieval, EncyclopediaVQAIT2ITRetrieval, EuroSATClassification, - EuroSATZeroshotClassification, + EuroSATZeroShotClassification, Fashion200kI2TRetrieval, Fashion200kT2IRetrieval, FashionIQIT2IRetrieval, FER2013Classification, - FER2013ZeroshotClassification, + FER2013ZeroShotClassification, FGVCAircraftClassification, FGVCAircraftZeroShotClassification, Flickr30kI2TRetrieval, @@ -486,13 +490,13 @@ GLDv2I2IRetrieval, GLDv2I2TRetrieval, GTSRBClassification, - GTSRBZeroshotClassification, + GTSRBZeroShotClassification, HatefulMemesI2TRetrieval, HatefulMemesT2IRetrieval, ImageCoDeT2IMultiChoice, ImageCoDeT2IRetrieval, Imagenet1kClassification, - Imagenet1kZeroshotClassification, + Imagenet1kZeroShotClassification, ImageNet10Clustering, ImageNetDog15Clustering, InfoSeekIT2ITRetrieval, @@ -502,7 +506,7 @@ MemotionT2IRetrieval, METI2IRetrieval, MNISTClassification, - MNISTZeroshotClassification, + MNISTZeroShotClassification, MSCOCOI2TRetrieval, MSCOCOT2IRetrieval, NIGHTSI2IRetrieval, @@ -511,13 +515,13 @@ OVENIT2TRetrieval, OxfordFlowersClassification, OxfordPetsClassification, - OxfordPetsZeroshotClassification, + OxfordPetsZeroShotClassification, PatchCamelyonClassification, - PatchCamelyonZeroshotClassification, + PatchCamelyonZeroShotClassification, ReMuQIT2TRetrieval, RenderedSST2, RESISC45Classification, - RESISC45ZeroshotClassification, + RESISC45ZeroShotClassification, ROxfordEasyI2IMultiChoice, ROxfordEasyI2IRetrieval, ROxfordHardI2IMultiChoice, @@ -538,9 +542,9 @@ SOPI2IRetrieval, StanfordCarsClassification, StanfordCarsI2I, - StanfordCarsZeroshotClassification, + StanfordCarsZeroShotClassification, STL10Classification, - STL10ZeroshotClassification, + STL10ZeroShotClassification, STS12VisualSTS, STS13VisualSTS, STS14VisualSTS, @@ -550,11 +554,11 @@ STSBenchmarkMultilingualVisualSTS, SugarCrepe, SUN397Classification, - SUN397ZeroshotClassification, + SUN397ZeroShotClassification, TinyImageNet, TUBerlinT2IRetrieval, UCF101Classification, - UCF101ZeroshotClassification, + UCF101ZeroShotClassification, VidoreArxivQARetrieval, VidoreDocVQARetrieval, VidoreInfoVQARetrieval, @@ -689,6 +693,7 @@ ArguAnaPL, AutoRAGRetrieval, BelebeleRetrieval, + BrightLongRetrieval, BrightRetrieval, BSARDRetrieval, BuiltBenchRetrieval, @@ -919,6 +924,7 @@ TwitterHjerneRetrieval, VideoRetrieval, VieQuADRetrieval, + WebFAQRetrieval, WikipediaRetrievalMultilingual, WinoGrande, XMarket, @@ -977,30 +983,6 @@ __all__ = [ "VOC2007Classification", - "MNISTZeroshotClassification", - "CLEVR", - "CLEVRCount", - "SciMMIR", - "PatchCamelyonZeroshotClassification", - "OxfordPetsZeroshotClassification", - "EuroSATZeroshotClassification", - "StanfordCarsZeroshotClassification", - "CIFAR100ZeroShotClassification", - "CIFAR10ZeroShotClassification", - "Country211ZeroshotClassification", - "Food101ZeroShotClassification", - "SUN397ZeroshotClassification", - "GTSRBZeroshotClassification", - "Imagenet1kZeroshotClassification", - "DTDZeroshotClassification", - "RESISC45ZeroshotClassification", - "STL10ZeroshotClassification", - "Caltech101ZeroshotClassification", - "BirdsnapZeroshotClassification", - "RenderedSST2", - "UCF101ZeroshotClassification", - "FER2013ZeroshotClassification", - "FGVCAircraftZeroShotClassification", "STS17MultilingualVisualSTS", "STSBenchmarkMultilingualVisualSTS", "STS13VisualSTS", @@ -1115,6 +1097,30 @@ "XFlickr30kCoT2IRetrieval", "WITT2IRetrieval", "XM3600T2IRetrieval", + "MNISTZeroShotClassification", + "CLEVR", + "CLEVRCount", + "SciMMIR", + 
"PatchCamelyonZeroShotClassification", + "OxfordPetsZeroShotClassification", + "EuroSATZeroShotClassification", + "StanfordCarsZeroShotClassification", + "CIFAR100ZeroShotClassification", + "CIFAR10ZeroShotClassification", + "Country211ZeroShotClassification", + "Food101ZeroShotClassification", + "SUN397ZeroShotClassification", + "GTSRBZeroShotClassification", + "Imagenet1kZeroShotClassification", + "DTDZeroShotClassification", + "RESISC45ZeroShotClassification", + "STL10ZeroShotClassification", + "Caltech101ZeroShotClassification", + "BirdsnapZeroShotClassification", + "RenderedSST2", + "UCF101ZeroShotClassification", + "FER2013ZeroShotClassification", + "FGVCAircraftZeroShotClassification", "CLSClusteringFastP2P", "CLSClusteringFastS2S", "CLSClusteringP2P", @@ -1186,6 +1192,8 @@ "LivedoorNewsClustering", "LivedoorNewsClusteringv2", "MewsC16JaClustering", + "KlueYnatMrcCategoryClustering", + "KlueMrcDomainClustering", "WikiClusteringFastP2P", "WikiClusteringP2P", "MLSUMClusteringS2S", @@ -1239,6 +1247,8 @@ "NTREXBitextMining", "IndicGenBenchFloresBitextMining", "NollySentiBitextMining", + "WebFAQBitextMiningQAs", + "WebFAQBitextMiningQuestions", "BUCCBitextMiningFast", "PhincBitextMining", "TatoebaBitextMining", @@ -1757,6 +1767,7 @@ "HellaSwag", "PIQA", "SpartQA", + "BrightLongRetrieval", "BrightRetrieval", "TempReasonL1", "HotpotQA", @@ -1818,6 +1829,7 @@ "HunSum2AbstractiveRetrieval", "AutoRAGRetrieval", "KoStrategyQA", + "WebFAQRetrieval", "WikipediaRetrievalMultilingual", "MintakaRetrieval", "PublicHealthQARetrieval", diff --git a/pyproject.toml b/pyproject.toml index 29c7ec5c69..94e0ccad85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.36.25" +version = "1.36.29" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ @@ -54,6 +54,7 @@ homepage = "https://github.com/embeddings-benchmark/mteb" mteb = "mteb.cli:main" [project.optional-dependencies] +image = ["torchvision>0.0.0"] dev = [ "ruff==0.9.7", # locked so we don't get PRs which fail only due to a lint update "pytest>=8.3.4", diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 5bdb9bcd01..f9a73a64c3 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -31,8 +31,8 @@ AbsTaskImageTextPairClassification, ) from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS -from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( - AbsTaskZeroshotClassification, +from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( + AbsTaskZeroShotClassification, ) from mteb.abstasks.TaskMetadata import TaskMetadata @@ -3134,7 +3134,7 @@ def load_data(self, **kwargs): self.data_loaded = True -class MockZeroshotClassificationTask(AbsTaskZeroshotClassification): +class MockZeroShotClassificationTask(AbsTaskZeroShotClassification): expected_stats = { "test": { "average_text_length": 26.0, @@ -3146,7 +3146,7 @@ class MockZeroshotClassificationTask(AbsTaskZeroshotClassification): metadata = TaskMetadata( type="ZeroShotClassification", - name="MockZeroshotClassification", + name="MockZeroShotClassification", main_score="accuracy", **general_args, # type: ignore ) diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index d34aeeba37..9693468697 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ -45,7 +45,7 @@ MockSummarizationTask, MockTextMultipleChoiceTask, 
MockVisualSTSTask, - MockZeroshotClassificationTask, + MockZeroShotClassificationTask, ) TASK_TEST_GRID = ( @@ -119,7 +119,7 @@ MockImageClusteringTask(), MockImageTextPairClassificationTask(), MockVisualSTSTask(), - MockZeroshotClassificationTask(), + MockZeroShotClassificationTask(), MockImageMultilabelClassificationTask(), MockMultilingualImageClassificationTask(), MockMultilingualImageTextPairClassificationTask(), diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 741cf0c0bf..eaa66c9186 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -8,7 +8,6 @@ import numpy as np import pytest -from sentence_transformers import SentenceTransformer from torch.utils.data import DataLoader import mteb @@ -37,7 +36,7 @@ MockRerankingTask, MockRetrievalTask, ) -from .task_grid import MOCK_TASK_TEST_GRID +from .task_grid import MOCK_MIEB_TASK_GRID, MOCK_TASK_TEST_GRID logging.basicConfig(level=logging.INFO) @@ -134,7 +133,7 @@ def encode( assert prompt_name == _task_name return np.zeros((len(sentences.dataset), 10)) - class EncoderWithoutInstructions(SentenceTransformer): + class EncoderWithoutInstructions(MockSentenceTransformer): def encode(self, sentences: DataLoader, **kwargs): assert kwargs["prompt_name"] is None return super().encode(sentences, **kwargs) @@ -158,7 +157,7 @@ def encode(self, sentences: DataLoader, **kwargs): overwrite_results=True, ) # Test that the task_name is not passed down to the encoder - model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency") + model = EncoderWithoutInstructions() assert model.prompts == {}, "The encoder should not have any prompts" eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @@ -194,6 +193,32 @@ def encode(self, sentences: DataLoader, task_name: str | None = None, **kwargs): ) +@pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID + MOCK_MIEB_TASK_GRID) +def test_task_name_passed_encoder(task_name: mteb.AbsTask, tmp_path: Path): + """Test that all tasks correctly pass down the task_name to the encoder.""" + _task_name = ( + task_name.metadata.name if isinstance(task_name, mteb.AbsTask) else task_name + ) + + class MockEncoderWithInstructions(mteb.Encoder): + def encode(self, sentences, task_name: str | None = None, **kwargs): + assert task_name == _task_name + return np.zeros((len(sentences), 10)) + + if isinstance(task_name, mteb.AbsTask): + tasks = [task_name] + else: + tasks = mteb.get_tasks(tasks=[task_name]) + + eval = mteb.MTEB(tasks=tasks) + + eval.run( + MockEncoderWithInstructions(), + output_folder=tmp_path.as_posix(), + overwrite_results=True, + ) + + @pytest.mark.parametrize("model", [MockNumpyEncoder()]) def test_run_using_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a benchmark object can be run using the MTEB class."""
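A minimal sketch for exercising the task split introduced above: the new `BrightLongRetrieval` task runs on the `long` document corpus while the existing `BrightRetrieval` keeps the `standard` split. This is not part of the patch itself; the model name is a placeholder and any mteb-compatible encoder should work, mirroring the `mteb.get_tasks` / `mteb.MTEB(...).run(...)` pattern used in the tests added here.

```python
import mteb
from sentence_transformers import SentenceTransformer

# Placeholder encoder, used only to illustrate the call pattern.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# "BrightRetrieval" evaluates the "standard" corpus; the newly registered
# "BrightLongRetrieval" evaluates the long-document corpus.
tasks = mteb.get_tasks(tasks=["BrightRetrieval", "BrightLongRetrieval"])

evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results/bright-check")
```

Since the two tasks share the loading helpers (`load_bright_data` / `load_data` assigned as class attributes), splitting the long-document evaluation into its own task keeps `eval_splits` single-valued per task and lets results be reported separately for the standard and long corpora.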