Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions mteb/descriptive_stats/Retrieval/MBPPRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"test": {
"number_of_characters": 78520,
"num_samples": 1948,
"num_queries": 974,
"num_documents": 974,
"min_document_length": 37,
"average_document_length": 78.61601642710473,
"max_document_length": 249,
"unique_documents": 974,
"min_query_length": 2,
"average_query_length": 2.0,
"max_query_length": 2,
"unique_queries": 974,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 974
}
}
1 change: 1 addition & 0 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .code.CodeTransOceanDLRetrieval import *
from .code.COIRCodeSearchNetRetrieval import *
from .code.CosQARetrieval import *
from .code.MBPPRetrieval import *
from .code.HumanEvalRetrieval import *
from .code.StackOverflowQARetrieval import *
from .code.SyntheticText2SqlRetrieval import *
Expand Down
88 changes: 88 additions & 0 deletions mteb/tasks/Retrieval/code/MBPPRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class MBPPRetrieval(AbsTaskRetrieval):
    """Text-to-code retrieval task built on MBPP (Mostly Basic Python Programming).

    Queries are natural-language descriptions of programming tasks and the
    corpus holds Python implementations. Relevance judgements are read from
    the dataset's ``default`` config (``test`` split).
    """

    metadata = TaskMetadata(
        name="MBPPRetrieval",
        description=(
            "A code retrieval task based on 974 Python programming problems from MBPP "
            "(Mostly Basic Python Programming). Each query is a natural language "
            "description of a programming task (e.g., 'Write a function to find the "
            "shared elements from the given two lists'), and the corpus contains Python "
            "code implementations. The task is to retrieve the correct code snippet that "
            "solves the described problem. Queries are problem descriptions while the "
            "corpus contains Python function implementations with proper syntax and logic."
        ),
        reference="https://huggingface.co/datasets/embedding-benchmark/MBPP",
        dataset={
            "path": "embedding-benchmark/MBPP",
            # NOTE(review): the previously pinned revision
            # 12a7c6b3b7e985be9b9f81e2f06306ffd17cc5a4 was reported in review as
            # not existing on the Hub; this is the commit the reviewer found
            # instead. Confirm against the dataset's commit history.
            "revision": "586a1fd6a0c63fdeda3b49c0293559a81c79cdec",
        },
        type="Retrieval",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["eng-Latn", "python-Code"],
        main_score="ndcg_at_10",
        date=("2021-01-01", "2021-12-31"),
        domains=["Programming"],
        task_subtypes=["Code retrieval"],
        license="cc-by-4.0",
        annotations_creators="expert-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@article{austin2021program,
  author = {Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},
  journal = {arXiv preprint arXiv:2108.07732},
  title = {Program Synthesis with Large Language Models},
  year = {2021},
}
""",
    )

    def load_data(self, **kwargs):
        """Load corpus, queries and qrels and store them under the ``test`` split.

        Populates ``self.corpus``, ``self.queries`` and ``self.relevant_docs``
        (each a dict keyed by ``"test"``) and sets ``self.data_loaded``.
        Returns immediately if ``self.data_loaded`` is already truthy.

        Args:
            **kwargs: Accepted but not used by this method.
        """
        if self.data_loaded:
            return

        from datasets import load_dataset

        path = self.metadata.dataset["path"]
        revision = self.metadata.dataset["revision"]

        def _fetch(config, split):
            # Every config is read from the same repo at the same pinned revision.
            return load_dataset(path, config, revision=revision)[split]

        # The dataset exposes three configs; the corpus and queries configs
        # each use a split named after the config itself.
        corpus_ds = _fetch("corpus", "corpus")
        queries_ds = _fetch("queries", "queries")
        qrels_ds = _fetch("default", "test")

        # Documents carry no title in this dataset, so an empty one is supplied.
        corpus = {
            row["id"]: {"title": "", "text": row["text"]} for row in corpus_ds
        }
        queries = {row["id"]: row["text"] for row in queries_ds}

        # Group qrels by query: {query_id: {corpus_id: score}}.
        relevant_docs = {}
        for row in qrels_ds:
            judged = relevant_docs.setdefault(row["query-id"], {})
            judged[row["corpus-id"]] = int(row["score"])

        # Everything is filed under the single evaluation split.
        self.corpus = {"test": corpus}
        self.queries = {"test": queries}
        self.relevant_docs = {"test": relevant_docs}

        self.data_loaded = True
Loading