Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
0163342
test: fix dataset availability test (#2141)
KennethEnevoldsen Feb 24, 2025
760fcaf
fix: Update NVIDIA-Embed training data (#2143)
KennethEnevoldsen Feb 24, 2025
9f6cc4e
1.34.29
invalid-email-address Feb 24, 2025
8538e93
fix: Add annotations for Voyage exp (#2144)
KennethEnevoldsen Feb 24, 2025
25cd62d
1.34.30
invalid-email-address Feb 24, 2025
8e97d36
Fix tokens num in cde models (#2148)
Samoed Feb 24, 2025
0e624b2
feat: Add Qodo-Embed-1-7B model metadata and rename existing model (#…
talshef Feb 24, 2025
4d23c6c
1.35.0
invalid-email-address Feb 24, 2025
bd2a67c
misc: add Any2AnyRetrievalDescriptiveStatistics (#2139)
isaac-chung Feb 24, 2025
ef3f4f0
Update tasks table
github-actions[bot] Feb 24, 2025
a7dc95a
Added zero-shot percentages and different filtering scheme (#2153)
x-tabdeveloping Feb 25, 2025
565e29c
fix: Incorrect annotations for Mistral-based embedding models (#2157)
KennethEnevoldsen Feb 25, 2025
90ec21c
1.35.1
invalid-email-address Feb 25, 2025
8afb78a
Update FaMTEBRetrieval.py (#2171)
garciasces Feb 26, 2025
331cded
Update tasks table
github-actions[bot] Feb 26, 2025
6cc1822
fix: Add Training data annotations (#2173)
KennethEnevoldsen Feb 26, 2025
ed0cb31
1.35.2
invalid-email-address Feb 26, 2025
dea231b
feat: Add MIEB and MIEB-lite as benchmarks (#2035)
isaac-chung Feb 27, 2025
dbcbf54
Update tasks table
github-actions[bot] Feb 27, 2025
afe1739
1.36.0
invalid-email-address Feb 27, 2025
62b33f2
fix: update training datasets and revision for jina models (#2179)
Feb 27, 2025
1959c73
fix: Add more training data annotations (#2178)
KennethEnevoldsen Feb 27, 2025
4a0bb5c
1.36.1
invalid-email-address Feb 27, 2025
43d15f1
Added training data annotation for e5-base-4k (#2186)
x-tabdeveloping Feb 28, 2025
1b23d4e
fix: Added training data annotations to MXBAI (#2185)
x-tabdeveloping Feb 28, 2025
7daf893
fix: Update MTEB(Scandinavian) to use new DanFEVER (#2180)
KennethEnevoldsen Feb 28, 2025
0307102
fix: Added training data annotation for MMLW models (#2188)
x-tabdeveloping Feb 28, 2025
7642c07
1.36.2
invalid-email-address Feb 28, 2025
0901cf6
fix: Added training data for sentence-croissant (#2189)
x-tabdeveloping Feb 28, 2025
d4b691f
1.36.3
invalid-email-address Feb 28, 2025
3325f7e
fix: update ru models annotation (#2181)
Samoed Feb 28, 2025
c04d158
1.36.4
invalid-email-address Feb 28, 2025
fee6fc0
fix: Alphabetical ordering of tasks in dropdowns (#2191)
ayush1298 Feb 28, 2025
0631089
1.36.5
invalid-email-address Feb 28, 2025
7345235
misc: Speed up qrel creation in any2anyretrieval (#2196)
isaac-chung Feb 28, 2025
29464ac
use 'mteb.MTEB' instead of 'MTEB' for custom model (#2199)
yaya-sy Feb 28, 2025
a165b64
Merge branch 'refs/heads/main' into merge_main
Samoed Mar 1, 2025
8601b21
lint
Samoed Mar 1, 2025
f538b4f
fix code carbon
Samoed Mar 1, 2025
3097740
fix aggregated
Samoed Mar 1, 2025
1c8d715
add base models for e5 (#2183)
Samoed Mar 2, 2025
7af37d4
add similar datasets (#2205)
Samoed Mar 2, 2025
587892d
add labse annotation (#2182)
Samoed Mar 2, 2025
761a174
fix: Fixed leaderboard crash (#2221)
x-tabdeveloping Mar 3, 2025
e57cd50
1.36.6
invalid-email-address Mar 3, 2025
2dd1391
fix: More training data annotations (#2220)
x-tabdeveloping Mar 3, 2025
546e0c4
1.36.7
invalid-email-address Mar 3, 2025
4ee4e7c
Add LLM2CLIP (OpenAI variants) (#2222)
isaac-chung Mar 3, 2025
c5fded2
Change `dataset on HF` test to use official api (#2213)
Samoed Mar 3, 2025
3e991bd
Descriptive stats functions for Any2AnyMC and ImageTextPC (#2197)
imenelydiaker Mar 3, 2025
cc47225
Update tasks table
github-actions[bot] Mar 3, 2025
ee514cb
fix: Add training data annotations to uderver-bloom models (#2210)
KennethEnevoldsen Mar 3, 2025
4de58c3
1.36.8
invalid-email-address Mar 3, 2025
a87927b
Add comment to `voyage-3-m-exp` model (#2229)
Samoed Mar 3, 2025
fe4c17a
Merge branch 'refs/heads/main' into merge_main
Samoed Mar 4, 2025
3a9d271
docs: Update description of EURLex (#2231)
KennethEnevoldsen Mar 4, 2025
7f7d3e8
Automatically add similar tasks to training_tasks (#2228)
Samoed Mar 4, 2025
a857b10
Merge branch 'refs/heads/main' into merge_main
Samoed Mar 4, 2025
5759f84
lint
Samoed Mar 4, 2025
d786633
refactor
Samoed Mar 4, 2025
4cf714e
update BEIR-PL annotation
Samoed Mar 4, 2025
6b03f0f
fix
Samoed Mar 4, 2025
07e6ae5
update test
Samoed Mar 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ class CustomModel:

model = CustomModel()
tasks = mteb.get_tasks(tasks=["Banking77Classification"])
evaluation = MTEB(tasks=tasks)
evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(model)
```

Expand Down
2,202 changes: 1,103 additions & 1,099 deletions docs/tasks.md

Large diffs are not rendered by default.

266 changes: 232 additions & 34 deletions mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from mteb.abstasks.AbsTask import AbsTask, ScoresDict

from ...evaluation.evaluators import Any2AnyMultiChoiceEvaluator
from ..TaskMetadata import DescriptiveStatistics

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -186,6 +187,95 @@ def _load_qrels(self, split):
self.qrels = qrels_ds


class Any2AnyMutipleChoiceDescriptiveStatistics(DescriptiveStatistics):
    """Descriptive statistics for Any2Any multiple-choice tasks.

    Attributes:
        num_samples: Number of queries plus number of documents
        num_queries: Number of queries in the dataset
        num_documents: Number of documents in the dataset
        number_of_characters: Total number of text characters (queries + documents)

        min_document_length: Minimum length of document texts (text modality only)
        average_document_length: Average length of document texts (text modality only)
        max_document_length: Maximum length of document texts (text modality only)
        unique_documents: Number of unique document texts (text modality only)
        num_document_images: Number of document images (image modality only)

        min_document_image_width: Minimum width of document images
        average_document_image_width: Average width of document images
        max_document_image_width: Maximum width of document images
        min_document_image_height: Minimum height of document images
        average_document_image_height: Average height of document images
        max_document_image_height: Maximum height of document images

        min_query_length: Minimum length of query texts (text modality only)
        average_query_length: Average length of query texts (text modality only)
        max_query_length: Maximum length of query texts (text modality only)
        unique_queries: Number of unique query texts (text modality only)
        num_query_images: Number of query images (image modality only)

        min_query_image_width: Minimum width of query images
        average_query_image_width: Average width of query images
        max_query_image_width: Maximum width of query images
        min_query_image_height: Minimum height of query images
        average_query_image_height: Average height of query images
        max_query_image_height: Maximum height of query images

        min_relevant_docs_per_query: Minimum number of relevant documents per query
        average_relevant_docs_per_query: Average number of relevant documents per query
        max_relevant_docs_per_query: Maximum number of relevant documents per query
        unique_relevant_docs: Number of unique relevant documents
    """

    # Overall counts
    num_samples: int
    num_queries: int
    num_documents: int
    number_of_characters: int

    # Document text statistics
    min_document_length: int
    average_document_length: float
    max_document_length: int
    unique_documents: int
    num_document_images: int

    # Document image statistics
    min_document_image_width: float
    average_document_image_width: float
    max_document_image_width: float
    min_document_image_height: float
    average_document_image_height: float
    max_document_image_height: float

    # Query text statistics
    min_query_length: int
    average_query_length: float
    max_query_length: int
    unique_queries: int
    num_query_images: int

    # Query image statistics
    min_query_image_width: float
    average_query_image_width: float
    max_query_image_width: float
    min_query_image_height: float
    average_query_image_height: float
    max_query_image_height: float

    # Qrel (relevance judgement) statistics
    min_relevant_docs_per_query: int
    average_relevant_docs_per_query: float
    max_relevant_docs_per_query: int
    unique_relevant_docs: int


class AbsTaskAny2AnyMultiChoice(AbsTask):
"""Abstract class for Any2Any multiple choice experiments

Expand Down Expand Up @@ -376,39 +466,124 @@ def _add_main_score(self, scores: ScoresDict) -> None:

def _calculate_metrics_from_split(
    self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> Any2AnyMutipleChoiceDescriptiveStatistics:
    """Compute descriptive statistics for one split of the task's dataset.

    Args:
        split: Name of the evaluation split to describe.
        hf_subset: If given, restrict the statistics to this subset/language.
        compute_overall: If True (and ``hf_subset`` is not given), merge every
            subset in ``self.metadata.eval_langs`` into one pool; ids are
            prefixed with split/subset so ids from different subsets cannot
            collide.

    Returns:
        Text-length, image-size and qrel statistics for the selected data.
    """
    if hf_subset:
        queries = self.queries[hf_subset][split]
        corpus = self.corpus[hf_subset][split]
        relevant_docs = self.relevant_docs[hf_subset][split]
    elif compute_overall:
        queries = {}
        corpus = {}
        relevant_docs = {}
        for hf_subset in self.metadata.eval_langs:
            queries.update(process_docs(self.queries, hf_subset, split))
            corpus.update(process_docs(self.corpus, hf_subset, split))
            relevant_docs.update(
                process_relevant_docs(self.relevant_docs, hf_subset, split)
            )
    else:
        queries = self.queries[split]
        corpus = self.corpus[split]
        relevant_docs = self.relevant_docs[split]

    queries_lens, doc_lens = [], []
    num_query_images = 0
    num_document_images = 0

    # Modality is read from the first entry; assumes every entry of a
    # collection shares one modality string (may contain "text", "image",
    # or both) -- TODO confirm against the datasets used.
    q_modality = queries[0]["modality"]
    unique_queries = len(set(queries["text"])) if "text" in q_modality else 0

    for query in tqdm.tqdm(queries, desc="queries:"):
        if "text" in q_modality:
            queries_lens.append(len(query["text"]))
        if "image" in q_modality:
            num_query_images += 1

    d_modality = corpus[0]["modality"]
    unique_documents = len(set(corpus["text"])) if "text" in d_modality else 0

    for doc in tqdm.tqdm(corpus, desc="docs:"):
        if "text" in d_modality:
            doc_lens.append(len(doc["text"]))
        if "image" in d_modality:
            num_document_images += 1

    total_doc_len = sum(doc_lens)
    total_query_len = sum(queries_lens)
    num_documents = len(corpus)
    num_queries = len(queries)

    # PIL's Image.size is (width, height); keep that order when collecting
    # (the pre-fix code appended them swapped).
    d_img_widths, d_img_heights = [], []
    for img in (doc["image"] for doc in corpus if "image" in d_modality):
        width, height = img.size
        d_img_widths.append(width)
        d_img_heights.append(height)

    q_img_widths, q_img_heights = [], []
    for img in (query["image"] for query in queries if "image" in q_modality):
        width, height = img.size
        q_img_widths.append(width)
        q_img_heights.append(height)

    # Number of relevant documents per query, counting only qrels whose
    # query id belongs to the selected split.
    queries_set = set(queries["id"])
    qrels_lengths = [
        len(relevant_docs[qid])
        for qid in tqdm.tqdm(relevant_docs.keys(), desc="qrels:")
        if qid in queries_set
    ]
    num_qrels = sum(qrels_lengths)
    # Guard on the actual divisor (relevant_docs), not on num_queries.
    qrels_per_doc = num_qrels / len(relevant_docs) if relevant_docs else 0
    unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})

    return Any2AnyMutipleChoiceDescriptiveStatistics(
        number_of_characters=total_query_len + total_doc_len,
        num_samples=num_documents + num_queries,
        num_queries=num_queries,
        num_documents=num_documents,
        min_document_length=min(doc_lens) if doc_lens else 0,
        average_document_length=total_doc_len / len(doc_lens) if doc_lens else 0,
        max_document_length=max(doc_lens) if doc_lens else 0,
        unique_documents=unique_documents,
        min_document_image_width=min(d_img_widths) if d_img_widths else 0,
        average_document_image_width=sum(d_img_widths) / len(d_img_widths)
        if d_img_widths
        else 0,
        max_document_image_width=max(d_img_widths) if d_img_widths else 0,
        min_document_image_height=min(d_img_heights) if d_img_heights else 0,
        average_document_image_height=sum(d_img_heights) / len(d_img_heights)
        if d_img_heights
        else 0,
        max_document_image_height=max(d_img_heights) if d_img_heights else 0,
        num_document_images=num_document_images,
        min_query_length=min(queries_lens) if queries_lens else 0,
        average_query_length=total_query_len / len(queries_lens)
        if queries_lens
        else 0,
        max_query_length=max(queries_lens) if queries_lens else 0,
        unique_queries=unique_queries,
        num_query_images=num_query_images,
        min_query_image_width=min(q_img_widths) if q_img_widths else 0,
        average_query_image_width=sum(q_img_widths) / len(q_img_widths)
        if q_img_widths
        else 0,
        max_query_image_width=max(q_img_widths) if q_img_widths else 0,
        min_query_image_height=min(q_img_heights) if q_img_heights else 0,
        average_query_image_height=sum(q_img_heights) / len(q_img_heights)
        if q_img_heights
        else 0,
        max_query_image_height=max(q_img_heights) if q_img_heights else 0,
        # Empty-qrel guard: min()/max() on an empty list would raise.
        min_relevant_docs_per_query=min(qrels_lengths) if qrels_lengths else 0,
        average_relevant_docs_per_query=qrels_per_doc,
        max_relevant_docs_per_query=max(qrels_lengths) if qrels_lengths else 0,
        unique_relevant_docs=unique_qrels,
    )


def process_language(relevant_docs, queries, corpus, lang=None):
Expand Down Expand Up @@ -448,13 +623,36 @@ def process_language(relevant_docs, queries, corpus, lang=None):
def calculate_length(queries, corpus):
    """Return the average query length and average document length.

    Args:
        queries: Iterable of query texts; each contributes its ``len``.
        corpus: Iterable of documents; only PIL images are counted, each with
            a fixed length of 1.0 (other document types contribute nothing).

    Returns:
        Tuple ``(query_len, doc_len)`` of averages; 0 when a collection
        contributes no lengths.
    """
    queries_lens = []
    doc_lens = []
    for query in queries:
        queries_lens.append(len(query))

    for doc in corpus:
        if isinstance(doc, Image.Image):
            doc_lens.append(1.0)  # for image append 1. Can perhaps be removed.

    doc_len = sum(doc_lens) / len(doc_lens) if doc_lens else 0
    query_len = sum(queries_lens) / len(queries_lens) if queries_lens else 0
    return query_len, doc_len


def process_relevant_docs(
    collection: dict[str, dict[str, dict[str, dict[str, int]]]],
    hf_subset: str,
    split: str,
) -> dict[str, dict[str, int]]:
    """Flatten one subset/split of a qrels collection.

    Query and document ids are prefixed with the split and subset so that
    ids coming from different splits/subsets cannot collide.
    """
    flattened: dict[str, dict[str, int]] = {}
    for query_id, relevant in collection[hf_subset][split].items():
        renamed_docs: dict[str, int] = {}
        for doc_id, value in relevant.items():
            renamed_docs[f"{split}_{hf_subset}_{doc_id}"] = value
        flattened[f"{split}_{hf_subset}_{query_id}"] = renamed_docs
    return flattened


def process_docs(
collection: dict[str, dict[str, dict[str, str] | str]], hf_subset: str, split: str
) -> dict[str, str]:
"""Collections can contain overlapping ids in different splits. Prepend split to avoid this"""
return {
f"{split}_{hf_subset}_{k}": v for k, v in collection[hf_subset][split].items()
}
Loading