Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2151,3 +2151,66 @@
""",
contacts=["BaoLocPham"],
)

# Benchmark definition for the Jina Visual Document Retrieval (JinaVDR) suite.
# It bundles the 43 task names listed below, resolved through `get_tasks`,
# and is registered under the name "JinaVDR".
JINA_VDR = Benchmark(
    name="JinaVDR",
    display_name="Jina Visual Document Retrieval",
    tasks=get_tasks(
        # The list order is kept exactly as authored.
        # NOTE(review): "JinaVDRDocQAAI" is the only entry without the
        # "Retrieval" suffix, and "JinaVDRAutomobileCatelogRetrieval" spells
        # "Catelog". Both are left untouched because they presumably must
        # match the registered task names -- confirm against the task
        # definitions before renaming anything here.
        tasks=[
            "JinaVDRMedicalPrescriptionsRetrieval",
            "JinaVDRStanfordSlideRetrieval",
            "JinaVDRDonutVQAISynHMPRetrieval",
            "JinaVDRTableVQARetrieval",
            "JinaVDRChartQARetrieval",
            "JinaVDRTQARetrieval",
            "JinaVDROpenAINewsRetrieval",
            "JinaVDREuropeanaDeNewsRetrieval",
            "JinaVDREuropeanaEsNewsRetrieval",
            "JinaVDREuropeanaItScansRetrieval",
            "JinaVDREuropeanaNlLegalRetrieval",
            "JinaVDRHindiGovVQARetrieval",
            "JinaVDRAutomobileCatelogRetrieval",
            "JinaVDRBeveragesCatalogueRetrieval",
            "JinaVDRRamensBenchmarkRetrieval",
            "JinaVDRJDocQARetrieval",
            "JinaVDRHungarianDocQARetrieval",
            "JinaVDRArabicChartQARetrieval",
            "JinaVDRArabicInfographicsVQARetrieval",
            "JinaVDROWIDChartsRetrieval",
            "JinaVDRMPMQARetrieval",
            "JinaVDRJina2024YearlyBookRetrieval",
            "JinaVDRWikimediaCommonsMapsRetrieval",
            "JinaVDRPlotQARetrieval",
            "JinaVDRMMTabRetrieval",
            "JinaVDRCharXivOCRRetrieval",
            "JinaVDRStudentEnrollmentSyntheticRetrieval",
            "JinaVDRGitHubReadmeRetrieval",
            "JinaVDRTweetStockSyntheticsRetrieval",
            "JinaVDRAirbnbSyntheticRetrieval",
            "JinaVDRShanghaiMasterPlanRetrieval",
            "JinaVDRWikimediaCommonsDocumentsRetrieval",
            "JinaVDREuropeanaFrNewsRetrieval",
            "JinaVDRDocQAHealthcareIndustryRetrieval",
            "JinaVDRDocQAAI",
            "JinaVDRShiftProjectRetrieval",
            "JinaVDRTatQARetrieval",
            "JinaVDRInfovqaRetrieval",
            "JinaVDRDocVQARetrieval",
            "JinaVDRDocQAGovReportRetrieval",
            "JinaVDRTabFQuadRetrieval",
            "JinaVDRDocQAEnergyRetrieval",
            "JinaVDRArxivQARetrieval",
        ],
    ),
    description="Multilingual, domain-diverse and layout-rich document retrieval benchmark.",
    reference="https://arxiv.org/abs/2506.18902",
    # BibTeX entry for the paper linked in `reference`. This is a raw string:
    # its contents (including the absence of leading whitespace) are runtime
    # text and are reproduced verbatim.
    citation=r"""@misc{günther2025jinaembeddingsv4universalembeddingsmultimodal,
archiveprefix = {arXiv},
author = {Michael Günther and Saba Sturua and Mohammad Kalim Akram and Isabelle Mohr and Andrei Ungureanu and Bo Wang and Sedigheh Eslami and Scott Martens and Maximilian Werk and Nan Wang and Han Xiao},
eprint = {2506.18902},
primaryclass = {cs.AI},
title = {jina-embeddings-v4: Universal Embeddings for Multimodal Multilingual Retrieval},
url = {https://arxiv.org/abs/2506.18902},
year = {2025},
}""",
)
1 change: 1 addition & 0 deletions mteb/custom_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"cc-by-4.0",
"cc-by-sa-3.0",
"cc-by-sa-4.0",
"cc-by-nc-3.0",
"cc-by-nc-4.0",
"cc-by-nc-sa-3.0",
"cc-by-nc-sa-4.0",
Expand Down
1 change: 1 addition & 0 deletions mteb/leaderboard/benchmark_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class MenuEntry:
"MIEB(lite)",
"MIEB(Img)",
"VisualDocumentRetrieval",
"JinaVDR",
]
),
),
Expand Down
2 changes: 1 addition & 1 deletion mteb/models/colpali_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def calculate_probs(self, text_embeddings, image_embeddings):
return scores.softmax(dim=-1)

def similarity(self, a, b):
    """Score ``a`` against ``b`` using the wrapped processor.

    Delegates to ``self.processor.score``, forwarding ``a`` and ``b``
    positionally and expanding ``self.processor_kwargs`` as keyword
    arguments, then returns whatever that call returns.

    Args:
        a: First operand handed to ``processor.score`` (presumably query
            embeddings -- TODO confirm against callers).
        b: Second operand handed to ``processor.score`` (presumably
            document/image embeddings -- TODO confirm against callers).

    Returns:
        The result of ``self.processor.score(a, b, **self.processor_kwargs)``.
    """
    # The extracted diff left the pre-change call (without kwargs) directly
    # above this one, which made this line unreachable. Only the post-change
    # call is kept, so the processor kwargs actually reach the scorer.
    return self.processor.score(a, b, **self.processor_kwargs)


class ColPaliWrapper(ColPaliEngineWrapper):
Expand Down
6 changes: 2 additions & 4 deletions mteb/models/jina_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,9 +341,7 @@ def get_text_embeddings(
return self.model.encode_text(
texts=texts,
batch_size=batch_size,
return_multivector=True
if task_type.startswith("DocumentUnderstanding")
else False,
return_multivector=self.vector_type == "multi_vector",
prompt_name=prompt_name_param,
task=base_task,
return_numpy=return_numpy,
Expand Down Expand Up @@ -389,7 +387,7 @@ def get_image_embeddings(
images=all_images,
batch_size=batch_size,
max_pixels=max_pixels,
return_multivector=True,
return_multivector=self.vector_type == "multi_vector",
task=base_task,
return_numpy=return_numpy,
)
Expand Down
1 change: 1 addition & 0 deletions mteb/tasks/Image/Any2AnyRetrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from .eng.VQA2IT2TRetrieval import *
from .eng.WebQAT2ITRetrieval import *
from .eng.WebQAT2TRetrieval import *
from .multilingual.JinaVDRBenchRetrieval import *
from .multilingual.MIRACLVisionRetrieval import *
from .multilingual.VdrMultilingualRetrieval import *
from .multilingual.Vidore2BenchRetrieval import *
Expand Down
Loading
Loading