Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions mteb/models/model_implementations/arctic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,26 @@
)
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader

# BibTeX entry for the Arctic Embed v1 report (arXiv:2407.18887); passed as
# `citation=` on the v1-family ModelMeta definitions below.
ARCTIC_V1_CITATION = """@article{merrick2024embedding,
title={Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
author={Merrick, Luke},
journal={arXiv preprint arXiv:2407.18887},
year={2024},
eprint={2407.18887},
archivePrefix={arXiv},
url={https://arxiv.org/abs/2407.18887}
}"""

# BibTeX entry for the Arctic Embed 2.0 report (arXiv:2412.04506); passed as
# `citation=` on the v2.0 ModelMeta definitions below.
ARCTIC_V2_CITATION = """@article{yu2024arctic,
title={Arctic-Embed 2.0: Multilingual Retrieval Without Compromise},
author={Yu, Puxuan and Merrick, Luke and Nuti, Gaurav and Campos, Daniel},
journal={arXiv preprint arXiv:2412.04506},
year={2024},
eprint={2412.04506},
archivePrefix={arXiv},
url={https://arxiv.org/abs/2412.04506}
}"""

LANGUAGES_V2_0 = [
"afr-Latn",
"ara-Arab",
Expand Down Expand Up @@ -138,6 +158,7 @@
public_training_code=None,
public_training_data=None,
training_datasets=arctic_v1_training_datasets,
citation=ARCTIC_V1_CITATION,
)


Expand All @@ -162,6 +183,7 @@
public_training_code=None,
public_training_data=None, # couldn't find
training_datasets=arctic_v1_training_datasets,
citation=ARCTIC_V1_CITATION,
)


Expand All @@ -186,6 +208,7 @@
public_training_code=None,
public_training_data=None, # couldn't find
training_datasets=arctic_v1_training_datasets,
citation=ARCTIC_V1_CITATION,
)

arctic_embed_m_long = ModelMeta(
Expand All @@ -210,6 +233,7 @@
public_training_code=None,
public_training_data=None, # couldn't find
training_datasets=arctic_v1_training_datasets,
citation=ARCTIC_V1_CITATION,
)

arctic_embed_l = ModelMeta(
Expand All @@ -233,6 +257,7 @@
public_training_code=None,
public_training_data=None, # couldn't find
training_datasets=arctic_v1_training_datasets,
citation=ARCTIC_V1_CITATION,
)

arctic_embed_m_v1_5 = ModelMeta(
Expand Down Expand Up @@ -261,6 +286,7 @@
public_training_code=None,
public_training_data=None,
training_datasets=arctic_v1_training_datasets,
citation=ARCTIC_V1_CITATION,
)

arctic_embed_m_v2_0 = ModelMeta(
Expand All @@ -285,6 +311,7 @@
public_training_code=None,
public_training_data=None, # couldn't find
training_datasets=arctic_v2_training_datasets,
citation=ARCTIC_V2_CITATION,
)

arctic_embed_l_v2_0 = ModelMeta(
Expand All @@ -308,4 +335,5 @@
public_training_code=None,
public_training_data=None, # couldn't find
training_datasets=arctic_v2_training_datasets,
citation=ARCTIC_V2_CITATION,
)
9 changes: 9 additions & 0 deletions mteb/models/model_implementations/b1ade_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,13 @@
# BAAI/bge-large-en-v1.5
# mixedbread-ai/mxbai-embed-large-v1
# avsolatorio/GIST-large-Embedding-v0
citation="""
@misc{bigscience_workshop_2022,
author = { {Shreyas Subramanian} },
title = { {b1ade series of models} },
year = 2024,
url = { https://huggingface.co/w601sxs/b1ade-embed },
publisher = { Hugging Face }
}
""",
)
11 changes: 11 additions & 0 deletions mteb/models/model_implementations/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,17 @@
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
training_datasets=E5_MISTRAL_TRAINING_DATA | bge_full_data,
adapted_from="intfloat/e5-mistral-7b-instruct",
citation="""
@misc{li2024makingtextembeddersfewshot,
title={Making Text Embedders Few-Shot Learners},
author={Chaofan Li and MingHao Qin and Shitao Xiao and Jianlyu Chen and Kun Luo and Yingxia Shao and Defu Lian and Zheng Liu},
year={2024},
eprint={2409.15700},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2409.15700},
}
""",
)

bge_m3_unsupervised = ModelMeta(
Expand Down
11 changes: 11 additions & 0 deletions mteb/models/model_implementations/e5_v.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@
"4.44.2" # Issue 1647: Only works with transformers==4.44.2.
)

# BibTeX entry for the E5-V paper (arXiv:2407.12580); attached via `citation=`
# to the e5_v ModelMeta at the bottom of this module.
E5_V_CITATION = """@article{jiang2024e5v,
title={E5-V: Universal Embeddings with Multimodal Large Language Models},
author={Jiang, Ting and Song, Minghui and Zhang, Zihan and Huang, Haizhen and Deng, Weiwei and Sun, Feng and Zhang, Qi and Wang, Deqing and Zhuang, Fuzhen},
journal={arXiv preprint arXiv:2407.12580},
year={2024},
eprint={2407.12580},
archivePrefix={arXiv},
url={https://arxiv.org/abs/2407.12580}
}"""


class E5VModel(AbsEncoder):
def __init__(
Expand Down Expand Up @@ -169,4 +179,5 @@ def encode(
training_datasets=set(
# princeton-nlp/datasets-for-simcse
),
citation=E5_V_CITATION,
)
11 changes: 11 additions & 0 deletions mteb/models/model_implementations/evaclip_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
from mteb.models.model_meta import ModelMeta, ScoringFunction
from mteb.types import Array, BatchedInput, PromptType

# BibTeX entry for EVA-CLIP (arXiv:2303.15389); shared by all EVA02 CLIP
# ModelMeta definitions in this module via `citation=`.
# NOTE: the hyphenated key "EVA-CLIP" is legal BibTeX, though unusual.
EVA_CLIP_CITATION = """@article{EVA-CLIP,
title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2303.15389},
year={2023}
}"""


def evaclip_loader(model_name, **kwargs):
try:
Expand Down Expand Up @@ -148,6 +155,7 @@ def encode(
similarity_fn_name=ScoringFunction.COSINE,
use_instructions=False,
training_datasets=training_datasets,
citation=EVA_CLIP_CITATION,
)

EVA02_CLIP_L_14 = ModelMeta(
Expand All @@ -170,6 +178,7 @@ def encode(
similarity_fn_name=ScoringFunction.COSINE,
use_instructions=False,
training_datasets=training_datasets,
citation=EVA_CLIP_CITATION,
)

EVA02_CLIP_bigE_14 = ModelMeta(
Expand All @@ -192,6 +201,7 @@ def encode(
similarity_fn_name=ScoringFunction.COSINE,
use_instructions=False,
training_datasets=laion_2b,
citation=EVA_CLIP_CITATION,
)


Expand All @@ -215,4 +225,5 @@ def encode(
similarity_fn_name=ScoringFunction.COSINE,
use_instructions=False,
training_datasets=laion_2b,
citation=EVA_CLIP_CITATION,
)
9 changes: 9 additions & 0 deletions mteb/models/model_implementations/fa_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@
# Persian Wikipedia
# Other data crawled from websites like bigbangpage.com, chetor.com, eligasht.com/blog, digikala.com/mag, and ted.com/talks.
),
citation="""
@article{ParsBERT,
title={ParsBERT: Transformer-based Model for Persian Language Understanding},
author={Mehrdad Farahani, Mohammad Gharachorloo, Marzieh Farahani, Mohammad Manthouri},
journal={ArXiv},
year={2020},
volume={abs/2005.12515}
}
""",
)

bert_zwnj = ModelMeta(
Expand Down
12 changes: 12 additions & 0 deletions mteb/models/model_implementations/gme_v_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@

logger = logging.getLogger(__name__)

# BibTeX entry for GME (arXiv:2412.16855); shared by the gme_qwen2vl ModelMeta
# definitions via `citation=`.
# Fix: use an https arXiv URL, consistent with every other citation constant
# in this change set (the original had plain http://).
GME_CITATION = """@misc{zhang2024gme,
title={GME: Improving Universal Multimodal Retrieval by Multimodal LLMs},
author={Zhang, Xin and Zhang, Yanzhao and Xie, Wen and Li, Mingxin and Dai, Ziqi and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Li, Wenjie and Zhang, Min},
year={2024},
eprint={2412.16855},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.16855}
}"""


class Encoder(torch.nn.Module):
def __init__(
Expand Down Expand Up @@ -349,6 +359,7 @@ def fetch_image(
public_training_code=None,
public_training_data=None,
training_datasets=training_data,
citation=GME_CITATION,
)

gme_qwen2vl_7b = ModelMeta(
Expand All @@ -371,4 +382,5 @@ def fetch_image(
public_training_code=None,
public_training_data=None,
training_datasets=training_data,
citation=GME_CITATION,
)
10 changes: 10 additions & 0 deletions mteb/models/model_implementations/mcinext_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@

logger = logging.getLogger(__name__)

# BibTeX entry for Hakim (arXiv:2505.08435); shared by the hakim, hakim-small
# and hakim_unsup ModelMeta definitions via `citation=`.
HAKIM_CITATION = """@article{sarmadi2025hakim,
title={Hakim: Farsi Text Embedding Model},
author={Sarmadi, Mehran and Alikhani, Morteza and Zinvandi, Erfan and Pourbahman, Zahra},
journal={arXiv preprint arXiv:2505.08435},
year={2025}
}"""

MODEL_API_NAMES = {
"hakim": "Hakim",
"hakim-small": "Hakim_small",
Expand Down Expand Up @@ -396,6 +403,7 @@ def encode(
"SynPerSTS",
"Query2Query",
},
citation=HAKIM_CITATION,
)


Expand Down Expand Up @@ -462,6 +470,7 @@ def encode(
"SynPerSTS",
"Query2Query",
},
citation=HAKIM_CITATION,
)

hakim_unsup = ModelMeta(
Expand Down Expand Up @@ -491,4 +500,5 @@ def encode(
"MSMARCO-Fa",
"Query2Query",
},
citation=HAKIM_CITATION,
)
15 changes: 15 additions & 0 deletions mteb/models/model_implementations/model2vec_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@

logger = logging.getLogger(__name__)

# BibTeX entry for the Model2Vec software; shared by all Model2Vec-distilled
# ModelMeta definitions in this module via `citation=`.
# Fixes two BibTeX errors in the original string: the field name `authors`
# (BibTeX only recognizes `author`) and the comma-separated name list
# (BibTeX requires multiple names to be joined with " and ").
MODEL2VEC_CITATION = """@software{minishlab2024model2vec,
author = {Stephan Tulkens and Thomas van Dongen},
title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
year = {2024},
url = {https://github.com/MinishLab/model2vec}
}"""

_POTION_MULTILINGUAL_128M_LANGUAGES = [
"afr-Latn",
"amh-Ethi",
Expand Down Expand Up @@ -172,6 +179,7 @@ def encode(
training_datasets=bge_training_data, # distilled
public_training_code="https://github.com/MinishLab/model2vec",
public_training_data=None,
citation=MODEL2VEC_CITATION,
)


Expand All @@ -196,6 +204,7 @@ def encode(
training_datasets=bge_training_data, # distilled
public_training_code="https://github.com/MinishLab/model2vec",
public_training_data=None,
citation=MODEL2VEC_CITATION,
)

m2v_base_output = ModelMeta(
Expand All @@ -219,6 +228,7 @@ def encode(
training_datasets=bge_training_data, # distilled
public_training_code="https://github.com/MinishLab/model2vec",
public_training_data=None,
citation=MODEL2VEC_CITATION,
)

m2v_multilingual_output = ModelMeta(
Expand All @@ -242,6 +252,7 @@ def encode(
training_datasets=None,
public_training_code="https://github.com/MinishLab/model2vec",
public_training_data=None,
citation=MODEL2VEC_CITATION,
)

potion_base_2m = ModelMeta(
Expand All @@ -265,6 +276,7 @@ def encode(
training_datasets=bge_training_data, # distilled
public_training_code="https://github.com/MinishLab/model2vec",
public_training_data=None,
citation=MODEL2VEC_CITATION,
)

potion_base_4m = ModelMeta(
Expand All @@ -288,6 +300,7 @@ def encode(
training_datasets=bge_training_data, # distilled
public_training_code="https://github.com/MinishLab/model2vec",
public_training_data=None,
citation=MODEL2VEC_CITATION,
)

potion_base_8m = ModelMeta(
Expand All @@ -311,6 +324,7 @@ def encode(
training_datasets=bge_training_data, # distilled
public_training_code="https://github.com/MinishLab/model2vec",
public_training_data=None,
citation=MODEL2VEC_CITATION,
)

potion_multilingual_128m = ModelMeta(
Expand All @@ -334,6 +348,7 @@ def encode(
training_datasets=bge_training_data, # distilled
public_training_code="https://github.com/MinishLab/model2vec",
public_training_data=None,
citation=MODEL2VEC_CITATION,
)

pubmed_bert_100k = ModelMeta(
Expand Down
11 changes: 11 additions & 0 deletions mteb/models/model_implementations/nomic_models_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@
from mteb.models.model_meta import ModelMeta, ScoringFunction
from mteb.types import Array, BatchedInput, PromptType

# BibTeX entry for Nomic Embed Vision (arXiv:2406.18587); attached via
# `citation=` to the nomic vision ModelMeta at the bottom of this module.
NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision,
title={Nomic Embed Vision: Expanding the Latent Space},
author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy},
journal={arXiv preprint arXiv:2406.18587},
year={2024},
eprint={2406.18587},
archivePrefix={arXiv},
url={https://arxiv.org/abs/2406.18587}
}"""


class NomicVisionModel(AbsEncoder):
def __init__(
Expand Down Expand Up @@ -174,4 +184,5 @@ def encode(
# https://arxiv.org/pdf/2406.18587
# DFN-2B
),
citation=NOMIC_EMBED_VISION_CITATION,
)
Loading