diff --git a/mteb/models/model_implementations/arctic_models.py b/mteb/models/model_implementations/arctic_models.py index cf17925898..1c9508f026 100644 --- a/mteb/models/model_implementations/arctic_models.py +++ b/mteb/models/model_implementations/arctic_models.py @@ -4,6 +4,26 @@ ) from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader +ARCTIC_V1_CITATION = """@article{merrick2024embedding, + title={Embedding And Clustering Your Data Can Improve Contrastive Pretraining}, + author={Merrick, Luke}, + journal={arXiv preprint arXiv:2407.18887}, + year={2024}, + eprint={2407.18887}, + archivePrefix={arXiv}, + url={https://arxiv.org/abs/2407.18887} +}""" + +ARCTIC_V2_CITATION = """@article{yu2024arctic, + title={Arctic-Embed 2.0: Multilingual Retrieval Without Compromise}, + author={Yu, Puxuan and Merrick, Luke and Nuti, Gaurav and Campos, Daniel}, + journal={arXiv preprint arXiv:2412.04506}, + year={2024}, + eprint={2412.04506}, + archivePrefix={arXiv}, + url={https://arxiv.org/abs/2412.04506} +}""" + LANGUAGES_V2_0 = [ "afr-Latn", "ara-Arab", @@ -138,6 +158,7 @@ public_training_code=None, public_training_data=None, training_datasets=arctic_v1_training_datasets, + citation=ARCTIC_V1_CITATION, ) @@ -162,6 +183,7 @@ public_training_code=None, public_training_data=None, # couldn't find training_datasets=arctic_v1_training_datasets, + citation=ARCTIC_V1_CITATION, ) @@ -186,6 +208,7 @@ public_training_code=None, public_training_data=None, # couldn't find training_datasets=arctic_v1_training_datasets, + citation=ARCTIC_V1_CITATION, ) arctic_embed_m_long = ModelMeta( @@ -210,6 +233,7 @@ public_training_code=None, public_training_data=None, # couldn't find training_datasets=arctic_v1_training_datasets, + citation=ARCTIC_V1_CITATION, ) arctic_embed_l = ModelMeta( @@ -233,6 +257,7 @@ public_training_code=None, public_training_data=None, # couldn't find training_datasets=arctic_v1_training_datasets, + citation=ARCTIC_V1_CITATION, ) arctic_embed_m_v1_5 
= ModelMeta( @@ -261,6 +286,7 @@ public_training_code=None, public_training_data=None, training_datasets=arctic_v1_training_datasets, + citation=ARCTIC_V1_CITATION, ) arctic_embed_m_v2_0 = ModelMeta( @@ -285,6 +311,7 @@ public_training_code=None, public_training_data=None, # couldn't find training_datasets=arctic_v2_training_datasets, + citation=ARCTIC_V2_CITATION, ) arctic_embed_l_v2_0 = ModelMeta( @@ -308,4 +335,5 @@ public_training_code=None, public_training_data=None, # couldn't find training_datasets=arctic_v2_training_datasets, + citation=ARCTIC_V2_CITATION, ) diff --git a/mteb/models/model_implementations/b1ade_models.py b/mteb/models/model_implementations/b1ade_models.py index 2440780897..721ee50aaf 100644 --- a/mteb/models/model_implementations/b1ade_models.py +++ b/mteb/models/model_implementations/b1ade_models.py @@ -33,4 +33,13 @@ # BAAI/bge-large-en-v1.5 # mixedbread-ai/mxbai-embed-large-v1 # avsolatorio/GIST-large-Embedding-v0 + citation=""" + @misc{bigscience_workshop_2022, + author = { {Shreyas Subramanian} }, + title = { {b1ade series of models} }, + year = 2024, + url = { https://huggingface.co/w601sxs/b1ade-embed }, + publisher = { Hugging Face } +} +""", ) diff --git a/mteb/models/model_implementations/bge_models.py b/mteb/models/model_implementations/bge_models.py index c730639eab..9fdfc0e5fa 100644 --- a/mteb/models/model_implementations/bge_models.py +++ b/mteb/models/model_implementations/bge_models.py @@ -748,6 +748,17 @@ public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", training_datasets=E5_MISTRAL_TRAINING_DATA | bge_full_data, adapted_from="intfloat/e5-mistral-7b-instruct", + citation=""" + @misc{li2024makingtextembeddersfewshot, + title={Making Text Embedders Few-Shot Learners}, + author={Chaofan Li and MingHao Qin and Shitao Xiao and Jianlyu Chen and Kun Luo and Yingxia Shao and Defu Lian and Zheng Liu}, + year={2024}, + eprint={2409.15700}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + 
url={https://arxiv.org/abs/2409.15700}, +} +""", ) bge_m3_unsupervised = ModelMeta( diff --git a/mteb/models/model_implementations/e5_v.py b/mteb/models/model_implementations/e5_v.py index 2ea035ae17..cc1a99f4b8 100644 --- a/mteb/models/model_implementations/e5_v.py +++ b/mteb/models/model_implementations/e5_v.py @@ -14,6 +14,16 @@ "4.44.2" # Issue 1647: Only works with transformers==4.44.2. ) +E5_V_CITATION = """@article{jiang2024e5v, + title={E5-V: Universal Embeddings with Multimodal Large Language Models}, + author={Jiang, Ting and Song, Minghui and Zhang, Zihan and Huang, Haizhen and Deng, Weiwei and Sun, Feng and Zhang, Qi and Wang, Deqing and Zhuang, Fuzhen}, + journal={arXiv preprint arXiv:2407.12580}, + year={2024}, + eprint={2407.12580}, + archivePrefix={arXiv}, + url={https://arxiv.org/abs/2407.12580} +}""" + class E5VModel(AbsEncoder): def __init__( @@ -169,4 +179,5 @@ def encode( training_datasets=set( # princeton-nlp/datasets-for-simcse ), + citation=E5_V_CITATION, ) diff --git a/mteb/models/model_implementations/evaclip_models.py b/mteb/models/model_implementations/evaclip_models.py index f0783e0593..b88a4ced18 100644 --- a/mteb/models/model_implementations/evaclip_models.py +++ b/mteb/models/model_implementations/evaclip_models.py @@ -11,6 +11,13 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction from mteb.types import Array, BatchedInput, PromptType +EVA_CLIP_CITATION = """@article{EVA-CLIP, + title={EVA-CLIP: Improved Training Techniques for CLIP at Scale}, + author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue}, + journal={arXiv preprint arXiv:2303.15389}, + year={2023} +}""" + def evaclip_loader(model_name, **kwargs): try: @@ -148,6 +155,7 @@ def encode( similarity_fn_name=ScoringFunction.COSINE, use_instructions=False, training_datasets=training_datasets, + citation=EVA_CLIP_CITATION, ) EVA02_CLIP_L_14 = ModelMeta( @@ -170,6 +178,7 @@ def encode( similarity_fn_name=ScoringFunction.COSINE, 
use_instructions=False, training_datasets=training_datasets, + citation=EVA_CLIP_CITATION, ) EVA02_CLIP_bigE_14 = ModelMeta( @@ -192,6 +201,7 @@ def encode( similarity_fn_name=ScoringFunction.COSINE, use_instructions=False, training_datasets=laion_2b, + citation=EVA_CLIP_CITATION, ) @@ -215,4 +225,5 @@ def encode( similarity_fn_name=ScoringFunction.COSINE, use_instructions=False, training_datasets=laion_2b, + citation=EVA_CLIP_CITATION, ) diff --git a/mteb/models/model_implementations/fa_models.py b/mteb/models/model_implementations/fa_models.py index ec379d4b4e..415becd890 100644 --- a/mteb/models/model_implementations/fa_models.py +++ b/mteb/models/model_implementations/fa_models.py @@ -27,6 +27,15 @@ # Persian Wikipedia # Other data crawled from websites like bigbangpage.com, chetor.com, eligasht.com/blog, digikala.com/mag, and ted.com/talks. ), + citation=""" + @article{ParsBERT, + title={ParsBERT: Transformer-based Model for Persian Language Understanding}, + author={Mehrdad Farahani, Mohammad Gharachorloo, Marzieh Farahani, Mohammad Manthouri}, + journal={ArXiv}, + year={2020}, + volume={abs/2005.12515} +} +""", ) bert_zwnj = ModelMeta( diff --git a/mteb/models/model_implementations/gme_v_models.py b/mteb/models/model_implementations/gme_v_models.py index f347a95ecb..ae8f3c2254 100644 --- a/mteb/models/model_implementations/gme_v_models.py +++ b/mteb/models/model_implementations/gme_v_models.py @@ -14,6 +14,16 @@ logger = logging.getLogger(__name__) +GME_CITATION = """@misc{zhang2024gme, + title={GME: Improving Universal Multimodal Retrieval by Multimodal LLMs}, + author={Zhang, Xin and Zhang, Yanzhao and Xie, Wen and Li, Mingxin and Dai, Ziqi and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Li, Wenjie and Zhang, Min}, + year={2024}, + eprint={2412.16855}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={http://arxiv.org/abs/2412.16855} +}""" + class Encoder(torch.nn.Module): def __init__( @@ -349,6 +359,7 @@ def fetch_image( 
public_training_code=None, public_training_data=None, training_datasets=training_data, + citation=GME_CITATION, ) gme_qwen2vl_7b = ModelMeta( @@ -371,4 +382,5 @@ def fetch_image( public_training_code=None, public_training_data=None, training_datasets=training_data, + citation=GME_CITATION, ) diff --git a/mteb/models/model_implementations/mcinext_models.py b/mteb/models/model_implementations/mcinext_models.py index 367878acb4..bfeb9c0e83 100644 --- a/mteb/models/model_implementations/mcinext_models.py +++ b/mteb/models/model_implementations/mcinext_models.py @@ -12,6 +12,13 @@ logger = logging.getLogger(__name__) +HAKIM_CITATION = """@article{sarmadi2025hakim, + title={Hakim: Farsi Text Embedding Model}, + author={Sarmadi, Mehran and Alikhani, Morteza and Zinvandi, Erfan and Pourbahman, Zahra}, + journal={arXiv preprint arXiv:2505.08435}, + year={2025} +}""" + MODEL_API_NAMES = { "hakim": "Hakim", "hakim-small": "Hakim_small", @@ -396,6 +403,7 @@ def encode( "SynPerSTS", "Query2Query", }, + citation=HAKIM_CITATION, ) @@ -462,6 +470,7 @@ def encode( "SynPerSTS", "Query2Query", }, + citation=HAKIM_CITATION, ) hakim_unsup = ModelMeta( @@ -491,4 +500,5 @@ def encode( "MSMARCO-Fa", "Query2Query", }, + citation=HAKIM_CITATION, ) diff --git a/mteb/models/model_implementations/model2vec_models.py b/mteb/models/model_implementations/model2vec_models.py index 37c41dd4b6..f0ce608aa3 100644 --- a/mteb/models/model_implementations/model2vec_models.py +++ b/mteb/models/model_implementations/model2vec_models.py @@ -14,6 +14,13 @@ logger = logging.getLogger(__name__) +MODEL2VEC_CITATION = """@software{minishlab2024model2vec, + authors = {Stephan Tulkens, Thomas van Dongen}, + title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model}, + year = {2024}, + url = {https://github.com/MinishLab/model2vec} +}""" + _POTION_MULTILINGUAL_128M_LANGUAGES = [ "afr-Latn", "amh-Ethi", @@ -172,6 +179,7 @@ def encode( training_datasets=bge_training_data, # distilled 
public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, + citation=MODEL2VEC_CITATION, ) @@ -196,6 +204,7 @@ def encode( training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, + citation=MODEL2VEC_CITATION, ) m2v_base_output = ModelMeta( @@ -219,6 +228,7 @@ def encode( training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, + citation=MODEL2VEC_CITATION, ) m2v_multilingual_output = ModelMeta( @@ -242,6 +252,7 @@ def encode( training_datasets=None, public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, + citation=MODEL2VEC_CITATION, ) potion_base_2m = ModelMeta( @@ -265,6 +276,7 @@ def encode( training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, + citation=MODEL2VEC_CITATION, ) potion_base_4m = ModelMeta( @@ -288,6 +300,7 @@ def encode( training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, + citation=MODEL2VEC_CITATION, ) potion_base_8m = ModelMeta( @@ -311,6 +324,7 @@ def encode( training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, + citation=MODEL2VEC_CITATION, ) potion_multilingual_128m = ModelMeta( @@ -334,6 +348,7 @@ def encode( training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, + citation=MODEL2VEC_CITATION, ) pubmed_bert_100k = ModelMeta( diff --git a/mteb/models/model_implementations/nomic_models_vision.py b/mteb/models/model_implementations/nomic_models_vision.py index 7dbcd3a4ed..54cac9dc44 100644 --- a/mteb/models/model_implementations/nomic_models_vision.py +++ 
b/mteb/models/model_implementations/nomic_models_vision.py @@ -12,6 +12,16 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction from mteb.types import Array, BatchedInput, PromptType +NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision, + title={Nomic Embed Vision: Expanding the Latent Space}, + author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy}, + journal={arXiv preprint arXiv:2406.18587}, + year={2024}, + eprint={2406.18587}, + archivePrefix={arXiv}, + url={https://arxiv.org/abs/2406.18587} +}""" + class NomicVisionModel(AbsEncoder): def __init__( @@ -174,4 +184,5 @@ def encode( # https://arxiv.org/pdf/2406.18587 # DFN-2B ), + citation=NOMIC_EMBED_VISION_CITATION, ) diff --git a/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py b/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py index 36ee416e70..b2ef974d2f 100644 --- a/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +++ b/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py @@ -9,6 +9,16 @@ from mteb.models.model_meta import ModelMeta from mteb.types import Array, BatchedInput, PromptType +LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming, + title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model}, + author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge}, + year={2025}, + eprint={2507.05513}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2507.05513} +}""" + class LlamaNemoretrieverColembed(AbsEncoder): def __init__( @@ -148,6 +158,7 @@ def encode( similarity_fn_name="MaxSim", use_instructions=True, training_datasets=TRAINING_DATA, + citation=LLAMA_NEMORETRIEVER_CITATION, ) llama_nemoretriever_colembed_3b_v1 = ModelMeta( @@ -173,4 +184,5 @@ def encode( similarity_fn_name="MaxSim", use_instructions=True, 
training_datasets=TRAINING_DATA, + citation=LLAMA_NEMORETRIEVER_CITATION, ) diff --git a/mteb/models/model_implementations/nvidia_models.py b/mteb/models/model_implementations/nvidia_models.py index 40d2cdd4fd..e80e298640 100644 --- a/mteb/models/model_implementations/nvidia_models.py +++ b/mteb/models/model_implementations/nvidia_models.py @@ -18,6 +18,16 @@ logger = logging.getLogger(__name__) +NV_RETRIEVER_CITATION = """@misc{moreira2025nvretrieverimprovingtextembedding, + title={NV-Retriever: Improving text embedding models with effective hard-negative mining}, + author={Gabriel de Souza P. Moreira and Radek Osmulski and Mengyao Xu and Ronay Ak and Benedikt Schifferer and Even Oldridge}, + year={2025}, + eprint={2407.15831}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2407.15831} +}""" + def instruction_template( instruction: str, prompt_type: PromptType | None = None @@ -117,6 +127,7 @@ def instruction_template( training_datasets=nvidia_training_datasets, public_training_code=None, public_training_data=None, + citation=NV_RETRIEVER_CITATION, ) NV_embed_v1 = ModelMeta( @@ -146,6 +157,7 @@ def instruction_template( training_datasets=nvidia_training_datasets, public_training_code=None, public_training_data=None, + citation=NV_RETRIEVER_CITATION, ) llama_embed_nemotron_evaluated_languages = [ @@ -533,4 +545,5 @@ def _extract_embeddings( public_training_code=None, # Will be released later public_training_data=None, # Will be released later contacts=["ybabakhin"], + citation=NV_RETRIEVER_CITATION, ) diff --git a/mteb/models/model_implementations/salesforce_models.py b/mteb/models/model_implementations/salesforce_models.py index ca09750e2f..219ae13ef6 100644 --- a/mteb/models/model_implementations/salesforce_models.py +++ b/mteb/models/model_implementations/salesforce_models.py @@ -100,6 +100,13 @@ def instruction_template( public_training_code=None, public_training_data=None, training_datasets=None, + 
citation="""@article{liu2024codexembed, + title={CodeXEmbed: A Generalist Embedding Model Family for Multilingual and Multi-task Code Retrieval}, + author={Liu, Ye and Meng, Rui and Joty, Shafiq and Savarese, Silvio and Xiong, Caiming and Zhou, Yingbo and Yavuz, Semih}, + journal={arXiv preprint arXiv:2411.12644}, + year={2024} +} +""", ) SFR_Embedding_Mistral = ModelMeta( @@ -129,4 +136,13 @@ def instruction_template( public_training_code=None, public_training_data=None, training_datasets=SFR_TRAINING_DATA, + citation=""" + @misc{SFRAIResearch2024, + title={SFR-Embedding-Mistral:Enhance Text Retrieval with Transfer Learning}, + author={Rui Meng, Ye Liu, Shafiq Rayhan Joty, Caiming Xiong, Yingbo Zhou, Semih Yavuz}, + howpublished={Salesforce AI Research Blog}, + year={2024}, + url={https://www.salesforce.com/blog/sfr-embedding/} +} +""", ) diff --git a/mteb/models/model_implementations/searchmap_models.py b/mteb/models/model_implementations/searchmap_models.py index 4b341fc6d9..c3b1060532 100644 --- a/mteb/models/model_implementations/searchmap_models.py +++ b/mteb/models/model_implementations/searchmap_models.py @@ -37,4 +37,12 @@ public_training_data=None, training_datasets=None, adapted_from="NovaSearch/stella_en_400M_v5", + citation="""@misc{vectorpath2025searchmap, + title={SearchMap: Conversational E-commerce Search Embedding Model}, + author={VectorPath Research Team}, + year={2025}, + publisher={Hugging Face}, + journal={HuggingFace Model Hub}, +} +""", )