diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py
index 61cce4e071..d437843237 100644
--- a/mteb/models/misc_models.py
+++ b/mteb/models/misc_models.py
@@ -1087,57 +1087,66 @@
     adapted_from="sdadas/polish-roberta-large-v2",
     superseded_by=None,
 )
+
+udever_dataset = {  # discussed here: https://github.com/embeddings-benchmark/mteb/issues/2193
+    "MSMARCO": [],
+    # SNLI
+    # MultiNLI
+}
+
+udever_languages = [
+    "aka_Latn",
+    "ara_Arab",
+    "asm_Beng",
+    "bam_Latn",
+    "ben_Beng",
+    "cat_Latn",
+    "eng_Latn",
+    "spa_Latn",
+    "eus_Latn",
+    "fon_Latn",
+    "fra_Latn",
+    "guj_Gujr",
+    "hin_Deva",
+    "ind_Latn",
+    "ibo_Latn",
+    "kik_Latn",
+    "kan_Knda",
+    "lug_Latn",
+    "lin_Latn",
+    "mal_Mlym",
+    "mar_Deva",
+    "nep_Deva",
+    "nso_Latn",
+    "nya_Latn",
+    "ori_Orya",
+    "pan_Guru",
+    "por_Latn",
+    "run_Latn",
+    "kin_Latn",
+    "sna_Latn",
+    "sot_Latn",
+    "swa_Latn",
+    "tam_Taml",
+    "tel_Telu",
+    "tsn_Latn",
+    "tso_Latn",
+    "tum_Latn",
+    "twi_Latn",
+    "urd_Arab",
+    "vie_Latn",
+    "wol_Latn",
+    "xho_Latn",
+    "yor_Latn",
+    "zho_Hans",
+    "zul_Latn",
+]
+
 izhx__udever_bloom_1b1 = ModelMeta(
     name="izhx/udever-bloom-1b1",
     revision="7bf1ee29878cb040b2708a691aa4b61f27eaa252",
     release_date="2023-10-24",
-    languages=[
-        "aka_Latn",
-        "ara_Arab",
-        "asm_Beng",
-        "bam_Latn",
-        "ben_Beng",
-        "cat_Latn",
-        "eng_Latn",
-        "spa_Latn",
-        "eus_Latn",
-        "fon_Latn",
-        "fra_Latn",
-        "guj_Gujr",
-        "hin_Deva",
-        "ind_Latn",
-        "ibo_Latn",
-        "kik_Latn",
-        "kan_Knda",
-        "lug_Latn",
-        "lin_Latn",
-        "mal_Mlym",
-        "mar_Deva",
-        "nep_Deva",
-        "nso_Latn",
-        "nya_Latn",
-        "ori_Orya",
-        "pan_Guru",
-        "por_Latn",
-        "run_Latn",
-        "kin_Latn",
-        "sna_Latn",
-        "sot_Latn",
-        "swa_Latn",
-        "tam_Taml",
-        "tel_Telu",
-        "tsn_Latn",
-        "tso_Latn",
-        "tum_Latn",
-        "twi_Latn",
-        "urd_Arab",
-        "vie_Latn",
-        "wol_Latn",
-        "xho_Latn",
-        "yor_Latn",
-        "zho_Hans",
-        "zul_Latn",
-    ],
+    languages=udever_languages,
     loader=None,
     n_parameters=None,
     memory_usage_mb=None,
@@ -1151,7 +1160,7 @@
     reference="https://huggingface.co/izhx/udever-bloom-1b1",
     similarity_fn_name="cosine",
     use_instructions=None,
-    training_datasets=None,
+    training_datasets=udever_dataset,
     adapted_from="bigscience/bloom-1b1",
     superseded_by=None,
 )
@@ -1159,53 +1168,7 @@
     name="izhx/udever-bloom-3b",
     revision="4edd8affe80ca89ba0f6b6ba4103fc7f25fc57b2",
     release_date="2023-10-24",
-    languages=[
-        "aka_Latn",
-        "ara_Arab",
-        "asm_Beng",
-        "bam_Latn",
-        "ben_Beng",
-        "cat_Latn",
-        "eng_Latn",
-        "spa_Latn",
-        "eus_Latn",
-        "fon_Latn",
-        "fra_Latn",
-        "guj_Gujr",
-        "hin_Deva",
-        "ind_Latn",
-        "ibo_Latn",
-        "kik_Latn",
-        "kan_Knda",
-        "lug_Latn",
-        "lin_Latn",
-        "mal_Mlym",
-        "mar_Deva",
-        "nep_Deva",
-        "nso_Latn",
-        "nya_Latn",
-        "ori_Orya",
-        "pan_Guru",
-        "por_Latn",
-        "run_Latn",
-        "kin_Latn",
-        "sna_Latn",
-        "sot_Latn",
-        "swa_Latn",
-        "tam_Taml",
-        "tel_Telu",
-        "tsn_Latn",
-        "tso_Latn",
-        "tum_Latn",
-        "twi_Latn",
-        "urd_Arab",
-        "vie_Latn",
-        "wol_Latn",
-        "xho_Latn",
-        "yor_Latn",
-        "zho_Hans",
-        "zul_Latn",
-    ],
+    languages=udever_languages,
     loader=None,
     n_parameters=None,
     memory_usage_mb=None,
@@ -1219,7 +1182,7 @@
     reference="https://huggingface.co/izhx/udever-bloom-3b",
     similarity_fn_name="cosine",
     use_instructions=None,
-    training_datasets=None,
+    training_datasets=udever_dataset,
     adapted_from="bigscience/bloom-3b",
     superseded_by=None,
 )
@@ -1227,53 +1190,7 @@
     name="izhx/udever-bloom-560m",
     revision="b2a723e355946ec5a5c5fbed3459766627ded2bb",
     release_date="2023-10-24",
-    languages=[
-        "aka_Latn",
-        "ara_Arab",
-        "asm_Beng",
-        "bam_Latn",
-        "ben_Beng",
-        "cat_Latn",
-        "eng_Latn",
-        "spa_Latn",
-        "eus_Latn",
-        "fon_Latn",
-        "fra_Latn",
-        "guj_Gujr",
-        "hin_Deva",
-        "ind_Latn",
-        "ibo_Latn",
-        "kik_Latn",
-        "kan_Knda",
-        "lug_Latn",
-        "lin_Latn",
-        "mal_Mlym",
-        "mar_Deva",
-        "nep_Deva",
-        "nso_Latn",
-        "nya_Latn",
-        "ori_Orya",
-        "pan_Guru",
-        "por_Latn",
-        "run_Latn",
-        "kin_Latn",
-        "sna_Latn",
-        "sot_Latn",
-        "swa_Latn",
-        "tam_Taml",
-        "tel_Telu",
-        "tsn_Latn",
-        "tso_Latn",
-        "tum_Latn",
-        "twi_Latn",
-        "urd_Arab",
-        "vie_Latn",
-        "wol_Latn",
-        "xho_Latn",
-        "yor_Latn",
-        "zho_Hans",
-        "zul_Latn",
-    ],
+    languages=udever_languages,
     loader=None,
     n_parameters=None,
     memory_usage_mb=None,
@@ -1287,7 +1204,7 @@
     reference="https://huggingface.co/izhx/udever-bloom-560m",
     similarity_fn_name="cosine",
     use_instructions=None,
-    training_datasets=None,
+    training_datasets=udever_dataset,
     adapted_from="bigscience/bloom-560m",
     superseded_by=None,
 )
@@ -1295,53 +1212,7 @@
     name="izhx/udever-bloom-7b1",
     revision="18e8d3e6dbd94868584877f2e72a105a17df22ef",
     release_date="2023-10-24",
-    languages=[
-        "aka_Latn",
-        "ara_Arab",
-        "asm_Beng",
-        "bam_Latn",
-        "ben_Beng",
-        "cat_Latn",
-        "eng_Latn",
-        "spa_Latn",
-        "eus_Latn",
-        "fon_Latn",
-        "fra_Latn",
-        "guj_Gujr",
-        "hin_Deva",
-        "ind_Latn",
-        "ibo_Latn",
-        "kik_Latn",
-        "kan_Knda",
-        "lug_Latn",
-        "lin_Latn",
-        "mal_Mlym",
-        "mar_Deva",
-        "nep_Deva",
-        "nso_Latn",
-        "nya_Latn",
-        "ori_Orya",
-        "pan_Guru",
-        "por_Latn",
-        "run_Latn",
-        "kin_Latn",
-        "sna_Latn",
-        "sot_Latn",
-        "swa_Latn",
-        "tam_Taml",
-        "tel_Telu",
-        "tsn_Latn",
-        "tso_Latn",
-        "tum_Latn",
-        "twi_Latn",
-        "urd_Arab",
-        "vie_Latn",
-        "wol_Latn",
-        "xho_Latn",
-        "yor_Latn",
-        "zho_Hans",
-        "zul_Latn",
-    ],
+    languages=udever_languages,
     loader=None,
     n_parameters=None,
     memory_usage_mb=None,
@@ -1355,7 +1226,7 @@
     reference="https://huggingface.co/izhx/udever-bloom-7b1",
     similarity_fn_name="cosine",
     use_instructions=None,
-    training_datasets=None,
+    training_datasets=udever_dataset,
     adapted_from="bigscience/bloom-7b1",
     superseded_by=None,
 )
diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py
index b4413c3e02..f3476b264e 100644
--- a/mteb/models/mxbai_models.py
+++ b/mteb/models/mxbai_models.py
@@ -4,6 +4,15 @@
 
 from mteb.model_meta import ModelMeta, sentence_transformers_loader
 
+mixedbread_training_data = {
+    # from correspondence:
+    # as mentioned in our blog post
+    # (https://www.mixedbread.com/blog/mxbai-embed-large-v1#built-for-rag-and-real-world-use-cases:~:text=During%20the%20whole,related%20use%20cases.)
+    # We do not train on any data (except the MSMarco training split) of MTEB. We have a strong filtering process to ensure the OOD setting. That's true
+    # for all of our models. Keep up the good work and let me know if you have any questions.
+    "MSMARCO": [],
+}
+
 mxbai_embed_large_v1 = ModelMeta(
     loader=partial(  # type: ignore
         sentence_transformers_loader,
@@ -29,9 +38,7 @@
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
-    training_datasets={
-        "MSMARCO": ["train"],
-    },
+    training_datasets=mixedbread_training_data,
 )
 
 mxbai_embed_2d_large_v1 = ModelMeta(
@@ -54,9 +61,7 @@
     superseded_by=None,
     public_training_code=None,
     public_training_data=None,
-    training_datasets={
-        "MSMARCO": ["train"],
-    },
+    training_datasets=mixedbread_training_data,
 )
 
 
@@ -80,7 +85,5 @@
     superseded_by=None,
     public_training_code=None,
     public_training_data=None,
-    training_datasets={
-        "MSMARCO": ["train"],
-    },
+    training_datasets=mixedbread_training_data,
 )