From 9e0a9070a36eaf1db774be651754d90fa66b2615 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 23 May 2025 09:32:38 +0200 Subject: [PATCH 1/4] Added ModelMeta for potion-multilingual-128M --- mteb/models/model2vec_models.py | 130 +++++++++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 1 deletion(-) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 69b716f186..fdb5ef2d44 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -14,6 +14,109 @@ logger = logging.getLogger(__name__) +_POTION_MULTILINGUAL_128M_LANGUAGES = [ + "afr-Latn", + "amh-Ethi", + "ara-Arab", + "aze-Latn", + "bel-Cyrl", + "bul-Cyrl", + "ben-Beng", + "cat-Latn", + "ceb-Latn", + "cos-Latn", + "ces-Latn", + "cym-Latn", + "dan-Latn", + "deu-Latn", + "ell-Grek", + "eng-Latn", + "epo-Latn", + "spa-Latn", + "est-Latn", + "eus-Latn", + "fas-Arab", + "fin-Latn", + "fil-Latn", + "fra-Latn", + "fry-Latn", + "gle-Latn", + "gla-Latn", + "glg-Latn", + "guj-Gujr", + "hau-Latn", + "haw-Latn", + "hin-Deva", + "hmn-Latn", + "hat-Latn", + "hun-Latn", + "hye-Armn", + "ind-Latn", + "ibo-Latn", + "isl-Latn", + "ita-Latn", + "heb-Hebr", + "jpn-Jpan", + "jav-Latn", + "kat-Geor", + "kaz-Cyrl", + "khm-Khmr", + "kan-Knda", + "kor-Hang", + "kur-Latn", + "kir-Cyrl", + "lat-Latn", + "ltz-Latn", + "lao-Laoo", + "lit-Latn", + "lav-Latn", + "mlg-Latn", + "mri-Latn", + "mkd-Cyrl", + "mal-Mlym", + "mon-Cyrl", + "mar-Deva", + "msa-Latn", + "mlt-Latn", + "mya-Mymr", + "nep-Deva", + "nld-Latn", + "nob-Latn", + "nya-Latn", + "pan-Guru", + "pol-Latn", + "pus-Arab", + "por-Latn", + "ron-Latn", + "rus-Cyrl", + "snd-Arab", + "sin-Sinh", + "slk-Latn", + "slv-Latn", + "smo-Latn", + "sna-Latn", + "som-Latn", + "sqi-Latn", + "srp-Cyrl", + "sot-Latn", + "sun-Latn", + "swe-Latn", + "swa-Latn", + "tam-Taml", + "tel-Telu", + "tgk-Latn", + "tha-Thai", + "tur-Latn", + "ukr-Cyrl", + "urd-Arab", + "uzb-Latn", + "vie-Latn", + "xho-Latn", + "yid-Hebr", + "yor-Latn", + "zho-Hans", + "zul-Latn", +] class Model2VecWrapper(Wrapper): def __init__( @@ -49,7 +152,6 @@ def encode( """ return self.static_model.encode(sentences).astype(np.float32) - m2v_base_glove_subword = ModelMeta( loader=partial( Model2VecWrapper, @@ -233,6 +335,32 @@ def encode( public_training_data=None, ) +potion_multilingual_128m = ModelMeta( + loader=partial( + Model2VecWrapper, + model_name="minishlab/potion-multilingual-128M", + ), + name="minishlab/potion-multilingual-128M", + languages=_POTION_MULTILINGUAL_128M_LANGUAGES, + open_weights=True, + revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2a", + release_date="2024-10-29", + n_parameters=128 * 1e6, + memory_usage_mb=489, + max_tokens=np.inf, + embed_dim=256, + license="mit", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/minishlab/potion-multilingual-128M", + use_instructions=False, + adapted_from="BAAI/bge-m3", + superseded_by=None, + training_datasets=bge_training_data, # distilled + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, +) + pubmed_bert_100k = ModelMeta( loader=partial( Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-100K" From 35a6fc729e2e7d684a510a5d39dc57d720ac8f9d Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 23 May 2025 09:40:52 +0200 Subject: [PATCH 2/4] Fixed linting --- mteb/models/model2vec_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index fdb5ef2d44..7c7e812f9a 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -152,6 +152,7 @@ def encode( """ return self.static_model.encode(sentences).astype(np.float32) + m2v_base_glove_subword = ModelMeta( loader=partial( Model2VecWrapper, From cd1881edb5ccf320fa8d3433769064376fb264ed Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 23 May 2025 09:41:53 +0200 Subject: [PATCH 3/4] Fixed linting --- mteb/models/model2vec_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 7c7e812f9a..4a53c042d3 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -118,6 +118,7 @@ "zul-Latn", ] + class Model2VecWrapper(Wrapper): def __init__( self, From b33e0bcaf7bf1e19ece02cff3f10509c8683d757 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 23 May 2025 09:55:59 +0200 Subject: [PATCH 4/4] Updated date --- mteb/models/model2vec_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 4a53c042d3..f817abd10c 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -346,7 +346,7 @@ def encode( languages=_POTION_MULTILINGUAL_128M_LANGUAGES, open_weights=True, revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2a", - release_date="2024-10-29", + release_date="2025-05-23", n_parameters=128 * 1e6, memory_usage_mb=489, max_tokens=np.inf,