From 9c309671c210046c44e74074f479673dbafb2dc7 Mon Sep 17 00:00:00 2001 From: Nadia Sheikh Date: Tue, 25 Mar 2025 19:01:35 -0400 Subject: [PATCH 1/4] feat: added pubmedbert model2vec models --- mteb/models/model2vec_models.py | 131 ++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index ee79f1cafa..453bac7e57 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -235,3 +235,134 @@ def encode( public_training_code="https://github.com/MinishLab/model2vec", public_training_data=None, ) + +pubmed_bert_100k = ModelMeta( + loader=partial( + Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-100K" + ), + name="NeuML/pubmedbert-base-embeddings-100K", + languages=["eng_Latn"], + open_weights=True, + revision="34ba71e35c393fdad7ed695113f653feb407b16b", + release_date="2025-01-03", + n_parameters=1 * 1e5, + memory_usage_mb=0, + max_tokens=np.inf, + embed_dim=64, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-100K", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets=None, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-100K#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + +pubmed_bert_500k = ModelMeta( + loader=partial( + Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-500K" + ), + name="NeuML/pubmedbert-base-embeddings-500K", + languages=["eng_Latn"], + open_weights=True, + revision="34ba71e35c393fdad7ed695113f653feb407b16b", + release_date="2025-01-03", + n_parameters=1 * 1e6, + memory_usage_mb=2, + max_tokens=np.inf, + embed_dim=64, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-500K", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets=None, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-500K#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + +pubmed_bert_1m = ModelMeta( + loader=partial( + Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-1M" + ), + name="NeuML/pubmedbert-base-embeddings-1M", + languages=["eng_Latn"], + open_weights=True, + revision="2b7fed222594708da6d88bcda92ae9b434b7ddd1", + release_date="2025-01-03", + n_parameters=1 * 1e6, + memory_usage_mb=2, + max_tokens=np.inf, + embed_dim=64, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-1M", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets=None, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-1M#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + +pubmed_bert_2m = ModelMeta( + loader=partial( + Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-2M" + ), + name="NeuML/pubmedbert-base-embeddings-2M", + languages=["eng_Latn"], + open_weights=True, + revision="1d7bbe04d6713e425161146bfdc71473cbed498a", + release_date="2025-01-03", + n_parameters=1.95 * 1e6, + memory_usage_mb=7, + max_tokens=np.inf, + embed_dim=64, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-2M", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets=None, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-2M#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + +pubmed_bert_8m = ModelMeta( + loader=partial( + Model2VecWrapper, + name="NeuML/pubmedbert-base-embeddings-8M" + ), + name="NeuML/pubmedbert-base-embeddings-8M", + languages=["eng_Latn"], + open_weights=True, + revision="387d350015e963744f4fafe56a574b7cd48646c9", + release_date="2025-01-03", + n_parameters=7.81 * 1e6, + memory_usage_mb=30, + max_tokens=np.inf, + embed_dim=256, + license="apache-2.0", + similarity_fn_name="cosine", + framework=["NumPy"], + reference="https://huggingface.co/NeuML/pubmedbert-base-embeddings-8M", + use_instructions=False, + adapted_from="NeuML/pubmedbert-base-embeddings", + superseded_by=None, + training_datasets=None, + public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-8M#training", + public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", +) + From da4fadf41b30a1efc87e2177343293bcc3355f79 Mon Sep 17 00:00:00 2001 From: Nadia Sheikh Date: Wed, 26 Mar 2025 07:32:18 -0400 Subject: [PATCH 2/4] fix: attribute model_name --- mteb/models/model2vec_models.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 453bac7e57..3061338693 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -238,8 +238,7 @@ def encode( pubmed_bert_100k = ModelMeta( loader=partial( - Model2VecWrapper, - name="NeuML/pubmedbert-base-embeddings-100K" + Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-100K" ), name="NeuML/pubmedbert-base-embeddings-100K", languages=["eng_Latn"], @@ -264,8 +263,7 @@ def encode( pubmed_bert_500k = ModelMeta( loader=partial( - Model2VecWrapper, - name="NeuML/pubmedbert-base-embeddings-500K" + Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-500K" ), name="NeuML/pubmedbert-base-embeddings-500K", languages=["eng_Latn"], @@ -289,10 +287,7 @@ def encode( ) pubmed_bert_1m = ModelMeta( - loader=partial( - Model2VecWrapper, - name="NeuML/pubmedbert-base-embeddings-1M" - ), + loader=partial(Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-1M"), name="NeuML/pubmedbert-base-embeddings-1M", languages=["eng_Latn"], open_weights=True, @@ -315,10 +310,7 @@ def encode( ) pubmed_bert_2m = ModelMeta( - loader=partial( - Model2VecWrapper, - name="NeuML/pubmedbert-base-embeddings-2M" - ), + loader=partial(Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-2M"), name="NeuML/pubmedbert-base-embeddings-2M", languages=["eng_Latn"], open_weights=True, @@ -341,10 +333,7 @@ def encode( ) pubmed_bert_8m = ModelMeta( - loader=partial( - Model2VecWrapper, - name="NeuML/pubmedbert-base-embeddings-8M" - ), + loader=partial(Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-8M"), name="NeuML/pubmedbert-base-embeddings-8M", languages=["eng_Latn"], open_weights=True, @@ -365,4 +354,3 @@ def encode( public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-8M#training", public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", ) - From 7877a848d8af43b3557905482f95d0535473d3b5 Mon Sep 17 00:00:00 2001 From: Nadia Sheikh Date: Wed, 26 Mar 2025 17:02:17 -0400 Subject: [PATCH 3/4] fix: fixed commit hash for pubmed_bert model2vec models --- mteb/models/model2vec_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 3061338693..754737dc91 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -243,7 +243,7 @@ def encode( name="NeuML/pubmedbert-base-embeddings-100K", languages=["eng_Latn"], open_weights=True, - revision="34ba71e35c393fdad7ed695113f653feb407b16b", + revision="bac5e3b12fb8c650e92a19c41b436732c4f16e9e", release_date="2025-01-03", n_parameters=1 * 1e5, memory_usage_mb=0, @@ -270,7 +270,7 @@ def encode( open_weights=True, revision="34ba71e35c393fdad7ed695113f653feb407b16b", release_date="2025-01-03", - n_parameters=1 * 1e6, + n_parameters=5 * 1e5, memory_usage_mb=2, max_tokens=np.inf, embed_dim=64, From 79154b8e7ba84fdec2b061f19e2a7e8fe2e23975 Mon Sep 17 00:00:00 2001 From: Nadia Sheikh Date: Mon, 31 Mar 2025 17:48:00 -0400 Subject: [PATCH 4/4] fix: changes requested in PR 2443 --- mteb/models/model2vec_models.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 754737dc91..2b0a2ed02c 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -256,7 +256,7 @@ def encode( use_instructions=False, adapted_from="NeuML/pubmedbert-base-embeddings", superseded_by=None, - training_datasets=None, + training_datasets={}, public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-100K#training", public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", ) @@ -281,7 +281,7 @@ def encode( use_instructions=False, adapted_from="NeuML/pubmedbert-base-embeddings", superseded_by=None, - training_datasets=None, + training_datasets={}, public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-500K#training", public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", ) @@ -304,7 +304,7 @@ def encode( use_instructions=False, adapted_from="NeuML/pubmedbert-base-embeddings", superseded_by=None, - training_datasets=None, + training_datasets={}, public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-1M#training", public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", ) @@ -327,7 +327,7 @@ def encode( use_instructions=False, adapted_from="NeuML/pubmedbert-base-embeddings", superseded_by=None, - training_datasets=None, + training_datasets={}, public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-2M#training", public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", ) @@ -350,7 +350,7 @@ def encode( use_instructions=False, adapted_from="NeuML/pubmedbert-base-embeddings", superseded_by=None, - training_datasets=None, + training_datasets={}, public_training_code="https://huggingface.co/NeuML/pubmedbert-base-embeddings-8M#training", public_training_data="https://pubmed.ncbi.nlm.nih.gov/download/", )