Merged

Commits (41)
a168496
fix: Leaderboard: `K` instead of `M`
KennethEnevoldsen Jan 11, 2025
e61d7f2
format
KennethEnevoldsen Jan 11, 2025
e1b89e3
fixed existing annotations to refer to task name instead of hf dataset
KennethEnevoldsen Jan 11, 2025
9ffeae4
added annotation to nvidia
KennethEnevoldsen Jan 11, 2025
0495d32
added voyage
KennethEnevoldsen Jan 11, 2025
5f7ef65
added uae annotations
KennethEnevoldsen Jan 11, 2025
ac48012
Added stella annotations
KennethEnevoldsen Jan 11, 2025
c1c7eb6
sentence trf models
KennethEnevoldsen Jan 11, 2025
4ec9121
added salesforce and e5
KennethEnevoldsen Jan 11, 2025
c54859d
jina
KennethEnevoldsen Jan 11, 2025
d7f5684
bge + model2vec
KennethEnevoldsen Jan 11, 2025
9ea60ff
added llm2vec annotations
KennethEnevoldsen Jan 11, 2025
b123d92
add jasper
KennethEnevoldsen Jan 11, 2025
aa728d1
format
KennethEnevoldsen Jan 11, 2025
87b5d9d
Merge remote-tracking branch 'origin' into add-more-annotations
KennethEnevoldsen Jan 11, 2025
121bf0e
format
KennethEnevoldsen Jan 12, 2025
b2b9cca
Updated annotations and moved jina models
KennethEnevoldsen Jan 13, 2025
569c674
make models parameters needed to be filled
Samoed Jan 13, 2025
1f3a30f
fix tests
Samoed Jan 13, 2025
58aaad8
remove comments
Samoed Jan 13, 2025
5676852
remove model meta from test
Samoed Jan 15, 2025
ddf32d1
fix model meta from split
Samoed Jan 15, 2025
3aab7ec
fix: add even more training dataset annotations (#1793)
KennethEnevoldsen Jan 15, 2025
4234da7
Merge branch 'refs/heads/main' into make_model_meta_params_reqired
Samoed Jan 15, 2025
3e5bd4c
Merge remote-tracking branch 'refs/remotes/origin/add-more-annotation…
Samoed Jan 15, 2025
403cbac
Merge branch 'refs/heads/main' into make_model_meta_params_reqired
Samoed Jan 17, 2025
8e6259b
fig merges
Samoed Jan 17, 2025
9d53dc7
update models info
Samoed Jan 17, 2025
36ccc7f
change public_training_code to str
Samoed Jan 18, 2025
e5e9d26
change `public_training_code=False` to None
Samoed Jan 18, 2025
4c70d38
remove annotations
Samoed Jan 19, 2025
3e58f08
remove annotations
Samoed Jan 19, 2025
1044f6e
remove changed annotations
Samoed Jan 19, 2025
53013b4
remove changed annotations
Samoed Jan 19, 2025
7b2e1c4
remove `public_training_data` and `memory usage`
Samoed Jan 19, 2025
86ede73
make framework not optional
Samoed Jan 19, 2025
1355d22
make framework non-optional
Samoed Jan 19, 2025
3a765d9
empty frameworks
Samoed Jan 19, 2025
5661fb6
add framework
Samoed Jan 19, 2025
ff54c31
fix tests
Samoed Jan 19, 2025
bd0fa0f
Update mteb/models/overview.py
Samoed Jan 19, 2025
24 changes: 10 additions & 14 deletions mteb/model_meta.py
@@ -59,15 +59,13 @@ class ModelMeta(BaseModel):
name: The name of the model, ideally the name on huggingface.
n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be None if the number of parameters is not known (e.g. for proprietary models) or
    if the loader returns a SentenceTransformer model from which it can be derived.
- memory_usage: The amount of memory the model uses in GB. Can be None if the memory usage is not known (e.g. for proprietary models).
max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary models).
embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
revision: The revision number of the model. If None it is assumed that the metadata (including the loader) is valid for all revisions of the model.
release_date: The date the model's revision was released.
license: The license under which the model is released. Required if open_weights is True.
open_weights: Whether the model is open source or proprietary.
- public_training_data: Whether the training data used to train the model is publicly available.
public_training_code: Whether the code used to train the model is publicly available.
similarity_fn_name: The distance metric used by the model.
framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`.
@@ -90,19 +88,17 @@ class ModelMeta(BaseModel):
release_date: STR_DATE | None
languages: list[ISO_LANGUAGE_SCRIPT] | None
loader: Callable[..., Encoder] | None = None
- n_parameters: int | None = None
- memory_usage: float | None = None
- max_tokens: float | None = None
- embed_dim: int | None = None
- license: str | None = None
- open_weights: bool | None = None
- public_training_data: bool | None = None
- public_training_code: bool | None = None
- framework: list[FRAMEWORKS] = []
+ n_parameters: int | None
+ max_tokens: float | None
+ embed_dim: int | None
+ license: str | None
+ open_weights: bool | None
+ public_training_code: str | None
+ framework: list[FRAMEWORKS]
reference: STR_URL | None = None
- similarity_fn_name: DISTANCE_METRICS | None = None
- use_instructions: bool | None = None
- training_datasets: dict[str, list[str]] | None = None
+ similarity_fn_name: DISTANCE_METRICS | None
+ use_instructions: bool | None
+ training_datasets: dict[str, list[str]] | None
adapted_from: str | None = None
superseded_by: str | None = None

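With the fields above now required, every model registration must state its metadata explicitly (None remains valid where a value is genuinely unknown, e.g. for proprietary models). A minimal sketch of a registration under the new signature — the model name, numbers, and URL below are invented for illustration:

from mteb.model_meta import ModelMeta

# Hypothetical entry; all values below are illustrative, not a real model.
my_model = ModelMeta(
    name="my-org/my-embedding-model",
    revision="0123abc",
    release_date="2025-01-19",
    languages=["eng-Latn"],
    loader=None,
    n_parameters=109_000_000,
    max_tokens=512,
    embed_dim=768,
    license="apache-2.0",
    open_weights=True,
    public_training_code=None,  # now a URL string or None, no longer a bool
    framework=["Sentence Transformers", "PyTorch"],
    reference="https://huggingface.co/my-org/my-embedding-model",
    similarity_fn_name="cosine",
    use_instructions=False,
    training_datasets=None,  # unknown / not disclosed
)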
31 changes: 9 additions & 22 deletions mteb/models/arctic_models.py
@@ -94,7 +94,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=22_600_000,
- memory_usage=None,
max_tokens=512,
embed_dim=384,
license="apache-2.0",
@@ -103,8 +102,7 @@
use_instructions=True,
adapted_from="sentence-transformers/all-MiniLM-L6-v2",
superseded_by=None,
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -145,7 +143,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=32_200_000,
- memory_usage=None,
max_tokens=512,
embed_dim=384,
license="apache-2.0",
@@ -154,8 +151,7 @@
use_instructions=True,
adapted_from="intfloat/e5-small-unsupervised",
superseded_by=None,
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -196,7 +192,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=109_000_000,
- memory_usage=None,
max_tokens=512,
embed_dim=768,
license="apache-2.0",
@@ -205,8 +200,7 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5",
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -247,7 +241,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=137_000_000,
- memory_usage=None,
max_tokens=2048,
embed_dim=768,
license="apache-2.0",
@@ -256,8 +249,7 @@
use_instructions=True,
adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -298,7 +290,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=335_000_000,
- memory_usage=None,
max_tokens=512,
embed_dim=1024,
license="apache-2.0",
@@ -307,8 +298,7 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0",
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -351,7 +341,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=109_000_000,
- memory_usage=None,
max_tokens=512,
embed_dim=768,
license="apache-2.0",
@@ -360,6 +349,8 @@
use_instructions=True,
adapted_from=None,
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
+ public_training_code=None,
+ training_datasets=None,
)

arctic_embed_m_v2_0 = ModelMeta(
@@ -376,7 +367,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=305_000_000,
- memory_usage=None,
max_tokens=8192,
embed_dim=768,
license="apache-2.0",
@@ -385,8 +375,7 @@
use_instructions=True,
adapted_from="Alibaba-NLP/gte-multilingual-base",
superseded_by=None,
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -426,7 +415,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=568_000_000,
- memory_usage=None,
max_tokens=8192,
embed_dim=1024,
license="apache-2.0",
@@ -435,8 +423,7 @@
use_instructions=True,
adapted_from="BAAI/bge-m3-retromae",
superseded_by=None,
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
18 changes: 1 addition & 17 deletions mteb/models/bge_models.py
@@ -365,15 +365,13 @@
revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=24_000_000,
- memory_usage=None,
embed_dim=512,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-small-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_training_data,
)
@@ -391,15 +389,13 @@
revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
release_date="2023-09-11", # initial commit of hf model.
n_parameters=438_000_000,
- memory_usage=None,
embed_dim=768,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-base-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_training_data,
)
@@ -417,15 +413,13 @@
revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=1_340_000_000,
- memory_usage=None,
embed_dim=1024,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-large-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_training_data,
)
@@ -443,15 +437,13 @@
revision="7999e1d3359715c523056ef9478215996d62a620",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=24_000_000,
- memory_usage=None,
embed_dim=512,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-small-zh-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_chinese_training_data,
)
@@ -469,15 +461,13 @@
revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65",
release_date="2023-09-11", # initial commit of hf model.
n_parameters=438_000_000,
- memory_usage=None,
embed_dim=768,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-base-zh-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_chinese_training_data,
)
@@ -495,15 +485,13 @@
revision="79e7739b6ab944e86d6171e44d24c997fc1e0116",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=1_340_000_000,
- memory_usage=None,
embed_dim=1024,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-large-zh-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_chinese_training_data,
)
@@ -520,15 +508,13 @@
revision="5617a9f61b028005a4858fdac845db406aefb181",
release_date="2024-06-28",
n_parameters=568_000_000,
- memory_usage=None,
embed_dim=4096,
license="mit",
max_tokens=8194,
reference="https://huggingface.co/BAAI/bge-m3",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
- public_training_data=True,
public_training_code=None,
training_datasets=bgem3_training_data,
)
@@ -555,15 +541,13 @@
revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
release_date="2024-07-25", # initial commit of hf model.
n_parameters=9.24 * 1e9,
- memory_usage=None,
embed_dim=3584, # from old C-MTEB leaderboard
license="gemma",
max_tokens=8192, # from old C-MTEB leaderboard
reference="https://huggingface.co/BAAI/bge-multilingual-gemma2",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
- public_training_data=False,
- public_training_code=False,
+ public_training_code=None,
training_datasets=None, # not disclosed
)
5 changes: 3 additions & 2 deletions mteb/models/bm25.py
@@ -131,12 +131,13 @@ def encode(self, texts: list[str], **kwargs):
revision="0_1_10",
release_date="2024-07-10", ## release of version 0.1.10
n_parameters=None,
- memory_usage=None,
embed_dim=None,
license=None,
max_tokens=None,
- reference=None,
+ reference="https://github.com/xhluca/bm25s",
similarity_fn_name=None,
framework=[],
use_instructions=False,
+ public_training_code="https://github.com/xhluca/bm25s",
+ training_datasets=None,
)
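Because these parameters no longer default to None, ModelMeta (a pydantic BaseModel, per the class definition above) should now reject a registration that omits one of them at construction time rather than silently filling in None. A quick sketch of the expected failure mode, using a hypothetical model name:

from pydantic import ValidationError

from mteb.model_meta import ModelMeta

# Hypothetical entry that deliberately omits the now-required `framework` field.
try:
    ModelMeta(
        name="my-org/incomplete-model",
        revision=None,
        release_date=None,
        languages=None,
        n_parameters=None,
        max_tokens=None,
        embed_dim=None,
        license=None,
        open_weights=True,
        public_training_code=None,
        similarity_fn_name=None,
        use_instructions=None,
        training_datasets=None,
        # framework=...  <- missing, so validation should fail
    )
except ValidationError as err:
    print(err)  # pydantic reports `framework` as a missing required field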