Merged

Commits (41)
a168496
fix: Leaderboard: `K` instead of `M`
KennethEnevoldsen Jan 11, 2025
e61d7f2
format
KennethEnevoldsen Jan 11, 2025
e1b89e3
fixed existing annotations to refer to task name instead of hf dataset
KennethEnevoldsen Jan 11, 2025
9ffeae4
added annotation to nvidia
KennethEnevoldsen Jan 11, 2025
0495d32
added voyage
KennethEnevoldsen Jan 11, 2025
5f7ef65
added uae annotations
KennethEnevoldsen Jan 11, 2025
ac48012
Added stella annotations
KennethEnevoldsen Jan 11, 2025
c1c7eb6
sentence trf models
KennethEnevoldsen Jan 11, 2025
4ec9121
added salesforce and e5
KennethEnevoldsen Jan 11, 2025
c54859d
jina
KennethEnevoldsen Jan 11, 2025
d7f5684
bge + model2vec
KennethEnevoldsen Jan 11, 2025
9ea60ff
added llm2vec annotations
KennethEnevoldsen Jan 11, 2025
b123d92
add jasper
KennethEnevoldsen Jan 11, 2025
aa728d1
format
KennethEnevoldsen Jan 11, 2025
87b5d9d
Merge remote-tracking branch 'origin' into add-more-annotations
KennethEnevoldsen Jan 11, 2025
121bf0e
format
KennethEnevoldsen Jan 12, 2025
b2b9cca
Updated annotations and moved jina models
KennethEnevoldsen Jan 13, 2025
569c674
make models parameters needed to be filled
Samoed Jan 13, 2025
1f3a30f
fix tests
Samoed Jan 13, 2025
58aaad8
remove comments
Samoed Jan 13, 2025
5676852
remove model meta from test
Samoed Jan 15, 2025
ddf32d1
fix model meta from split
Samoed Jan 15, 2025
3aab7ec
fix: add even more training dataset annotations (#1793)
KennethEnevoldsen Jan 15, 2025
4234da7
Merge branch 'refs/heads/main' into make_model_meta_params_reqired
Samoed Jan 15, 2025
3e5bd4c
Merge remote-tracking branch 'refs/remotes/origin/add-more-annotation…
Samoed Jan 15, 2025
403cbac
Merge branch 'refs/heads/main' into make_model_meta_params_reqired
Samoed Jan 17, 2025
8e6259b
fig merges
Samoed Jan 17, 2025
9d53dc7
update models info
Samoed Jan 17, 2025
36ccc7f
change public_training_code to str
Samoed Jan 18, 2025
e5e9d26
change `public_training_code=False` to None
Samoed Jan 18, 2025
4c70d38
remove annotations
Samoed Jan 19, 2025
3e58f08
remove annotations
Samoed Jan 19, 2025
1044f6e
remove changed annotations
Samoed Jan 19, 2025
53013b4
remove changed annotations
Samoed Jan 19, 2025
7b2e1c4
remove `public_training_data` and `memory usage`
Samoed Jan 19, 2025
86ede73
make framework not optional
Samoed Jan 19, 2025
1355d22
make framework non-optional
Samoed Jan 19, 2025
3a765d9
empty frameworks
Samoed Jan 19, 2025
5661fb6
add framework
Samoed Jan 19, 2025
ff54c31
fix tests
Samoed Jan 19, 2025
bd0fa0f
Update mteb/models/overview.py
Samoed Jan 19, 2025
24 changes: 10 additions & 14 deletions mteb/model_meta.py
@@ -59,15 +59,13 @@ class ModelMeta(BaseModel):
name: The name of the model, ideally the name on huggingface.
n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be None if the number of parameters is not known (e.g. for proprietary models) or
    if the loader returns a SentenceTransformer model from which it can be derived.
- memory_usage: The amount of memory the model uses in GB. Can be None if the memory usage is not known (e.g. for proprietary models).
max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary models).
embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
revision: The revision number of the model. If None it is assumed that the metadata (including the loader) is valid for all revisions of the model.
release_date: The date the model's revision was released.
license: The license under which the model is released. Required if open_weights is True.
open_weights: Whether the model is open source or proprietary.
- public_training_data: Whether the training data used to train the model is publicly available.
public_training_code: Whether the code used to train the model is publicly available.
similarity_fn_name: The distance metric used by the model.
framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`.
@@ -90,19 +88,17 @@ class ModelMeta(BaseModel):
release_date: STR_DATE | None
languages: list[ISO_LANGUAGE_SCRIPT] | None
loader: Callable[..., Encoder] | None = None
- n_parameters: int | None = None
- memory_usage: float | None = None
- max_tokens: float | None = None
- embed_dim: int | None = None
- license: str | None = None
- open_weights: bool | None = None
- public_training_data: bool | None = None
- public_training_code: bool | None = None
- framework: list[FRAMEWORKS] = []
+ n_parameters: int | None
+ max_tokens: float | None
+ embed_dim: int | None
+ license: str | None
+ open_weights: bool | None
+ public_training_code: str | None
+ framework: list[FRAMEWORKS]
reference: STR_URL | None = None
- similarity_fn_name: DISTANCE_METRICS | None = None
- use_instructions: bool | None = None
- training_datasets: dict[str, list[str]] | None = None
+ similarity_fn_name: DISTANCE_METRICS | None
+ use_instructions: bool | None
+ training_datasets: dict[str, list[str]] | None
adapted_from: str | None = None
superseded_by: str | None = None

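With the fields above now required, every model registration must state its metadata explicitly (None remains valid where a value is genuinely unknown, e.g. for proprietary models). A minimal sketch of a registration under the new signature — the model name, numbers, and URL below are invented for illustration:

from mteb.model_meta import ModelMeta

# Hypothetical entry; all values below are illustrative, not a real model.
my_model = ModelMeta(
    name="my-org/my-embedding-model",
    revision="0123abc",
    release_date="2025-01-19",
    languages=["eng-Latn"],
    loader=None,
    n_parameters=109_000_000,
    max_tokens=512,
    embed_dim=768,
    license="apache-2.0",
    open_weights=True,
    public_training_code=None,  # now a URL string or None, no longer a bool
    framework=["Sentence Transformers", "PyTorch"],
    reference="https://huggingface.co/my-org/my-embedding-model",
    similarity_fn_name="cosine",
    use_instructions=False,
    training_datasets=None,  # unknown / not disclosed
)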
31 changes: 9 additions & 22 deletions mteb/models/arctic_models.py
@@ -94,7 +94,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=22_600_000,
- memory_usage=None,
max_tokens=512,
embed_dim=384,
license="apache-2.0",
@@ -103,8 +102,7 @@
use_instructions=True,
adapted_from="sentence-transformers/all-MiniLM-L6-v2",
superseded_by=None,
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -145,7 +143,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=32_200_000,
- memory_usage=None,
max_tokens=512,
embed_dim=384,
license="apache-2.0",
@@ -154,8 +151,7 @@
use_instructions=True,
adapted_from="intfloat/e5-small-unsupervised",
superseded_by=None,
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -196,7 +192,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=109_000_000,
- memory_usage=None,
max_tokens=512,
embed_dim=768,
license="apache-2.0",
@@ -205,8 +200,7 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5",
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -247,7 +241,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=137_000_000,
- memory_usage=None,
max_tokens=2048,
embed_dim=768,
license="apache-2.0",
@@ -256,8 +249,7 @@
use_instructions=True,
adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -298,7 +290,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=335_000_000,
- memory_usage=None,
max_tokens=512,
embed_dim=1024,
license="apache-2.0",
@@ -307,8 +298,7 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0",
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -351,7 +341,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=109_000_000,
- memory_usage=None,
max_tokens=512,
embed_dim=768,
license="apache-2.0",
@@ -360,6 +349,8 @@
use_instructions=True,
adapted_from=None,
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
+ public_training_code=None,
+ training_datasets=None,
)

arctic_embed_m_v2_0 = ModelMeta(
@@ -376,7 +367,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=305_000_000,
- memory_usage=None,
max_tokens=8192,
embed_dim=768,
license="apache-2.0",
@@ -385,8 +375,7 @@
use_instructions=True,
adapted_from="Alibaba-NLP/gte-multilingual-base",
superseded_by=None,
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -426,7 +415,6 @@
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=568_000_000,
- memory_usage=None,
max_tokens=8192,
embed_dim=1024,
license="apache-2.0",
@@ -435,8 +423,7 @@
use_instructions=True,
adapted_from="BAAI/bge-m3-retromae",
superseded_by=None,
- public_training_data=False, # couldn't find
- public_training_code=False, # couldn't find
+ public_training_code=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
18 changes: 1 addition & 17 deletions mteb/models/bge_models.py
@@ -365,15 +365,13 @@
revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=24_000_000,
- memory_usage=None,
embed_dim=512,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-small-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_training_data,
)
@@ -391,15 +389,13 @@
revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
release_date="2023-09-11", # initial commit of hf model.
n_parameters=438_000_000,
- memory_usage=None,
embed_dim=768,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-base-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_training_data,
)
@@ -417,15 +413,13 @@
revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=1_340_000_000,
- memory_usage=None,
embed_dim=1024,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-large-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_training_data,
)
@@ -443,15 +437,13 @@
revision="7999e1d3359715c523056ef9478215996d62a620",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=24_000_000,
- memory_usage=None,
embed_dim=512,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-small-zh-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_chinese_training_data,
)
@@ -469,15 +461,13 @@
revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65",
release_date="2023-09-11", # initial commit of hf model.
n_parameters=438_000_000,
- memory_usage=None,
embed_dim=768,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-base-zh-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_chinese_training_data,
)
@@ -495,15 +485,13 @@
revision="79e7739b6ab944e86d6171e44d24c997fc1e0116",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=1_340_000_000,
- memory_usage=None,
embed_dim=1024,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-large-zh-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
- public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets=bge_chinese_training_data,
)
@@ -520,15 +508,13 @@
revision="5617a9f61b028005a4858fdac845db406aefb181",
release_date="2024-06-28",
n_parameters=568_000_000,
- memory_usage=None,
embed_dim=4096,
license="mit",
max_tokens=8194,
reference="https://huggingface.co/BAAI/bge-m3",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
- public_training_data=True,
public_training_code=None,
training_datasets=bgem3_training_data,
)
@@ -555,15 +541,13 @@
revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
release_date="2024-07-25", # initial commit of hf model.
n_parameters=9.24 * 1e9,
- memory_usage=None,
embed_dim=3584, # from old C-MTEB leaderboard
license="gemma",
max_tokens=8192, # from old C-MTEB leaderboard
reference="https://huggingface.co/BAAI/bge-multilingual-gemma2",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
- public_training_data=False,
- public_training_code=False,
+ public_training_code=None,
training_datasets=None, # not disclosed
)
5 changes: 3 additions & 2 deletions mteb/models/bm25.py
@@ -131,12 +131,13 @@ def encode(self, texts: list[str], **kwargs):
revision="0_1_10",
release_date="2024-07-10", ## release of version 0.1.10
n_parameters=None,
- memory_usage=None,
embed_dim=None,
license=None,
max_tokens=None,
- reference=None,
+ reference="https://github.com/xhluca/bm25s",
similarity_fn_name=None,
framework=[],
use_instructions=False,
+ public_training_code="https://github.com/xhluca/bm25s",
+ training_datasets=None,
)
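Because these parameters no longer default to None, ModelMeta (a pydantic BaseModel, per the class definition above) should now reject a registration that omits one of them at construction time rather than silently filling in None. A quick sketch of the expected failure mode, using a hypothetical model name:

from pydantic import ValidationError

from mteb.model_meta import ModelMeta

# Hypothetical entry that deliberately omits the now-required `framework` field.
try:
    ModelMeta(
        name="my-org/incomplete-model",
        revision=None,
        release_date=None,
        languages=None,
        n_parameters=None,
        max_tokens=None,
        embed_dim=None,
        license=None,
        open_weights=True,
        public_training_code=None,
        similarity_fn_name=None,
        use_instructions=None,
        training_datasets=None,
        # framework=...  <- missing, so validation should fail
    )
except ValidationError as err:
    print(err)  # pydantic reports `framework` as a missing required field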