diff --git a/mteb/models/model_implementations/mdbr_models.py b/mteb/models/model_implementations/mdbr_models.py index 775cdfc59b..e52c66f540 100644 --- a/mteb/models/model_implementations/mdbr_models.py +++ b/mteb/models/model_implementations/mdbr_models.py @@ -1,5 +1,7 @@ from mteb.models.model_implementations.arctic_models import arctic_v1_training_datasets -from mteb.models.model_implementations.mxbai_models import mixedbread_training_data +from mteb.models.model_implementations.mixedbread_ai_models import ( + mixedbread_training_data, +) from mteb.models.model_meta import ModelMeta from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader diff --git a/mteb/models/model_implementations/mxbai_models.py b/mteb/models/model_implementations/mixedbread_ai_models.py similarity index 71% rename from mteb/models/model_implementations/mxbai_models.py rename to mteb/models/model_implementations/mixedbread_ai_models.py index f9e31796f0..0376267c76 100644 --- a/mteb/models/model_implementations/mxbai_models.py +++ b/mteb/models/model_implementations/mixedbread_ai_models.py @@ -1,3 +1,4 @@ +from mteb.models.model_implementations.pylate_models import MultiVectorModel from mteb.models.model_meta import ( ModelMeta, ScoringFunction, @@ -239,3 +240,93 @@ }""", contacts=None, ) + +mxbai_edge_colbert_v0_17m = ModelMeta( + loader=MultiVectorModel, + name="mixedbread-ai/mxbai-edge-colbert-v0-17m", + model_type=["late-interaction"], + languages=["eng-Latn"], + open_weights=True, + revision="23ae07f5bf028bc0d1f80c82e6e2dd2311f13a46", + public_training_code=None, + public_training_data=None, + release_date="2025-10-16", + n_parameters=int(17 * 1e6), + memory_usage_mb=64, + max_tokens=7999, + embed_dim=None, + license="apache-2.0", + similarity_fn_name=ScoringFunction.MAX_SIM, + framework=["PyLate", "ColBERT", "Transformers", "safetensors"], + reference="https://huggingface.co/mixedbread-ai/mxbai-edge-colbert-v0-17m", + use_instructions=False, + 
adapted_from="https://huggingface.co/jhu-clsp/ettin-encoder-17m",
+    superseded_by=None,
+    training_datasets={
+        "CornStack",
+        "MSMARCO",
+        "NQ",
+        "HotpotQA",
+        "AmazonQA",
+        "LoTTE",
+        "MultiLongDocRetrieval",
+        # "FineWeb",
+        # "PubMedQA",
+        # "TriviaQA",
+    },
+    citation="""@misc{takehi2025fantasticsmallretrieverstrain,
+      title={Fantastic (small) Retrievers and How to Train Them: mxbai-edge-colbert-v0 Tech Report},
+      author={Rikiya Takehi and Benjamin Clavié and Sean Lee and Aamir Shakir},
+      year={2025},
+      eprint={2510.14880},
+      archivePrefix={arXiv},
+      primaryClass={cs.IR},
+      url={https://arxiv.org/abs/2510.14880},
+}""",
+    contacts=None,
+)
+
+mxbai_edge_colbert_v0_32m = ModelMeta(
+    loader=MultiVectorModel,
+    name="mixedbread-ai/mxbai-edge-colbert-v0-32m",
+    model_type=["late-interaction"],
+    languages=["eng-Latn"],
+    open_weights=True,
+    revision="2f12870a85dae80680b9babc59992c9a2bc59e4a",
+    public_training_code=None,
+    public_training_data=None,
+    release_date="2025-10-16",
+    n_parameters=int(32 * 1e6),
+    memory_usage_mb=122,
+    max_tokens=511,
+    embed_dim=None,
+    license="apache-2.0",
+    similarity_fn_name=ScoringFunction.MAX_SIM,
+    framework=["PyLate", "ColBERT", "Transformers", "safetensors"],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-edge-colbert-v0-32m",
+    use_instructions=False,
+    adapted_from="https://huggingface.co/jhu-clsp/ettin-encoder-32m",
+    superseded_by=None,
+    training_datasets={
+        "CornStack",
+        "MSMARCO",
+        "NQ",
+        "HotpotQA",
+        "AmazonQA",
+        "LoTTE",
+        "MultiLongDocRetrieval",
+        # "FineWeb",
+        # "PubMedQA",
+        # "TriviaQA",
+    },
+    citation="""@misc{takehi2025fantasticsmallretrieverstrain,
+      title={Fantastic (small) Retrievers and How to Train Them: mxbai-edge-colbert-v0 Tech Report},
+      author={Rikiya Takehi and Benjamin Clavié and Sean Lee and Aamir Shakir},
+      year={2025},
+      eprint={2510.14880},
+      archivePrefix={arXiv},
+      primaryClass={cs.IR},
+      url={https://arxiv.org/abs/2510.14880},
+}""",
+    contacts=None,
+)