diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index 0a1befc2c9..b105f301b6 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -59,7 +59,6 @@ class ModelMeta(BaseModel):
         name: The name of the model, ideally the name on huggingface.
         n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be None if the number of parameters is not known (e.g. for proprietary models) or if the loader returns a SentenceTransformer model from which it can be derived.
-        memory_usage: The amount of memory the model uses in GB. Can be None if the memory usage is not known (e.g. for proprietary models).
         max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary models).
         embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
@@ -67,7 +66,6 @@ class ModelMeta(BaseModel):
         release_date: The date the model's revision was released.
         license: The license under which the model is released. Required if open_weights is True.
         open_weights: Whether the model is open source or proprietary.
-        public_training_data: Whether the training data used to train the model is publicly available.
         public_training_code: Whether the code used to train the model is publicly available.
         similarity_fn_name: The distance metric used by the model.
         framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`.
@@ -90,19 +88,17 @@ class ModelMeta(BaseModel):
     release_date: STR_DATE | None
     languages: list[ISO_LANGUAGE_SCRIPT] | None
     loader: Callable[..., Encoder] | None = None
-    n_parameters: int | None = None
-    memory_usage: float | None = None
-    max_tokens: float | None = None
-    embed_dim: int | None = None
-    license: str | None = None
-    open_weights: bool | None = None
-    public_training_data: bool | None = None
-    public_training_code: bool | None = None
-    framework: list[FRAMEWORKS] = []
+    n_parameters: int | None
+    max_tokens: float | None
+    embed_dim: int | None
+    license: str | None
+    open_weights: bool | None
+    public_training_code: str | None
+    framework: list[FRAMEWORKS]
     reference: STR_URL | None = None
-    similarity_fn_name: DISTANCE_METRICS | None = None
-    use_instructions: bool | None = None
-    training_datasets: dict[str, list[str]] | None = None
+    similarity_fn_name: DISTANCE_METRICS | None
+    use_instructions: bool | None
+    training_datasets: dict[str, list[str]] | None
     adapted_from: str | None = None
     superseded_by: str | None = None
diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
index b4c2b97ac6..66822d41b0 100644
--- a/mteb/models/arctic_models.py
+++ b/mteb/models/arctic_models.py
@@ -94,7 +94,6 @@
     open_weights=True,
     framework=["Sentence Transformers", "PyTorch"],
     n_parameters=22_600_000,
-    memory_usage=None,
     max_tokens=512,
     embed_dim=384,
     license="apache-2.0",
@@ -103,8 +102,7 @@
     use_instructions=True,
     adapted_from="sentence-transformers/all-MiniLM-L6-v2",
     superseded_by=None,
-    public_training_data=False,  # couldn't find
-    public_training_code=False,  # couldn't find
+    public_training_code=None,  # couldn't find
     training_datasets={
         # source: https://arxiv.org/pdf/2405.05374
         # splits not specified so assuming everything
@@ -145,7 +143,6 @@
     open_weights=True,
     framework=["Sentence Transformers", "PyTorch"],
     n_parameters=32_200_000,
-    memory_usage=None,
     max_tokens=512,
     embed_dim=384,
     license="apache-2.0",
@@ -154,8 +151,7 @@
     use_instructions=True,
     adapted_from="intfloat/e5-small-unsupervised",
     superseded_by=None,
-    public_training_data=False,  # couldn't find
-    public_training_code=False,  # couldn't find
+    public_training_code=None,  # couldn't find
     training_datasets={
         # source: https://arxiv.org/pdf/2405.05374
         # splits not specified so assuming everything
@@ -196,7 +192,6 @@
     open_weights=True,
     framework=["Sentence Transformers", "PyTorch"],
     n_parameters=109_000_000,
-    memory_usage=None,
     max_tokens=512,
     embed_dim=768,
     license="apache-2.0",
@@ -205,8 +200,7 @@
     use_instructions=True,
     adapted_from="intfloat/e5-base-unsupervised",
     superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5",
-    public_training_data=False,  # couldn't find
-    public_training_code=False,  # couldn't find
+    public_training_code=None,  # couldn't find
     training_datasets={
         # source: https://arxiv.org/pdf/2405.05374
         # splits not specified so assuming everything
@@ -247,7 +241,6 @@
     open_weights=True,
     framework=["Sentence Transformers", "PyTorch"],
     n_parameters=137_000_000,
-    memory_usage=None,
     max_tokens=2048,
     embed_dim=768,
     license="apache-2.0",
@@ -256,8 +249,7 @@
     use_instructions=True,
     adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised",
     superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
-    public_training_data=False,  # couldn't find
-    public_training_code=False,  # couldn't find
+    public_training_code=None,  # couldn't find
     training_datasets={
         # source: https://arxiv.org/pdf/2405.05374
         # splits not specified so assuming everything
@@ -298,7 +290,6 @@
     open_weights=True,
     framework=["Sentence Transformers", "PyTorch"],
     n_parameters=335_000_000,
-    memory_usage=None,
     max_tokens=512,
     embed_dim=1024,
     license="apache-2.0",
@@ -307,8 +298,7 @@
     use_instructions=True,
     adapted_from="intfloat/e5-base-unsupervised",
     superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0",
-    public_training_data=False,  # couldn't find
-    public_training_code=False,  # couldn't find
+    public_training_code=None,  # couldn't find
     training_datasets={
         # source: https://arxiv.org/pdf/2405.05374
         # splits not specified so assuming everything
@@ -351,7 +341,6 @@
     open_weights=True,
     framework=["Sentence Transformers", "PyTorch"],
     n_parameters=109_000_000,
-    memory_usage=None,
     max_tokens=512,
     embed_dim=768,
     license="apache-2.0",
@@ -360,6 +349,8 @@
     use_instructions=True,
     adapted_from=None,
     superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
+    public_training_code=None,
+    training_datasets=None,
 )

 arctic_embed_m_v2_0 = ModelMeta(
@@ -376,7 +367,6 @@
     open_weights=True,
     framework=["Sentence Transformers", "PyTorch"],
     n_parameters=305_000_000,
-    memory_usage=None,
     max_tokens=8192,
     embed_dim=768,
     license="apache-2.0",
@@ -385,8 +375,7 @@
     use_instructions=True,
     adapted_from="Alibaba-NLP/gte-multilingual-base",
     superseded_by=None,
-    public_training_data=False,  # couldn't find
-    public_training_code=False,  # couldn't find
+    public_training_code=None,  # couldn't find
     training_datasets={
         # source: https://arxiv.org/pdf/2405.05374
         # splits not specified so assuming everything
@@ -426,7 +415,6 @@
     open_weights=True,
     framework=["Sentence Transformers", "PyTorch"],
     n_parameters=568_000_000,
-    memory_usage=None,
     max_tokens=8192,
     embed_dim=1024,
     license="apache-2.0",
@@ -435,8 +423,7 @@
     use_instructions=True,
     adapted_from="BAAI/bge-m3-retromae",
     superseded_by=None,
-    public_training_data=False,  # couldn't find
-    public_training_code=False,  # couldn't find
+    public_training_code=None,  # couldn't find
     training_datasets={
         # source: https://arxiv.org/pdf/2405.05374
         # splits not specified so assuming everything
diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
index 05547d6a04..d8270c573b 100644
--- a/mteb/models/bge_models.py
+++ b/mteb/models/bge_models.py
@@ -365,7 +365,6 @@
     revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
     release_date="2023-09-12",  # initial commit of hf model.
     n_parameters=24_000_000,
-    memory_usage=None,
     embed_dim=512,
     license="mit",
     max_tokens=512,
@@ -373,7 +372,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models), but the link is broken
     training_datasets=bge_training_data,
 )
@@ -391,7 +389,6 @@
     revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
     release_date="2023-09-11",  # initial commit of hf model.
     n_parameters=438_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="mit",
     max_tokens=512,
@@ -399,7 +396,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models), but the link is broken
     training_datasets=bge_training_data,
 )
@@ -417,7 +413,6 @@
     revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
     release_date="2023-09-12",  # initial commit of hf model.
     n_parameters=1_340_000_000,
-    memory_usage=None,
     embed_dim=1024,
     license="mit",
     max_tokens=512,
@@ -425,7 +420,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models), but the link is broken
     training_datasets=bge_training_data,
 )
@@ -443,7 +437,6 @@
     revision="7999e1d3359715c523056ef9478215996d62a620",
     release_date="2023-09-12",  # initial commit of hf model.
     n_parameters=24_000_000,
-    memory_usage=None,
     embed_dim=512,
     license="mit",
     max_tokens=512,
@@ -451,7 +444,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models), but the link is broken
     training_datasets=bge_chinese_training_data,
 )
@@ -469,7 +461,6 @@
     revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65",
     release_date="2023-09-11",  # initial commit of hf model.
     n_parameters=438_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="mit",
     max_tokens=512,
@@ -477,7 +468,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models), but the link is broken
     training_datasets=bge_chinese_training_data,
 )
@@ -495,7 +485,6 @@
     revision="79e7739b6ab944e86d6171e44d24c997fc1e0116",
     release_date="2023-09-12",  # initial commit of hf model.
     n_parameters=1_340_000_000,
-    memory_usage=None,
     embed_dim=1024,
     license="mit",
     max_tokens=512,
@@ -503,7 +492,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models), but the link is broken
     training_datasets=bge_chinese_training_data,
 )
@@ -520,7 +508,6 @@
     revision="5617a9f61b028005a4858fdac845db406aefb181",
     release_date="2024-06-28",
     n_parameters=568_000_000,
-    memory_usage=None,
     embed_dim=4096,
     license="mit",
     max_tokens=8194,
@@ -528,7 +515,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
-    public_training_data=True,
     public_training_code=None,
     training_datasets=bgem3_training_data,
 )
@@ -555,7 +541,6 @@
     revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
     release_date="2024-07-25",  # initial commit of hf model.
     n_parameters=9.24 * 1e9,
-    memory_usage=None,
     embed_dim=3584,  # from old C-MTEB leaderboard
     license="gemma",
     max_tokens=8192,  # from old C-MTEB leaderboard
@@ -563,7 +548,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=None,  # not disclosed
 )
diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py
index 4231752702..ea56fd432b 100644
--- a/mteb/models/bm25.py
+++ b/mteb/models/bm25.py
@@ -131,12 +131,13 @@ def encode(self, texts: list[str], **kwargs):
     revision="0_1_10",
     release_date="2024-07-10",  ## release of version 0.1.10
     n_parameters=None,
-    memory_usage=None,
     embed_dim=None,
     license=None,
     max_tokens=None,
-    reference=None,
+    reference="https://github.com/xhluca/bm25s",
     similarity_fn_name=None,
     framework=[],
     use_instructions=False,
+    public_training_code="https://github.com/xhluca/bm25s",
+    training_datasets=None,
 )
diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py
index 4b34045f89..8718a2e2a3 100644
--- a/mteb/models/cohere_models.py
+++ b/mteb/models/cohere_models.py
@@ -227,7 +227,6 @@ def encode(
     revision="1",
     release_date="2023-11-02",
     n_parameters=None,
-    memory_usage=None,
     max_tokens=None,
     embed_dim=512,
     reference="https://cohere.com/blog/introducing-embed-v3",
@@ -235,8 +234,7 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,  # assumed
     training_datasets=None,
 )
@@ -253,15 +251,13 @@ def encode(
     revision="1",
     release_date="2023-11-02",
     n_parameters=None,
-    memory_usage=None,
     max_tokens=512,
     embed_dim=1024,
     license=None,
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,  # assumed
     training_datasets=None,
 )
@@ -278,15 +274,13 @@ def encode(
     reference="https://cohere.com/blog/introducing-embed-v3",
     release_date="2023-11-02",
     n_parameters=None,
-    memory_usage=None,
     max_tokens=512,
     embed_dim=384,
     license=None,
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,  # assumed
     training_datasets=None,
 )
@@ -303,14 +297,12 @@ def encode(
     revision="1",
     release_date="2023-11-02",
     n_parameters=None,
-    memory_usage=None,
     max_tokens=512,
     embed_dim=384,
     license=None,
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,  # assumed
     training_datasets=None,
 )
diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py
index 8753791bff..87b5fdb93a 100644
--- a/mteb/models/colbert_models.py
+++ b/mteb/models/colbert_models.py
@@ -152,7 +152,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
     languages=["eng_Latn"],
     open_weights=True,
     revision="c1e84128e85ef755c096a95bdb06b47793b13acf",
-    public_training_code=True,
+    public_training_code=None,
     release_date="2024-09-21",
     n_parameters=110 * 1e6,
     max_tokens=180,  # Reduced for Benchmarking - see ColBERT paper
@@ -164,6 +164,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
     use_instructions=False,
     adapted_from=None,
     superseded_by=None,
+    training_datasets=None,
 )
@@ -203,7 +204,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
     ],
     open_weights=True,
     revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1",
-    public_training_code=False,
+    public_training_code=None,
     release_date="2024-08-16",
     n_parameters=559 * 1e6,
     max_tokens=8192,
@@ -215,4 +216,5 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
     use_instructions=False,
     adapted_from=None,
     superseded_by=None,
+    training_datasets=None,
 )
diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py
index 182a6ea4b2..f4d5909350 100644
--- a/mteb/models/e5_instruct.py
+++ b/mteb/models/e5_instruct.py
@@ -36,12 +36,10 @@
     use_instructions=True,
     reference="https://huggingface.co/intfloat/multilingual-e5-large-instruct",
     n_parameters=560_000_000,
-    memory_usage=None,
     embed_dim=1024,
     license="mit",
     max_tokens=514,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -68,11 +66,9 @@
     use_instructions=True,
     reference="https://huggingface.co/intfloat/e5-mistral-7b-instruct",
     n_parameters=7_111_000_000,
-    memory_usage=None,
     embed_dim=4096,
     license="mit",
     max_tokens=32768,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py
index 9537824e59..ace25ca08d 100644
--- a/mteb/models/e5_models.py
+++ b/mteb/models/e5_models.py
@@ -139,7 +139,6 @@
     revision="fd1525a9fd15316a2d503bf26ab031a61d056e98",
     release_date=E5_PAPER_RELEASE_DATE,
     n_parameters=118_000_000,
-    memory_usage=None,
     embed_dim=384,
     license="mit",
     max_tokens=512,
@@ -147,8 +146,7 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -164,7 +162,6 @@
     revision="d13f1b27baf31030b7fd040960d60d909913633f",
     release_date=E5_PAPER_RELEASE_DATE,
     n_parameters=278_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="mit",
     max_tokens=514,
@@ -172,8 +169,7 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -190,7 +186,6 @@
     revision="ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb",
     release_date=E5_PAPER_RELEASE_DATE,
     n_parameters=560_000_000,
-    memory_usage=None,
     embed_dim=1024,
     license="mit",
     max_tokens=514,
@@ -198,8 +193,7 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -215,7 +209,6 @@
     revision="dca8b1a9dae0d4575df2bf423a5edb485a431236",
     release_date=E5_PAPER_RELEASE_DATE,
     n_parameters=33_000_000,
-    memory_usage=None,
     embed_dim=384,
     license="mit",
     max_tokens=512,
@@ -223,8 +216,7 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -241,7 +233,6 @@
     revision="e272f3049e853b47cb5ca3952268c6662abda68f",
     release_date=E5_PAPER_RELEASE_DATE,
     n_parameters=33_000_000,
-    memory_usage=None,
     embed_dim=384,
     license="mit",
     max_tokens=512,
@@ -249,8 +240,7 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -267,7 +257,6 @@
     revision="1c644c92ad3ba1efdad3f1451a637716616a20e8",
     release_date=E5_PAPER_RELEASE_DATE,
     n_parameters=109_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="mit",
     max_tokens=512,
@@ -277,8 +266,7 @@
     use_instructions=True,
     superseded_by=None,
     adapted_from=None,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -295,7 +283,6 @@
     revision="b322e09026e4ea05f42beadf4d661fb4e101d311",
     release_date=E5_PAPER_RELEASE_DATE,
     n_parameters=335_000_000,
-    memory_usage=None,
     embed_dim=1024,
     license="mit",
     max_tokens=514,
@@ -305,8 +292,7 @@
     use_instructions=True,
     superseded_by=None,
     adapted_from=None,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -323,7 +309,6 @@
     revision="4dc6d853a804b9c8886ede6dda8a073b7dc08a81",
     release_date="2022-12-26",
     n_parameters=335_000_000,
-    memory_usage=None,
     embed_dim=1024,
     license="apache-2.0",
     max_tokens=512,
@@ -333,8 +318,7 @@
     use_instructions=True,
     superseded_by="intfloat/e5-large-v2",
     adapted_from=None,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
@@ -351,7 +335,6 @@
     revision="b533fe4636f4a2507c08ddab40644d20b0006d6a",
     release_date="2022-12-26",
     n_parameters=109_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="apache-2.0",
     max_tokens=512,
@@ -361,7 +344,6 @@
     use_instructions=True,
     superseded_by="intfloat/e5-base-v2",
     adapted_from=None,
-    public_training_data=False,
-    public_training_code=False,
+    public_training_code=None,
     training_datasets=E5_TRAINING_DATA,
 )
diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py
index 1b4a4a13ff..08065f7af0 100644
--- a/mteb/models/google_models.py
+++ b/mteb/models/google_models.py
@@ -145,15 +145,13 @@ def encode(
     revision="1",  # revision is intended for implementation
     release_date="2024-05-14",
     n_parameters=None,
-    memory_usage=None,
     max_tokens=2048,
     embed_dim=768,
     license=None,
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,  # assumed
     training_datasets=None,
 )
@@ -169,15 +167,13 @@ def encode(
     revision="1",  # revision is intended for implementation
     release_date="2024-11-18",
     n_parameters=None,
-    memory_usage=None,
     max_tokens=2048,
     embed_dim=768,
     license=None,
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,  # assumed
     training_datasets=None,
 )
@@ -193,14 +189,12 @@ def encode(
     revision="1",  # revision is intended for implementation
     release_date="2024-05-14",
     n_parameters=None,
-    memory_usage=None,
     max_tokens=2048,
     embed_dim=768,
     license=None,
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,  # assumed
     training_datasets=None,
 )
diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py
index a4f5befd19..a68502b06d 100644
--- a/mteb/models/gritlm_models.py
+++ b/mteb/models/gritlm_models.py
@@ -31,7 +31,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str:
     revision="13f00a0e36500c80ce12870ea513846a066004af",
     release_date="2024-02-15",
     n_parameters=7_240_000_000,
-    memory_usage=None,
     embed_dim=4096,
     license="apache-2.0",
     max_tokens=4096,
@@ -41,8 +40,7 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str:
     use_instructions=True,
     training_datasets=E5_TRAINING_DATA,  # source https://arxiv.org/pdf/2402.09906
     # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data
-    public_training_code=True,  # https://github.com/ContextualAI/gritlm
-    public_training_data=False,
+    public_training_code="https://github.com/ContextualAI/gritlm",
 )
 gritlm8x7b = ModelMeta(
     loader=partial(  # type: ignore
@@ -58,7 +56,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str:
     revision="7f089b13e3345510281733ca1e6ff871b5b4bc76",
     release_date="2024-02-15",
     n_parameters=57_920_000_000,
-    memory_usage=None,
     embed_dim=4096,
     license="apache-2.0",
     max_tokens=4096,
@@ -68,6 +65,5 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str:
     use_instructions=True,
     training_datasets=E5_TRAINING_DATA,  # source https://arxiv.org/pdf/2402.09906
     # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data
-    public_training_code=True,  # https://github.com/ContextualAI/gritlm
-    public_training_data=False,
+    public_training_code="https://github.com/ContextualAI/gritlm",
 )
diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py
index f800aaa941..da265e79c2 100644
--- a/mteb/models/gte_models.py
+++ b/mteb/models/gte_models.py
@@ -39,13 +39,15 @@ def instruction_template(
     revision="e26182b2122f4435e8b3ebecbf363990f409b45b",
     release_date="2024-06-15",  # initial commit of hf model.
     n_parameters=7_613_000_000,
-    memory_usage=None,
     embed_dim=3584,
     license="apache-2.0",
     reference="https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct",
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_code=None,
+    training_datasets=None,
+    max_tokens=131072,
 )
@@ -67,7 +69,6 @@ def instruction_template(
     revision="07d27e5226328010336563bc1b564a5e3436a298",
     release_date="2024-04-20",  # initial commit of hf model.
     n_parameters=7_720_000_000,
-    memory_usage=None,
     embed_dim=4096,
     license="apache-2.0",
     max_tokens=32768,
@@ -75,6 +76,8 @@ def instruction_template(
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_code=None,
+    training_datasets=None,
 )
@@ -96,7 +99,6 @@ def instruction_template(
     revision="c6c1b92f4a3e1b92b326ad29dd3c8433457df8dd",
     release_date="2024-07-29",  # initial commit of hf model.
     n_parameters=1_780_000_000,
-    memory_usage=None,
     embed_dim=8960,
     license="apache-2.0",
     max_tokens=131072,
@@ -104,6 +106,8 @@ def instruction_template(
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_code=None,
+    training_datasets=None,
 )

 gte_small_zh = ModelMeta(
@@ -118,7 +122,6 @@ def instruction_template(
     revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a",
     release_date="2023-11-08",  # initial commit of hf model.
     n_parameters=30.3 * 1e6,
-    memory_usage=None,
     embed_dim=1024,
     license="mit",
     max_tokens=512,
@@ -126,7 +129,6 @@ def instruction_template(
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
-    public_training_data=False,
     public_training_code=None,
     training_datasets=None,  # Not disclosed
 )
@@ -143,7 +145,6 @@ def instruction_template(
     revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c",
     release_date="2023-11-08",  # initial commit of hf model.
     n_parameters=102 * 1e6,
-    memory_usage=None,
     embed_dim=1024,
     license="mit",
     max_tokens=512,
@@ -151,7 +152,6 @@ def instruction_template(
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
-    public_training_data=False,
     public_training_code=None,
     training_datasets=None,  # Not disclosed
 )
@@ -168,7 +168,6 @@ def instruction_template(
     revision="64c364e579de308104a9b2c170ca009502f4f545",
     release_date="2023-11-08",  # initial commit of hf model.
     n_parameters=326 * 1e6,
-    memory_usage=None,
     embed_dim=1024,
     license="mit",
     max_tokens=512,
@@ -176,7 +175,6 @@ def instruction_template(
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
-    public_training_data=False,
     public_training_code=None,
     training_datasets=None,  # Not disclosed
 )
@@ -286,7 +284,6 @@ def instruction_template(
     revision="ca1791e0bcc104f6db161f27de1340241b13c5a4",
     release_date="2024-07-20",  # initial commit of hf model.
     n_parameters=305 * 1e6,
-    memory_usage=None,
     embed_dim=1024,
     license="apache-2",
     max_tokens=8192,
@@ -294,7 +291,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
-    public_training_data=True,
     public_training_code=None,  # couldn't find
     training_datasets=gte_multi_training_data,
 )
diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py
index c2443de233..78bad6097f 100644
--- a/mteb/models/ibm_granite_models.py
+++ b/mteb/models/ibm_granite_models.py
@@ -33,7 +33,6 @@
     revision="47db56afe692f731540413c67dd818ff492277e7",
     release_date="2024-12-18",
     n_parameters=107_000_000,
-    memory_usage=None,
     embed_dim=384,
     license="apache-2.0",
     max_tokens=512,
@@ -42,6 +41,9 @@
     framework=["Sentence Transformers", "PyTorch"],
     adapted_from=None,
     superseded_by=None,
+    public_training_code=None,
+    use_instructions=False,
+    training_datasets=None,
 )

 granite_278m_multilingual = ModelMeta(
@@ -56,7 +58,6 @@
     revision="84e3546b88b0cb69f8078608a1df558020bcbf1f",
     release_date="2024-12-18",
     n_parameters=278_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="apache-2.0",
     max_tokens=512,
@@ -65,6 +66,9 @@
     framework=["Sentence Transformers", "PyTorch"],
     adapted_from=None,
     superseded_by=None,
+    public_training_code=None,
+    use_instructions=False,
+    training_datasets=None,
 )

 granite_30m_english = ModelMeta(
@@ -79,7 +83,6 @@
     revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5",
     release_date="2024-12-18",
     n_parameters=30_000_000,
-    memory_usage=None,
     embed_dim=384,
     license="apache-2.0",
     max_tokens=512,
@@ -88,6 +91,9 @@
     framework=["Sentence Transformers", "PyTorch"],
     adapted_from=None,
     superseded_by=None,
+    public_training_code=None,
+    use_instructions=False,
+    training_datasets=None,
 )

 granite_125m_english = ModelMeta(
@@ -102,7 +108,6 @@
     revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730",
     release_date="2024-12-18",
     n_parameters=125_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="apache-2.0",
     max_tokens=512,
@@ -111,4 +116,7 @@
     framework=["Sentence Transformers", "PyTorch"],
     adapted_from=None,
     superseded_by=None,
+    public_training_code=None,
+    use_instructions=False,
+    training_datasets=None,
 )
diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py
index 4670b20735..dc31adccd2 100644
--- a/mteb/models/inf_models.py
+++ b/mteb/models/inf_models.py
@@ -17,7 +17,6 @@
     revision="d2d074546028c0012b5cc6af78c4fac24896e67f",
     release_date="2024-12-24",  # initial commit of hf model.
     n_parameters=7_069_121_024,
-    memory_usage=None,
     embed_dim=3584,
     license="apache-2.0",
     max_tokens=131_072,
@@ -26,7 +25,6 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
     adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct",
-    public_training_code=False,
-    public_training_data=False,
+    public_training_code=None,
     training_datasets=None,
 )
diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py
index 0062df2acc..1dc06d5640 100644
--- a/mteb/models/jasper_models.py
+++ b/mteb/models/jasper_models.py
@@ -81,7 +81,6 @@ def encode(
     revision="d6330ce98f8a0d741e781df845904c9484f00efa",
     release_date="2024-12-11",  # first commit
     n_parameters=1_999_000_000,
-    memory_usage=None,
     max_tokens=131072,
     embed_dim=8960,
     license="apache-2.0",
@@ -94,5 +93,4 @@ def encode(
     training_datasets=nvidia_training_datasets,  # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1
     # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"],
     public_training_code=None,
-    public_training_data=None,
 )
diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py
index 728ffaa98f..265d512371 100644
--- a/mteb/models/jina_models.py
+++ b/mteb/models/jina_models.py
@@ -223,8 +223,7 @@ def encode(
     use_instructions=True,
     reference="https://huggingface.co/jinaai/jina-embeddings-v3",
     training_datasets=None,
-    public_training_code=False,
-    public_training_data=False,
+    public_training_code=None,
 )
@@ -235,7 +234,6 @@ def encode(
     revision="6e85f575bc273f1fd840a658067d0157933c83f0",
     release_date="2023-09-27",
     n_parameters=137_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="apache-2.0",
     max_tokens=8192,
@@ -246,8 +244,7 @@ def encode(
     superseded_by=None,
     adapted_from=None,
     training_datasets=None,
-    public_training_code=False,
-    public_training_data=False,  # uses scrapes e.g. CC
+    public_training_code=None,
 )

 jina_embeddings_v2_small_en = ModelMeta(
@@ -257,7 +254,6 @@ def encode(
     revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef",
     release_date="2023-09-27",
     n_parameters=32_700_000,
-    memory_usage=None,
     embed_dim=512,
     license="apache-2.0",
     max_tokens=8192,
@@ -268,8 +264,7 @@ def encode(
     superseded_by=None,
     adapted_from=None,
     training_datasets=None,
-    public_training_code=False,
-    public_training_data=False,  # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]}
+    public_training_code=None,
 )

 jina_embedding_b_en_v1 = ModelMeta(
@@ -279,7 +274,6 @@ def encode(
     revision="aa0645035294a8c0607ce5bb700aba982cdff32c",
     release_date="2023-07-07",
     n_parameters=110_000_000,
-    memory_usage=None,
     embed_dim=768,
     license="apache-2.0",
     max_tokens=512,
@@ -290,8 +284,7 @@ def encode(
     superseded_by="jinaai/jina-embeddings-v2-base-en",
     adapted_from=None,
     training_datasets=None,
-    public_training_code=False,
-    public_training_data=False,  # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]}
+    public_training_code=None,
 )

 jina_embedding_s_en_v1 = ModelMeta(
@@ -301,7 +294,6 @@ def encode(
     revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08",
     release_date="2023-07-07",
     n_parameters=35_000_000,
-    memory_usage=None,
     embed_dim=512,
     license="apache-2.0",
     max_tokens=512,
@@ -312,6 +304,5 @@ def encode(
     superseded_by="jinaai/jina-embeddings-v2-small-en",
     adapted_from=None,
     training_datasets=None,
-    public_training_code=False,
-    public_training_data=False,  # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]}
+    public_training_code=None,
 )
diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py
index 4babbf75cf..11cfa74ed1 100644
--- a/mteb/models/linq_models.py
+++ b/mteb/models/linq_models.py
@@ -32,7 +32,6 @@ def instruction_template(
     revision="0c1a0b0589177079acc552433cad51d7c9132379",
     release_date="2024-05-29",  # initial commit of hf model.
     n_parameters=7_110_000_000,
-    memory_usage=None,
     embed_dim=4096,
     license="cc-by-nc-4.0",
     max_tokens=32768,
@@ -40,4 +39,6 @@ def instruction_template(
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_code=None,
+    training_datasets=None,
 )
diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py
index cbc42fe5ed..a5f1a69a36 100644
--- a/mteb/models/llm2vec_models.py
+++ b/mteb/models/llm2vec_models.py
@@ -117,7 +117,6 @@ def loader_inner(**kwargs: Any) -> Encoder:
     revision="baa8ebf04a1c2500e61288e7dad65e8ae42601a7",  # TODO: Not sure what to put here as a model is made of two peft repos, each with a different revision
     release_date="2024-04-09",
     n_parameters=7_505_000_000,
-    memory_usage=None,
     max_tokens=8192,
     embed_dim=4096,
     license="mit",
@@ -125,8 +124,7 @@ def loader_inner(**kwargs: Any) -> Encoder:
     similarity_fn_name="cosine",
     framework=["LLM2Vec", "PyTorch"],
     use_instructions=True,
-    public_training_code=True,
-    public_training_data=True,
+    public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
     training_datasets=llm2vec_supervised_training_data,
 )
@@ -144,7 +142,6 @@ def loader_inner(**kwargs: Any) -> Encoder:
     revision="1cb7b735326d13a8541db8f57f35da5373f5e9c6",
     release_date="2024-04-09",
     n_parameters=7_505_000_000,
-    memory_usage=None,
     max_tokens=8192,
     embed_dim=4096,
     license="mit",
@@ -152,8 +149,7 @@ def loader_inner(**kwargs: Any) -> Encoder:
     similarity_fn_name="cosine",
     framework=["LLM2Vec", "PyTorch"],
     use_instructions=True,
-    public_training_code=True,
-    public_training_data=True,
+    public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
     training_datasets={},
 )
@@ -172,7 +168,6 @@ def loader_inner(**kwargs: Any) -> Encoder:
     revision="0ae69bdd5816105778b971c3138e8f8a18eaa3ae",
     release_date="2024-04-09",
     n_parameters=7_111_000_000,
-    memory_usage=None,
     max_tokens=32768,
     embed_dim=4096,
     license="mit",
@@ -180,8 +175,7 @@ def loader_inner(**kwargs: Any) -> Encoder:
     similarity_fn_name="cosine",
     framework=["LLM2Vec", "PyTorch"],
     use_instructions=True,
-    public_training_code=True,
-    public_training_data=True,
+    public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
     training_datasets=llm2vec_supervised_training_data,
 )
@@ -199,7 +193,6 @@ def loader_inner(**kwargs: Any) -> Encoder:
     revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8",
     release_date="2024-04-09",
     n_parameters=7_111_000_000,
-    memory_usage=None,
     max_tokens=32768,
     embed_dim=4096,
     license="mit",
@@ -207,8 +200,7 @@ def loader_inner(**kwargs: Any) -> Encoder:
     similarity_fn_name="cosine",
     framework=["LLM2Vec", "PyTorch"],
     use_instructions=True,
-    public_training_code=True,
-    public_training_data=True,
+    public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs",
     training_datasets={},
 )
@@ -226,7 +218,6 @@ def loader_inner(**kwargs: Any) -> Encoder:
revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -234,8 +225,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, ) @@ -253,7 +243,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="a76944871d169ebe7c97eb921764cd063afed785", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -261,8 +250,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, ) @@ -280,7 +268,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="a5943d406c6b016fef3f07906aac183cf1a0b47d", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -288,8 +275,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, ) @@ -307,7 +293,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="a5943d406c6b016fef3f07906aac183cf1a0b47d", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -315,7 +300,6 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 09e423240e..5233ecec6b 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -17,12 +17,10 @@ languages=["eng_Latn"], loader=None, n_parameters=7110660096, - memory_usage=None, max_tokens=32768.0, embed_dim=None, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/Haon-Chen/speed-embedding-7b-instruct", @@ -39,12 +37,10 @@ languages=[], loader=None, n_parameters=278043648, - memory_usage=None, max_tokens=514.0, embed_dim=768, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", @@ -61,12 +57,10 @@ languages=None, loader=None, n_parameters=494032768, - memory_usage=None, max_tokens=131072.0, embed_dim=896, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], 
reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", @@ -83,12 +77,10 @@ languages=None, loader=None, n_parameters=494032768, - memory_usage=None, max_tokens=131072.0, embed_dim=896, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", @@ -105,12 +97,10 @@ languages=["eng_Latn"], loader=None, n_parameters=None, - memory_usage=None, max_tokens=None, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/Hum-Works/lodestone-base-4096-v1", @@ -169,12 +159,10 @@ languages=[], loader=None, n_parameters=2506172416, - memory_usage=None, max_tokens=8192.0, embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Jaume/gemma-2b-embeddings", @@ -191,12 +179,10 @@ languages=["eng_Latn"], loader=None, n_parameters=7241732096, - memory_usage=None, max_tokens=32768.0, embed_dim=None, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", @@ -219,12 +205,10 @@ trust_remote_code=True, ), n_parameters=278043648, - memory_usage=None, max_tokens=514.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", @@ -246,12 +230,10 @@ trust_remote_code=True, ), n_parameters=559890432, - memory_usage=None, max_tokens=514.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", @@ -273,12 +255,10 @@ trust_remote_code=True, ), n_parameters=117653760, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", @@ -295,12 +275,10 @@ languages=None, loader=None, n_parameters=17389824, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", @@ -318,12 +296,10 @@ languages=None, loader=None, n_parameters=22713216, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", @@ -341,12 +317,10 @@ languages=None, loader=None, n_parameters=15615360, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Squirtle", @@ -364,12 +338,10 @@ languages=None, loader=None, n_parameters=15615360, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - 
public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Venusaur", @@ -387,12 +359,10 @@ languages=None, loader=None, n_parameters=17389824, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Wartortle", @@ -410,12 +380,10 @@ languages=None, loader=None, n_parameters=17389824, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro", @@ -432,12 +400,10 @@ languages=None, loader=None, n_parameters=19164288, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro-v4", @@ -454,12 +420,10 @@ languages=["fra_Latn"], loader=None, n_parameters=559890432, - memory_usage=None, max_tokens=514.0, embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/OrdalieTech/Solon-embeddings-large-0.1", @@ -476,12 +440,10 @@ languages=["ara_Arab"], loader=None, n_parameters=135193344, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", @@ -498,12 +460,10 @@ languages=["ara_Arab"], loader=None, n_parameters=117653760, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", @@ -522,12 +482,10 @@ languages=["ara_Arab"], loader=None, n_parameters=278043648, - memory_usage=None, max_tokens=514.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", @@ -546,12 +504,10 @@ languages=["ara_Arab"], loader=None, n_parameters=470926848, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", @@ -570,12 +526,10 @@ languages=["ara_Arab"], loader=None, n_parameters=109486464, - memory_usage=None, max_tokens=514.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", @@ -594,12 +548,10 @@ languages=["ara_Arab"], loader=None, n_parameters=162841344, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="apache-2.0", open_weights=True, - 
public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", @@ -616,12 +568,10 @@ languages=None, loader=None, n_parameters=None, - memory_usage=None, max_tokens=512.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-lunaris-text-embeddings", @@ -638,12 +588,10 @@ languages=None, loader=None, n_parameters=None, - memory_usage=None, max_tokens=514.0, embed_dim=768, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-stellaris-text-embeddings", @@ -660,12 +608,10 @@ languages=None, loader=None, n_parameters=567754752, - memory_usage=None, max_tokens=8194.0, embed_dim=1024, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/bge-m3-custom-fr", @@ -682,12 +628,10 @@ languages=None, loader=None, n_parameters=1279887360, - memory_usage=None, max_tokens=2048.0, embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.2", @@ -704,12 +648,10 @@ languages=None, loader=None, n_parameters=1279887360, - memory_usage=None, max_tokens=2048.0, embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.3", @@ -726,12 +668,10 @@ languages=["fra_Latn", "eng_Latn"], loader=None, n_parameters=1279887360, - memory_usage=None, max_tokens=2048.0, embed_dim=2048, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", @@ -749,12 +689,10 @@ languages=["eng_Latn"], loader=None, n_parameters=109482752, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-base", @@ -771,12 +709,10 @@ languages=["eng_Latn"], loader=None, n_parameters=335142400, - memory_usage=None, max_tokens=512.0, embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-large", @@ -793,12 +729,10 @@ languages=["eng_Latn"], loader=None, n_parameters=33360512, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-small", @@ -815,12 +749,10 @@ languages=["pol_Latn"], loader=None, n_parameters=103705344, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="gpl-3.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1", @@ -837,12 +769,10 @@ languages=["pol_Latn"], loader=None, n_parameters=None, - memory_usage=None, 
     max_tokens=514.0,
     embed_dim=768,
     license="lgpl",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1",
@@ -859,12 +789,10 @@
     languages=["pol_Latn"],
     loader=None,
     n_parameters=278043648,
-    memory_usage=None,
     max_tokens=514.0,
     embed_dim=768,
     license="apache-2.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/sdadas/mmlw-e5-base",
@@ -881,12 +809,10 @@
     languages=["eng_Latn"],
     loader=None,
     n_parameters=None,
-    memory_usage=None,
     max_tokens=4096.0,
     embed_dim=None,
     license="mit",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/dwzhu/e5-base-4k",
@@ -903,12 +829,10 @@
     languages=["pol_Latn"],
     loader=None,
     n_parameters=559890432,
-    memory_usage=None,
     max_tokens=514.0,
     embed_dim=1024,
     license="apache-2.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/sdadas/mmlw-e5-large",
@@ -925,12 +849,10 @@
     languages=["pol_Latn"],
     loader=None,
     n_parameters=117653760,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=384,
     license="apache-2.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/sdadas/mmlw-e5-small",
@@ -947,12 +869,10 @@
     languages=["pol_Latn"],
     loader=None,
     n_parameters=124442880,
-    memory_usage=None,
     max_tokens=514.0,
     embed_dim=768,
     license="apache-2.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/sdadas/mmlw-roberta-base",
@@ -969,12 +889,10 @@
     languages=["pol_Latn"],
     loader=None,
     n_parameters=434961408,
-    memory_usage=None,
     max_tokens=514.0,
     embed_dim=1024,
     license="apache-2.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/sdadas/mmlw-roberta-large",
@@ -1037,12 +955,10 @@
     ],
     loader=None,
     n_parameters=None,
-    memory_usage=None,
     max_tokens=None,
     embed_dim=None,
     license="bigscience-bloom-rail-1.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/izhx/udever-bloom-1b1",
@@ -1105,12 +1021,10 @@
     ],
     loader=None,
     n_parameters=None,
-    memory_usage=None,
     max_tokens=None,
     embed_dim=None,
     license="bigscience-bloom-rail-1.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/izhx/udever-bloom-3b",
@@ -1173,12 +1087,10 @@
     ],
     loader=None,
     n_parameters=None,
-    memory_usage=None,
     max_tokens=None,
     embed_dim=None,
     license="bigscience-bloom-rail-1.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/izhx/udever-bloom-560m",
@@ -1241,12 +1153,10 @@
     ],
     loader=None,
     n_parameters=None,
-    memory_usage=None,
     max_tokens=None,
     embed_dim=None,
     license="bigscience-bloom-rail-1.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/izhx/udever-bloom-7b1",
@@ -1263,12 +1173,10 @@
     languages=["eng_Latn"],
     loader=None,
     n_parameters=109482240,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=768,
     license="mit",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch", "Sentence Transformers"],
     reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0",
@@ -1285,12 +1193,10 @@
     languages=["eng_Latn"],
     loader=None,
     n_parameters=22713216,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=384,
     license="mit",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch", "Sentence Transformers"],
     reference="https://huggingface.co/avsolatorio/GIST-all-MiniLM-L6-v2",
@@ -1307,12 +1213,10 @@
     languages=["eng_Latn"],
     loader=None,
     n_parameters=335141888,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=1024,
     license="mit",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch", "Sentence Transformers"],
     reference="https://huggingface.co/avsolatorio/GIST-large-Embedding-v0",
@@ -1329,12 +1233,10 @@
     languages=["eng_Latn"],
     loader=None,
     n_parameters=33360000,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=384,
     license="mit",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch", "Sentence Transformers"],
     reference="https://huggingface.co/avsolatorio/GIST-small-Embedding-v0",
@@ -1351,12 +1253,10 @@
     languages=None,
     loader=None,
     n_parameters=None,
-    memory_usage=None,
     max_tokens=None,
     embed_dim=4096,
     license=None,
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/bigscience/sgpt-bloom-7b1-msmarco",
@@ -1373,12 +1273,10 @@
     languages=["deu_Latn"],
    loader=None,
     n_parameters=335736320,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=1024,
     license=None,
     open_weights=True,
-    public_training_data=True,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/aari1995/German_Semantic_STS_V2",
@@ -1396,12 +1294,10 @@
     languages=["eng_Latn"],
     loader=None,
     n_parameters=33360000,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=384,
     license="apache-2.0",
     open_weights=True,
-    public_training_data=True,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/abhinand/MedEmbed-small-v0.1",
@@ -1424,12 +1320,10 @@
     languages=["eng_Latn"],
     loader=None,
     n_parameters=33360000,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=384,
     license="mit",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch", "Sentence Transformers"],
     reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0",
@@ -1446,12 +1340,10 @@
     languages=["eng_Latn"],
     loader=None,
     n_parameters=22713216,
-    memory_usage=None,
     max_tokens=512.0,
     embed_dim=384,
     license="apache-2.0",
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch", "Sentence Transformers"],
     reference="https://huggingface.co/brahmairesearch/slx-v0.1",
@@ -1468,12 +1360,10 @@
     languages=None,
     loader=None,
     n_parameters=None,
-    memory_usage=None,
     max_tokens=514.0,
     embed_dim=768,
     license=None,
     open_weights=True,
-    public_training_data=False,
     public_training_code=None,
     framework=["PyTorch"],
     reference="https://huggingface.co/deepfile/embedder-100p",
@@ -1490,12 +1380,10 @@
     languages=["rus_Cyrl"],
     loader=None,
     n_parameters=359026688,
-    memory_usage=None,
     max_tokens=8194.0,
     embed_dim=1024,
     license="apache-2.0",
     open_weights=True,
-    public_training_data=True,
     public_training_code=None,
     framework=["PyTorch", "Sentence Transformers"],
     reference="https://huggingface.co/deepvk/USER-bge-m3",
@@ -1523,12 +1411,10 @@
     languages=["eng_Latn"],
     loader=None,
n_parameters=None, - memory_usage=None, max_tokens=512.0, embed_dim=None, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/infgrad/stella-base-en-v2", @@ -1545,12 +1431,10 @@ languages=None, loader=None, n_parameters=98688000, - memory_usage=None, max_tokens=512.0, embed_dim=1024, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/malenia1/ternary-weight-embedding", @@ -1567,12 +1451,10 @@ languages=["ara_Arab", "eng_Latn"], loader=None, n_parameters=559890432, - memory_usage=None, max_tokens=514.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/omarelshehy/arabic-english-sts-matryoshka", @@ -1599,12 +1481,10 @@ release_date="2024-09-04", languages=["zho_Hans", "eng_Latn"], n_parameters=2724880896, - memory_usage=None, max_tokens=512.0, embed_dim=2304, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/openbmb/MiniCPM-Embedding", @@ -1631,12 +1511,10 @@ ], loader=None, n_parameters=117654272, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/shibing624/text2vec-base-multilingual", @@ -1654,12 +1532,10 @@ languages=["ara_Arab", "eng_Latn"], loader=None, n_parameters=135193344, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/silma-ai/silma-embeddding-matryoshka-v0.1", @@ -1676,12 +1552,10 @@ languages=["eng_Latn"], loader=None, n_parameters=7110660096, - memory_usage=None, max_tokens=32768.0, embed_dim=4096, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", @@ -1698,12 +1572,10 @@ languages=["zho_Hans"], loader=None, n_parameters=None, # Not visible on repo - memory_usage=None, max_tokens=512, embed_dim=128, license="apache-2", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1", @@ -1724,12 +1596,10 @@ languages=["zho_Hans"], loader=None, n_parameters=74.2 * 1e6, - memory_usage=None, max_tokens=1024, embed_dim=768, license="apache-2", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/", @@ -1745,12 +1615,10 @@ languages=["zho_Hans"], loader=None, n_parameters=326 * 1e6, - memory_usage=None, max_tokens=512, embed_dim=1024, license="not specified", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding", @@ -1767,12 +1635,10 @@ languages=["zho_Hans"], loader=None, n_parameters=326 * 1e6, - memory_usage=None, max_tokens=512, 
embed_dim=768, license="not specified", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding-v2", @@ -1789,12 +1655,10 @@ languages=["zho_Hans"], loader=None, n_parameters=326 * 1e6, - memory_usage=None, max_tokens=512, embed_dim=1024, license="not specified", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", @@ -1811,12 +1675,10 @@ languages=["zho_Hans"], loader=None, n_parameters=326 * 1e6, - memory_usage=None, max_tokens=512, embed_dim=768, license="cc-by-nc-4.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 1a58bbf8e3..afbf9df627 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -75,8 +75,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) @@ -101,8 +100,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) m2v_base_output = ModelMeta( @@ -126,8 +124,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) m2v_multilingual_output = ModelMeta( @@ -151,8 +148,7 @@ def encode( adapted_from="sentence-transformers/LaBSE", superseded_by=None, training_datasets=None, - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) potion_base_2m = ModelMeta( @@ -176,8 +172,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) potion_base_4m = ModelMeta( @@ -201,8 +196,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) potion_base_8m = ModelMeta( @@ -226,6 +220,5 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py index cf9b96f881..d3943d78d7 100644 --- a/mteb/models/moka_models.py +++ b/mteb/models/moka_models.py @@ -86,7 +86,6
@@ revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c", release_date="2023-06-06", # first commit n_parameters=102 * 1e6, - memory_usage=None, embed_dim=768, # They don't give a specific license but commercial use is not allowed license="unspecified-noncommercial", @@ -97,8 +96,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Not published - public_training_data=False, # They haven't published it yet + public_training_code=None, # Not published training_datasets=m3e_dataset, ) @@ -109,7 +107,6 @@ revision="44c696631b2a8c200220aaaad5f987f096e986df", release_date="2023-06-02", # first commit n_parameters=None, # Can't be seen on HF page - memory_usage=None, embed_dim=512, # They don't give a specific license but commercial use is not allowed license="unspecified-noncommercial", @@ -120,8 +117,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Not published - public_training_data=False, # They haven't published it yet + public_training_code=None, # Not published training_datasets=m3e_dataset, ) @@ -133,7 +129,6 @@ revision="12900375086c37ba5d83d1e417b21dc7d1d1f388", release_date="2023-06-21", # first commit n_parameters=None, # Can't be seen on HF page - memory_usage=None, embed_dim=768, # They don't give a specific license but commercial use is not allowed license="unspecified-noncommercial", @@ -144,7 +139,6 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Not published - public_training_data=False, # They haven't published it yet + public_training_code=None, # Not published training_datasets=m3e_dataset, ) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index 5dfb9dc42a..04978a190d 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -19,7 +19,6 @@ revision="990580e27d329c7408b3741ecff85876e128e203", release_date="2024-03-07", # initial commit of hf model. 
n_parameters=335_000_000, - memory_usage=None, max_tokens=512, embed_dim=1024, license="apache-2.0", @@ -27,4 +26,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/no_instruct_sentence_models.py b/mteb/models/no_instruct_sentence_models.py index 019cfe7e04..a0596b9bd1 100644 --- a/mteb/models/no_instruct_sentence_models.py +++ b/mteb/models/no_instruct_sentence_models.py @@ -90,7 +90,6 @@ def encode( # type: ignore revision="b38747000553d8268915c95a55fc87e707c9aadd", release_date="2024-05-01", # first commit n_parameters=33_400_000, - memory_usage=None, max_tokens=512, embed_dim=384, license="mit", @@ -100,4 +99,6 @@ def encode( # type: ignore use_instructions=False, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index aa6989941f..5d9da7b596 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -117,7 +117,6 @@ def encode( # type: ignore revision="b0753ae76394dd36bcfb912a46018088bca48be0", release_date="2024-02-10", # first commit n_parameters=137_000_000, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -127,6 +126,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) nomic_embed_v1 = ModelMeta( @@ -143,7 +144,6 @@ def encode( # type: ignore revision="0759316f275aa0cb93a5b830973843ca66babcf5", release_date="2024-01-31", # first commit n_parameters=None, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -153,6 +153,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", + public_training_code=None, + training_datasets=None, ) nomic_embed_v1_ablated = ModelMeta( @@ -169,7 +171,6 @@ def encode( # type: ignore revision="7d948905c5d5d3874fa55a925d68e49dbf411e5f", release_date="2024-01-15", # first commit n_parameters=None, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -179,6 +180,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) @@ -196,7 +199,6 @@ def encode( # type: ignore revision="b53d557b15ae63852847c222d336c1609eced93c", release_date="2024-01-15", # first commit n_parameters=None, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -206,6 +208,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) nomic_modern_bert_embed = ModelMeta( @@ -224,7 +228,6 @@ def encode( # type: ignore revision="5960f1566fb7cb1adf1eb6e816639cf4646d9b12", release_date="2024-12-29", n_parameters=149_000_000, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -234,4 +237,6 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 6bf4e041aa..1f345a62be 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -132,7 +132,6 @@ def encode( revision="7604d305b621f14095a1aa23d351674c2859553a", release_date="2024-09-09", # initial commit of hf model. 
n_parameters=7_850_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -142,7 +141,6 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, - public_training_data=True, ) NV_embed_v1 = ModelMeta( @@ -157,7 +155,6 @@ def encode( revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c", release_date="2024-09-13", # initial commit of hf model. n_parameters=7_850_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -167,5 +164,4 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, - public_training_data=True, ) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 619a4a747f..863c9d7828 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -130,14 +130,12 @@ def _to_numpy(self, embedding_response) -> np.ndarray: embed_dim=1536, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://openai.com/index/new-embedding-models-and-api-updates/", similarity_fn_name="cosine", framework=["API"], use_instructions=False, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) text_embedding_3_large = ModelMeta( @@ -158,10 +156,10 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - memory_usage=None, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, + license=None, + similarity_fn_name=None, ) text_embedding_ada_002 = ModelMeta( name="openai/text-embedding-ada-002", @@ -181,8 +179,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - memory_usage=None, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, + license=None, + similarity_fn_name=None, ) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index ea0fa1524c..ad93efb314 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -212,22 +212,39 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: frameworks.append("Sentence Transformers") return ModelMeta( name=model_name, - revision=None, + revision=card_data.get("base_model_revision", None), # TODO release_date=None, # TODO: We need a mapping between conflicting language codes languages=None, license=card_data.get("license", None), framework=frameworks, - public_training_data=bool(card_data.get("datasets", None)), + training_datasets=card_data.get("datasets", None), + similarity_fn_name=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + open_weights=True, + public_training_code=None, + use_instructions=None, ) except Exception as e: logger.warning(f"Failed to extract metadata from model: {e}.") return ModelMeta( - name=None, + name=model_name, revision=None, languages=None, release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], ) @@ -250,6 +267,14 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe languages=languages, framework=["Sentence Transformers"], 
similarity_fn_name=model.similarity_fn_name, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + use_instructions=None, + training_datasets=None, ) except AttributeError as e: logger.warning( @@ -260,5 +285,15 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe revision=None, languages=None, release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], ) return meta diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py index 17ea1fc2a9..bb92b55673 100644 --- a/mteb/models/piccolo_models.py +++ b/mteb/models/piccolo_models.py @@ -11,7 +11,6 @@ revision="47c0a63b8f667c3482e05b2fd45577bb19252196", release_date="2023-09-04", # first commit n_parameters=None, # can't see on model card - memory_usage=None, embed_dim=768, license="mit", max_tokens=512, @@ -21,8 +20,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets=None, # They don't specify ) @@ -34,7 +32,6 @@ revision="05948c1d889355936bdf9db7d30df57dd78d25a3", release_date="2024-04-22", # first commit n_parameters=None, # we don't know because they removed the model - memory_usage=None, embed_dim=1024, license="not specified", max_tokens=512, @@ -44,7 +41,6 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets=None, # They don't say ) diff --git a/mteb/models/promptriever_models.py b/mteb/models/promptriever_models.py index 7fc94cd36d..a7066817a4 100644 --- a/mteb/models/promptriever_models.py +++ b/mteb/models/promptriever_models.py @@ -56,7 +56,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision release_date="2024-09-15", n_parameters=7_000_000, - memory_usage=None, max_tokens=4096, embed_dim=4096, license="apache-2.0", @@ -65,6 +64,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) promptriever_llama3 = ModelMeta( @@ -82,7 +82,6 @@ def loader_inner(**kwargs: Any) -> Encoder: training_datasets={"samaya-ai/msmarco-w-instructions": ["train"]}, release_date="2024-09-15", n_parameters=8_000_000, - memory_usage=None, max_tokens=8192, embed_dim=4096, license="apache-2.0", @@ -90,6 +89,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) @@ -107,7 +107,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision release_date="2024-09-15", n_parameters=8_000_000, - memory_usage=None, max_tokens=8192, embed_dim=4096, training_datasets={"samaya-ai/msmarco-w-instructions": ["train"]}, @@ -116,6 +115,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) promptriever_mistral_v1 = ModelMeta( @@ -133,7 +133,6 @@ def loader_inner(**kwargs: Any) -> Encoder: 
release_date="2024-09-15", n_parameters=7_000_000, training_datasets={"samaya-ai/msmarco-w-instructions": ["train"]}, - memory_usage=None, max_tokens=4096, embed_dim=4096, license="apache-2.0", @@ -141,4 +140,5 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index a1f1ba727a..5ae4c0d8cb 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -142,7 +142,6 @@ def loader_inner(**kwargs: Any) -> Encoder: release_date="2023-10-11", training_datasets={"Tevatron/msmarco-passage-aug": ["train"]}, n_parameters=7_000_000, - memory_usage=None, max_tokens=4096, embed_dim=4096, license="apache-2.0", @@ -150,6 +149,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) @@ -168,7 +168,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="01c7f73d771dfac7d292323805ebc428287df4f9-ad5c1d0938a1e02954bcafb4d811ba2f34052e71", # base-peft revision release_date="2024-09-15", n_parameters=7_000_000, - memory_usage=None, max_tokens=4096, embed_dim=4096, license="apache-2.0", @@ -176,4 +175,6 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index e8bb483a3d..5609fdf83a 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -204,6 +204,15 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="0a97706f3827389da43b83348d5d18c9d53876fa", release_date="2020-05-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28 @@ -219,6 +228,15 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="126747772a932960028d9f4dc93bd5d9c4869be4", release_date="2024-09-26", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) bge_reranker_v2_m3 = ModelMeta( @@ -266,4 +284,13 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="953dc6f6f85a1b2dbfca4c34a2796e7dde08d41e", release_date="2024-06-24", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index 6dfae3b0a2..5bc50bad70 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -296,6 +296,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="77f8e3f7b1eb1afe353aa21a7c3a2fc8feca702e", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + 
use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) monot5_base = ModelMeta( @@ -310,6 +319,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="f15657ab3d2a5dd0b9a30c8c0b6a0a73c9cb5884", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) monot5_large = ModelMeta( @@ -324,6 +342,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="48cfad1d8dd587670393f27ee8ec41fde63e3d98", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) monot5_3b = ModelMeta( @@ -338,6 +365,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="bc0c419a438c81f592f878ce32430a1823f5db6c", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) flant5_base = ModelMeta( @@ -364,6 +400,14 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_large = ModelMeta( @@ -390,6 +434,14 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_xl = ModelMeta( @@ -416,6 +468,14 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_xxl = ModelMeta( @@ -442,6 +502,14 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) @@ -457,6 +525,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9", release_date="2023-07-18", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) llama2_7b_chat = ModelMeta( @@ -471,6 +548,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="f5db02db724555f92da89c216ac04704f23d4590", release_date="2023-07-18", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) mistral_7b = ModelMeta( @@ -485,6 +571,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="3ad372fc79158a2148299e3318516c786aeded6c", release_date="2023-12-11", + 
n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) followir_7b = ModelMeta( @@ -500,6 +595,14 @@ def get_prediction_tokens(self, *args, **kwargs): revision="4d25d437e38b510c01852070c0731e8f6e1875d1", release_date="2024-04-29", training_datasets={"jhu-clsp/FollowIR-train": ["train"]}, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) @@ -620,6 +723,14 @@ def get_prediction_tokens(self, *args, **kwargs): revision="cc0a949b9f21efcaba45c8cabb998ad02ce8d4e7", release_date="2022-01-05", training_datasets={"msmarco": ["train"]}, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) mt5_13b_mmarco_100k = ModelMeta( @@ -634,4 +745,13 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="e1a4317e102a525ea9e16745ad21394a4f1bffbc", release_date="2022-11-04", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 6bca544b11..d8c7e84518 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -15,7 +15,6 @@ revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", release_date="2021-10-28", n_parameters=29_400_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=2048, @@ -23,6 +22,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_tiny = ModelMeta( @@ -32,7 +33,6 @@ revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", release_date="2021-05-24", n_parameters=29_400_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=2048, @@ -40,6 +40,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) sbert_large_nlu_ru = ModelMeta( @@ -49,7 +51,6 @@ revision="af977d5dfa46a3635e29bf0ef383f2df2a08d47a", release_date="2020-11-20", n_parameters=427_000_000, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=512, # best guess @@ -57,6 +58,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) sbert_large_mt_nlu_ru = ModelMeta( @@ -66,7 +69,6 @@ revision="05300876c2b83f46d3ddd422a7f17e45cf633bb0", release_date="2021-05-18", n_parameters=427_000_000, - memory_usage=None, embed_dim=1024, license="Not specified", max_tokens=512, # best guess @@ -74,6 +76,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) user_base_ru = ModelMeta( @@ -89,7 +93,6 @@ revision="436a489a2087d61aa670b3496a9915f84e46c861", release_date="2024-06-10", n_parameters=427_000_000, - memory_usage=None, embed_dim=1024, license="Not specified", max_tokens=512, # best guess @@ -120,6 +123,7 @@ # "bragovo/dsum_ru": ["train"], # "CarlBrendt/Summ_Dialog_News": ["train"], }, + 
public_training_code=None, ) deberta_v1_ru = ModelMeta( @@ -129,7 +133,6 @@ revision="bdd30b0e19757e6940c92c7aff19e8fc0a60dff4", release_date="2023-02-07", n_parameters=124_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -137,6 +140,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_base_cased = ModelMeta( @@ -146,7 +151,6 @@ revision="4036cab694767a299f2b9e6492909664d9414229", release_date="2020-03-04", n_parameters=1280_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, # best guess @@ -154,6 +158,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) distilrubert_small_cased_conversational = ModelMeta( @@ -163,7 +169,6 @@ revision="e348066b4a7279b97138038299bddc6580a9169a", release_date="2022-06-28", n_parameters=107_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -171,6 +176,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_base_cased_sentence = ModelMeta( @@ -180,7 +187,6 @@ revision="78b5122d6365337dd4114281b0d08cd1edbb3bc8", release_date="2020-03-04", n_parameters=107_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -188,6 +194,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) labse_en_ru = ModelMeta( @@ -197,7 +205,6 @@ revision="cf0714e606d4af551e14ad69a7929cd6b0da7f7e", release_date="2021-06-10", n_parameters=129_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -205,6 +212,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_tiny_turbo = ModelMeta( @@ -214,7 +223,6 @@ revision="8ce0cf757446ce9bb2d5f5a4ac8103c7a1049054", release_date="2024-06-21", n_parameters=129_000_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=512, @@ -222,6 +230,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, training_datasets=None, # source model is unknown # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) @@ -233,7 +242,6 @@ revision="1940b046c6b5e125df11722b899130329d0a46da", release_date="2024-06-27", n_parameters=129_000_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=512, @@ -243,6 +251,7 @@ use_instructions=False, training_datasets=None, # source model is unknown # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + public_training_code=None, ) @@ -264,4 +273,12 @@ revision="89fb1651989adbb1cfcfdedafd7d102951ad0555", release_date="2024-07-29", use_instructions=True, + n_parameters=404_000_000, + max_tokens=514, + embed_dim=1024, + license="mit", + similarity_fn_name="cosine", + public_training_code=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index 18db09a2b5..4d4a60b621 100644 --- a/mteb/models/salesforce_models.py +++
b/mteb/models/salesforce_models.py @@ -32,7 +32,6 @@ def instruction_template( revision="91762139d94ed4371a9fa31db5551272e0b83818", release_date="2024-06-14", # initial commit of hf model. n_parameters=7_110_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -41,8 +40,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, adapted_from="intfloat/e5-mistral-7b-instruct", - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets={ # inherits from e5 "MSMARCO": ["train"], "MSMARCOHardNegatives": ["train"], @@ -73,7 +71,6 @@ def instruction_template( revision="938c560d1c236aa563b2dbdf084f28ab28bccb11", release_date="2024-01-24", # initial commit of hf model. n_parameters=7_110_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -81,8 +78,7 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets={ # inherits from e5 "MSMARCO": ["train"], "MSMARCOHardNegatives": ["train"], diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index f8b01c6eaf..fa48ae7ccc 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -101,7 +101,6 @@ revision="8b3219a92973c328a8e22fadcfa821b5dc75636a", release_date="2021-08-30", n_parameters=22_700_000, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=256, @@ -112,8 +111,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, - public_training_code=True, - public_training_data=True, + public_training_code=None, ) all_MiniLM_L12_v2 = ModelMeta( @@ -123,7 +121,6 @@ revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", release_date="2021-08-30", n_parameters=33_400_000, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=256, @@ -134,8 +131,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, - public_training_code=True, - public_training_data=True, + public_training_code=None, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -145,7 +141,6 @@ revision="bf3bf13ab40c3157080a7ab344c831b9ad18b5eb", release_date="2019-11-01", # release date of paper n_parameters=118_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -156,8 +151,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) - public_training_code=True, - public_training_data=True, + public_training_code=None, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -167,7 +161,6 @@ revision="79f2382ceacceacdf38563d7c5d16b9ff8d725d6", release_date="2019-11-01", # release date of paper n_parameters=278_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -189,8 +182,7 @@ # "flickr30k-captions": flickr_train_dataset, # "yahoo-answers": yahoo_answers_train_dataset, # "stack-exchange": stack_exchange_train_dataset, - public_training_code=True, - public_training_data=True, + public_training_code=None, ) labse = ModelMeta( @@ -200,7 +192,6 @@ revision="e34fab64a3011d2176c99545a93d5cbddc9a91b7", release_date="2019-11-01", # release date of paper n_parameters=471_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", 
max_tokens=512, @@ -211,8 +202,7 @@ superseded_by=None, adapted_from=None, training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf - public_training_code=True, # https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true - public_training_data=False, + public_training_code="https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true", ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -222,7 +212,6 @@ revision="b207367332321f8e44f96e224ef15bc607f4dbf0", release_date="2021-08-30", n_parameters=22_700_000, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=512, @@ -234,7 +223,6 @@ adapted_from="nreimers/MiniLM-L6-H384-uncased", training_datasets=sent_trf_training_dataset, # assumed public_training_code=None, - public_training_data=None, ) all_mpnet_base_v2 = ModelMeta( @@ -244,7 +232,6 @@ revision="9a3225965996d404b775526de6dbfe85d3368642", release_date="2021-08-30", n_parameters=109_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=384, @@ -255,8 +242,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, - public_training_code=True, - public_training_data=True, + public_training_code=None, ) @@ -267,7 +253,6 @@ revision="98f70f14cdf12d7ea217ed2fd4e808b0195f1e7e", release_date="2024-11-10", n_parameters=272_000_000, - memory_usage=None, embed_dim=1024, license="apache-2.0", max_tokens=2048, @@ -288,4 +273,5 @@ # "sentence-transformers/quora-duplicates": ["train"], # "sentence-transformers/natural-questions": ["train"], }, + public_training_code=None, ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 1e04b41167..44aa1f8604 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -29,8 +29,7 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, - public_training_data=False, # currently not released - public_training_code=False, + public_training_code=None, ) stella_en_1_5b = ModelMeta( @@ -56,8 +55,7 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", training_datasets=None, - public_training_data=False, # currently not released - public_training_code=False, + public_training_code=None, ) stella_large_zh_v3_1792d = ModelMeta( @@ -67,7 +65,6 @@ revision="d5d39eb8cd11c80a63df53314e59997074469f09", release_date="2024-02-17", n_parameters=None, # can't see on model card - memory_usage=None, embed_dim=1792, license="not specified", max_tokens=512, @@ -77,8 +74,7 @@ use_instructions=False, superseded_by="dunzhang/stella-mrl-large-zh-v3.5-1792d", adapted_from=None, - public_training_code=False, - public_training_data=True, + public_training_code=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -93,7 +89,6 @@ revision="82254892a0fba125aa2abf3a4800d2dd12821343", release_date="2024-02-17", n_parameters=None, # can't see on model card - memory_usage=None, embed_dim=1792, license="mit", max_tokens=512, @@ -103,8 +98,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, - public_training_data=True, + public_training_code=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -120,7 +114,6 @@ revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe", release_date="2024-02-27", n_parameters=326 * 1e6, - 
memory_usage=None, embed_dim=1792, license="mit", max_tokens=512, @@ -130,8 +123,7 @@ use_instructions=False, superseded_by=None, adapted_from="dunzhang/stella-large-zh-v3-1792d", - public_training_code=False, - public_training_data=True, + public_training_code=None, training_datasets=None, # Not specified ) @@ -142,7 +134,6 @@ revision="b1075144f440ab4409c05622c1179130ebd57d03", release_date="2024-06-04", n_parameters=326 * 1e6, - memory_usage=None, embed_dim=1792, license="mit", max_tokens=512, @@ -152,8 +143,7 @@ use_instructions=False, superseded_by=None, adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", - public_training_code=False, - public_training_data=True, + public_training_code=None, training_datasets={ # It's a bit unclear what they have trained on to be honest, because they don't list all # And they also have some rather cryptic description of their training procedure, but at diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py index e26108e0ae..12322e69e9 100644 --- a/mteb/models/text2vec_models.py +++ b/mteb/models/text2vec_models.py @@ -12,7 +12,6 @@ revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e", release_date="2022-01-23", n_parameters=102 * 1e6, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -22,8 +21,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Couldn't find it - public_training_data=True, + public_training_code=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -39,7 +37,6 @@ revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd", release_date="2023-06-19", n_parameters=118 * 1e6, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -49,8 +46,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Couldn't find it - public_training_data=True, + public_training_code=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -82,7 +78,6 @@ # So probably best not to. 
loader=None, n_parameters=118 * 1e6, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=256, @@ -92,8 +87,7 @@ use_instructions=False, superseded_by=None, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - public_training_code=False, # Couldn't find it - public_training_data=True, + public_training_code=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index ffdaa29f74..bd8be48693 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -82,6 +82,5 @@ def encode( "NLI": [], "SNLI": [], }, - public_training_data=True, - public_training_code=True, + public_training_code=None, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 12925b235b..a98bc041bc 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -151,15 +151,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/05/05/voyage-large-2-instruct-instruction-tuned-and-rank-1-on-mteb/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_finance_2 = ModelMeta( @@ -176,15 +174,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/06/03/domain-specific-embeddings-finance-edition-voyage-finance-2/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_law_2 = ModelMeta( @@ -201,15 +197,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/04/15/domain-specific-embeddings-and-retrieval-legal-edition-voyage-law-2/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_code_2 = ModelMeta( @@ -226,15 +220,13 @@ def _batched_encode( embed_dim=1536, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/01/23/voyage-code-2-elevate-your-code-retrieval/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_large_2 = ModelMeta( @@ -251,15 +243,13 @@ def _batched_encode( embed_dim=1536, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_2 = ModelMeta( @@ -276,15 +266,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, 
training_datasets=None, - public_training_data=False, - public_training_code=False, + public_training_code=None, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -300,15 +288,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_3 = ModelMeta( @@ -325,15 +311,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/09/18/voyage-3/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_3_lite = ModelMeta( @@ -350,13 +334,11 @@ def _batched_encode( embed_dim=512, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/09/18/voyage-3/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index a96604446e..a192fa1341 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -220,7 +220,6 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=card_data.get("license", None), framework=frameworks, n_parameters=n_parameters, - public_training_data=bool(datasets), adapted_from=get_base_model(model_name), training_datasets=training_datasets, open_weights=True, @@ -237,6 +236,16 @@ revision=None, languages=None, release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], ) diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index c540bb41ee..fb0cf6cf5a 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -6,8 +6,7 @@ from sentence_transformers import CrossEncoder, SentenceTransformer -from mteb import MTEB -from mteb.model_meta import ModelMeta +from mteb import MTEB, ModelMeta logging.basicConfig(level=logging.INFO) @@ -373,7 +372,18 @@ def test_reranker_same_ndcg1(): open_weights=True, revision=ce_revision, release_date="2021-04-15", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + reference=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) + eval = MTEB(tasks=["SciFact"]) eval.run( de,