diff --git a/mteb/models/model_implementations/align_models.py b/mteb/models/model_implementations/align_models.py index 62ee3a3ab8..e11e843844 100644 --- a/mteb/models/model_implementations/align_models.py +++ b/mteb/models/model_implementations/align_models.py @@ -105,6 +105,7 @@ def encode( align_base = ModelMeta( loader=ALIGNModel, name="kakaobrain/align-base", + model_type=["dense"], languages=["eng-Latn"], revision="e96a37facc7b1f59090ece82293226b817afd6ba", release_date="2023-02-24", diff --git a/mteb/models/model_implementations/amazon_models.py b/mteb/models/model_implementations/amazon_models.py index 10d31284b0..4513655967 100644 --- a/mteb/models/model_implementations/amazon_models.py +++ b/mteb/models/model_implementations/amazon_models.py @@ -3,6 +3,7 @@ amazon_titan_text_embeddings_v2 = ModelMeta( loader=None, name="amazon/Titan-text-embeddings-v2", + model_type=["dense"], revision="1", release_date="2024-04-30", languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/andersborges.py b/mteb/models/model_implementations/andersborges.py index 747fd1486f..fe28c61647 100644 --- a/mteb/models/model_implementations/andersborges.py +++ b/mteb/models/model_implementations/andersborges.py @@ -6,6 +6,7 @@ model2vecdk = ModelMeta( loader=Model2VecModel, # type: ignore name="andersborges/model2vecdk", + model_type=["dense"], languages=["dan-Latn"], open_weights=True, revision="cb576c78dcc1b729e4612645f61db59929d69e61", @@ -30,6 +31,7 @@ model2vecdk_stem = ModelMeta( loader=Model2VecModel, # type: ignore name="andersborges/model2vecdk-stem", + model_type=["dense"], languages=["dan-Latn"], open_weights=True, revision="cb576c78dcc1b729e4612645f61db59929d69e61", diff --git a/mteb/models/model_implementations/ara_models.py b/mteb/models/model_implementations/ara_models.py index c26adee70b..25455be347 100644 --- a/mteb/models/model_implementations/ara_models.py +++ b/mteb/models/model_implementations/ara_models.py @@ -4,6 +4,7 @@ 
arabic_triplet_matryoshka = ModelMeta( loader=sentence_transformers_loader, name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2", + model_type=["dense"], languages=["ara-Arab"], open_weights=True, revision="ed357f222f0b6ea6670d2c9b5a1cb93950d34200", diff --git a/mteb/models/model_implementations/arctic_models.py b/mteb/models/model_implementations/arctic_models.py index 1c9508f026..69032036f4 100644 --- a/mteb/models/model_implementations/arctic_models.py +++ b/mteb/models/model_implementations/arctic_models.py @@ -140,6 +140,7 @@ arctic_embed_xs = ModelMeta( loader=sentence_transformers_loader, name="Snowflake/snowflake-arctic-embed-xs", + model_type=["dense"], revision="742da4f66e1823b5b4dbe6c320a1375a1fd85f9e", release_date="2024-07-08", # initial commit of hf model. languages=["eng-Latn"], @@ -165,6 +166,7 @@ arctic_embed_s = ModelMeta( loader=sentence_transformers_loader, name="Snowflake/snowflake-arctic-embed-s", + model_type=["dense"], revision="d3c1d2d433dd0fdc8e9ca01331a5f225639e798f", release_date="2024-04-12", # initial commit of hf model. languages=["eng-Latn"], @@ -190,6 +192,7 @@ arctic_embed_m = ModelMeta( loader=sentence_transformers_loader, name="Snowflake/snowflake-arctic-embed-m", + model_type=["dense"], revision="cc17beacbac32366782584c8752220405a0f3f40", release_date="2024-04-12", # initial commit of hf model. languages=["eng-Latn"], @@ -215,6 +218,7 @@ loader=sentence_transformers_loader, loader_kwargs={"trust_remote_code": True}, name="Snowflake/snowflake-arctic-embed-m-long", + model_type=["dense"], revision="89d0f6ab196eead40b90cb6f9fefec01a908d2d1", release_date="2024-04-12", # initial commit of hf model. languages=["eng-Latn"], @@ -239,6 +243,7 @@ arctic_embed_l = ModelMeta( loader=sentence_transformers_loader, name="Snowflake/snowflake-arctic-embed-l", + model_type=["dense"], revision="9a9e5834d2e89cdd8bb72b64111dde496e4fe78c", release_date="2024-04-12", # initial commit of hf model. 
languages=["eng-Latn"], @@ -268,6 +273,7 @@ }, ), name="Snowflake/snowflake-arctic-embed-m-v1.5", + model_type=["dense"], revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", release_date="2024-07-08", # initial commit of hf model. languages=["eng-Latn"], @@ -293,6 +299,7 @@ loader=sentence_transformers_loader, loader_kwargs={"trust_remote_code": True}, name="Snowflake/snowflake-arctic-embed-m-v2.0", + model_type=["dense"], revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc", release_date="2024-12-04", # initial commit of hf model. languages=LANGUAGES_V2_0, @@ -317,6 +324,7 @@ arctic_embed_l_v2_0 = ModelMeta( loader=sentence_transformers_loader, name="Snowflake/snowflake-arctic-embed-l-v2.0", + model_type=["dense"], revision="edc2df7b6c25794b340229ca082e7c78782e6374", release_date="2024-12-04", # initial commit of hf model. languages=LANGUAGES_V2_0, diff --git a/mteb/models/model_implementations/b1ade_models.py b/mteb/models/model_implementations/b1ade_models.py index b6c71fd7ca..eeb499bd00 100644 --- a/mteb/models/model_implementations/b1ade_models.py +++ b/mteb/models/model_implementations/b1ade_models.py @@ -10,6 +10,7 @@ b1ade_embed = ModelMeta( loader=sentence_transformers_loader, name="w601sxs/b1ade-embed", + model_type=["dense"], languages=["eng-Latn"], revision="3bdac13927fdc888b903db93b2ffdbd90b295a69", open_weights=True, diff --git a/mteb/models/model_implementations/bedrock_models.py b/mteb/models/model_implementations/bedrock_models.py index 0e2d81afcc..80ae610d74 100644 --- a/mteb/models/model_implementations/bedrock_models.py +++ b/mteb/models/model_implementations/bedrock_models.py @@ -155,6 +155,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: amazon_titan_embed_text_v1 = ModelMeta( name="bedrock/amazon-titan-embed-text-v1", + model_type=["dense"], revision="1", release_date="2023-09-27", languages=None, # not specified @@ -181,6 +182,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: amazon_titan_embed_text_v2 = ModelMeta( 
name="bedrock/amazon-titan-embed-text-v2", + model_type=["dense"], revision="1", release_date="2024-04-30", languages=None, # not specified @@ -216,6 +218,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: model_prompts=cohere_model_prompts, ), name="bedrock/cohere-embed-english-v3", + model_type=["dense"], languages=["eng-Latn"], open_weights=False, reference="https://cohere.com/blog/introducing-embed-v3", @@ -243,6 +246,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: model_prompts=cohere_model_prompts, ), name="bedrock/cohere-embed-multilingual-v3", + model_type=["dense"], languages=cohere_supported_languages, open_weights=False, reference="https://cohere.com/blog/introducing-embed-v3", diff --git a/mteb/models/model_implementations/bge_models.py b/mteb/models/model_implementations/bge_models.py index fd99042d44..faa8da1848 100644 --- a/mteb/models/model_implementations/bge_models.py +++ b/mteb/models/model_implementations/bge_models.py @@ -319,6 +319,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-small-en-v1.5", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a", @@ -344,6 +345,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-base-en-v1.5", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a", @@ -369,6 +371,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-large-en-v1.5", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09", @@ -394,6 +397,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-small-zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="1d2363c5de6ce9ba9c890c8e23a4c72dce540ca8", @@ -419,6 +423,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-base-zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="0e5f83d4895db7955e4cb9ed37ab73f7ded339b6", @@ -444,6 +449,7 
@@ model_prompts=model_prompts_zh, ), name="BAAI/bge-large-zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="b5d9f5c027e87b6f0b6fa4b614f8f9cdc45ce0e8", @@ -469,6 +475,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-small-en", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="4778d71a06863076696b03fd2777eb118712cad8", @@ -494,6 +501,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-base-en", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="b737bf5dcc6ee8bdc530531266b4804a5d77b5d8", @@ -519,6 +527,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-large-en", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="abe7d9d814b775ca171121fb03f394dc42974275", @@ -545,6 +554,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-small-zh-v1.5", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="7999e1d3359715c523056ef9478215996d62a620", @@ -569,6 +579,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-base-zh-v1.5", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", @@ -593,6 +604,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-large-zh-v1.5", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", @@ -614,6 +626,7 @@ bge_m3 = ModelMeta( loader=sentence_transformers_loader, name="BAAI/bge-m3", + model_type=["dense"], languages=bgem3_languages, open_weights=True, revision="5617a9f61b028005a4858fdac845db406aefb181", @@ -692,6 +705,7 @@ bge_multilingual_gemma2 = ModelMeta( loader=sentence_transformers_loader, name="BAAI/bge-multilingual-gemma2", + model_type=["dense"], languages=[ "eng-Latn", "zho-Hans", @@ -727,6 +741,7 @@ bge_en_icl = ModelMeta( loader=sentence_transformers_loader, name="BAAI/bge-en-icl", + model_type=["dense"], languages=[ "eng-Latn", ], @@ -762,6 +777,7 @@ 
bge_m3_unsupervised = ModelMeta( loader=sentence_transformers_loader, name="BAAI/bge-m3-unsupervised", + model_type=["dense"], languages=bgem3_languages, open_weights=True, revision="46f03bc86361cf88102b0b517b36c8259f2946b1", @@ -782,6 +798,7 @@ manu__bge_m3_custom_fr = ModelMeta( name="manu/bge-m3-custom-fr", + model_type=["dense"], revision="ed3ef88678ba83ddf4c0fab71a93cb90d89a9078", release_date="2024-04-11", languages=None, diff --git a/mteb/models/model_implementations/bica_model.py b/mteb/models/model_implementations/bica_model.py index 15b9876529..17a1b6fe25 100644 --- a/mteb/models/model_implementations/bica_model.py +++ b/mteb/models/model_implementations/bica_model.py @@ -2,6 +2,7 @@ bica_base = ModelMeta( name="bisectgroup/BiCA-base", + model_type=["dense"], loader=sentence_transformers_loader, languages=["eng-Latn"], open_weights=True, diff --git a/mteb/models/model_implementations/blip2_models.py b/mteb/models/model_implementations/blip2_models.py index ad66d745c6..87347d7eb8 100644 --- a/mteb/models/model_implementations/blip2_models.py +++ b/mteb/models/model_implementations/blip2_models.py @@ -166,6 +166,7 @@ def encode( blip2_opt_2_7b = ModelMeta( loader=blip2_loader, name="Salesforce/blip2-opt-2.7b", + model_type=["dense"], languages=["eng-Latn"], revision="51572668da0eb669e01a189dc22abe6088589a24", release_date="2024-03-22", @@ -189,6 +190,7 @@ def encode( blip2_opt_6_7b_coco = ModelMeta( loader=blip2_loader, name="Salesforce/blip2-opt-6.7b-coco", + model_type=["dense"], languages=["eng-Latn"], revision="0d580de59320a25a4d2c386387bcef310d5f286e", release_date="2024-03-31", diff --git a/mteb/models/model_implementations/blip_models.py b/mteb/models/model_implementations/blip_models.py index ce68b80062..9c83129346 100644 --- a/mteb/models/model_implementations/blip_models.py +++ b/mteb/models/model_implementations/blip_models.py @@ -130,6 +130,7 @@ def encode( blip_image_captioning_large = ModelMeta( loader=BLIPModel, # type: ignore 
name="Salesforce/blip-image-captioning-large", + model_type=["dense"], languages=["eng-Latn"], revision="2227ac38c9f16105cb0412e7cab4759978a8fd90", release_date="2023-12-07", @@ -157,6 +158,7 @@ def encode( blip_image_captioning_base = ModelMeta( loader=BLIPModel, # type: ignore name="Salesforce/blip-image-captioning-base", + model_type=["dense"], languages=["eng-Latn"], revision="89b09ea1789f7addf2f6d6f0dfc4ce10ab58ef84", release_date="2023-08-01", @@ -185,6 +187,7 @@ def encode( blip_vqa_base = ModelMeta( loader=BLIPModel, # type: ignore name="Salesforce/blip-vqa-base", + model_type=["dense"], languages=["eng-Latn"], revision="c7df8e7cd7aa2ee9af18f56e2b29e59a92651b64", release_date="2023-12-07", @@ -211,6 +214,7 @@ def encode( blip_vqa_capfilt_large = ModelMeta( loader=BLIPModel, # type: ignore name="Salesforce/blip-vqa-capfilt-large", + model_type=["dense"], languages=["eng-Latn"], revision="e53f95265aeab69013fabb5380500ab984adbbb4", release_date="2023-01-22", @@ -237,6 +241,7 @@ def encode( blip_itm_base_coco = ModelMeta( loader=BLIPModel, # type: ignore name="Salesforce/blip-itm-base-coco", + model_type=["dense"], languages=["eng-Latn"], revision="7eaa90c11850c0b17fc38c6a11e7d88bd6ac231f", release_date="2023-08-01", @@ -263,6 +268,7 @@ def encode( blip_itm_large_coco = ModelMeta( loader=BLIPModel, # type: ignore name="Salesforce/blip-itm-large-coco", + model_type=["dense"], languages=["eng-Latn"], revision="fef05cafc05298067cbbca00b125749394a77a6f", release_date="2023-08-01", @@ -290,6 +296,7 @@ def encode( blip_itm_base_flickr = ModelMeta( loader=BLIPModel, # type: ignore name="Salesforce/blip-itm-base-flickr", + model_type=["dense"], languages=["eng-Latn"], revision="1de29e660d91ae1786c1876212ea805a22eab251", release_date="2023-08-01", @@ -317,6 +324,7 @@ def encode( blip_itm_large_flickr = ModelMeta( loader=BLIPModel, # type: ignore name="Salesforce/blip-itm-large-flickr", + model_type=["dense"], languages=["eng-Latn"], 
revision="bda12e6506758f54261b5ab174b2c55a3ba143fb", release_date="2023-08-01", diff --git a/mteb/models/model_implementations/bm25.py b/mteb/models/model_implementations/bm25.py index 32ae883955..b143a597ab 100644 --- a/mteb/models/model_implementations/bm25.py +++ b/mteb/models/model_implementations/bm25.py @@ -121,6 +121,7 @@ def encode(self, texts: list[str]): bm25_s = ModelMeta( loader=bm25_loader, name="bm25s", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="0_1_10", diff --git a/mteb/models/model_implementations/bmretriever_models.py b/mteb/models/model_implementations/bmretriever_models.py index e8cd6ae871..4ed6aa96c7 100644 --- a/mteb/models/model_implementations/bmretriever_models.py +++ b/mteb/models/model_implementations/bmretriever_models.py @@ -90,6 +90,7 @@ def __init__( apply_instruction_to_passages=True, ), name="BMRetriever/BMRetriever-410M", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="e3569bfbcfe3a1bc48c142e11a7b0f38e86065a3", @@ -119,6 +120,7 @@ def __init__( apply_instruction_to_passages=True, ), name="BMRetriever/BMRetriever-1B", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="1b758c5f4d3af48ef6035cc4088bdbcd7df43ca6", @@ -148,6 +150,7 @@ def __init__( apply_instruction_to_passages=True, ), name="BMRetriever/BMRetriever-2B", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="718179afd57926369c347f46eee616db81084941", @@ -177,6 +180,7 @@ def __init__( apply_instruction_to_passages=True, ), name="BMRetriever/BMRetriever-7B", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="13e6adb9273c5f254e037987d6b44e9e4b005b9a", diff --git a/mteb/models/model_implementations/cadet_models.py b/mteb/models/model_implementations/cadet_models.py index be27d9d9ed..753c5b4c05 100644 --- a/mteb/models/model_implementations/cadet_models.py +++ b/mteb/models/model_implementations/cadet_models.py @@ -35,6 +35,7 @@ }, ), 
name="manveertamber/cadet-embed-base-v1", + model_type=["dense"], languages=["eng-Latn"], revision="8056d118be37a566f20972a5f35cda815f6bc47e", open_weights=True, diff --git a/mteb/models/model_implementations/cde_models.py b/mteb/models/model_implementations/cde_models.py index 697927eeeb..4772204a82 100644 --- a/mteb/models/model_implementations/cde_models.py +++ b/mteb/models/model_implementations/cde_models.py @@ -209,6 +209,7 @@ def _load_task_sample( trust_remote_code=True, ), name="jxm/cde-small-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="e151df18af0d7f1d1c37b074fee58406ececf19f", @@ -237,6 +238,7 @@ def _load_task_sample( trust_remote_code=True, ), name="jxm/cde-small-v2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="4e1d021a6c3fd7ce8aa0a7204057eee5ae61d390", diff --git a/mteb/models/model_implementations/clip_models.py b/mteb/models/model_implementations/clip_models.py index 026f16a868..d4a0e14f07 100644 --- a/mteb/models/model_implementations/clip_models.py +++ b/mteb/models/model_implementations/clip_models.py @@ -117,6 +117,7 @@ def encode( clip_vit_large_patch14 = ModelMeta( loader=CLIPModel, # type: ignore name="openai/clip-vit-large-patch14", + model_type=["dense"], languages=["eng-Latn"], revision="32bd64288804d66eefd0ccbe215aa642df71cc41", release_date="2021-02-26", @@ -140,6 +141,7 @@ def encode( clip_vit_base_patch32 = ModelMeta( loader=CLIPModel, # type: ignore name="openai/clip-vit-base-patch32", + model_type=["dense"], languages=["eng-Latn"], revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268", release_date="2021-02-26", @@ -163,6 +165,7 @@ def encode( clip_vit_base_patch16 = ModelMeta( loader=CLIPModel, # type: ignore name="openai/clip-vit-base-patch16", + model_type=["dense"], languages=["eng-Latn"], revision="57c216476eefef5ab752ec549e440a49ae4ae5f3", release_date="2021-02-26", diff --git a/mteb/models/model_implementations/clips_models.py 
b/mteb/models/model_implementations/clips_models.py index 17073d05cd..580c5f0062 100644 --- a/mteb/models/model_implementations/clips_models.py +++ b/mteb/models/model_implementations/clips_models.py @@ -24,6 +24,7 @@ model_prompts=model_prompts, ), name="clips/e5-small-trm-nl", + model_type=["dense"], languages=["nld-Latn"], open_weights=True, revision="0243664a6c5e12eef854b091eb283e51833c3e9f", @@ -50,6 +51,7 @@ model_prompts=model_prompts, ), name="clips/e5-base-trm-nl", + model_type=["dense"], languages=["nld-Latn"], open_weights=True, revision="6bd5722f236da48b4b8bcb28cc1fc478f7089956", @@ -76,6 +78,7 @@ model_prompts=model_prompts, ), name="clips/e5-large-trm-nl", + model_type=["dense"], languages=["nld-Latn"], open_weights=True, revision="683333f86ed9eb3699b5567f0fdabeb958d412b0", diff --git a/mteb/models/model_implementations/codefuse_models.py b/mteb/models/model_implementations/codefuse_models.py index fc615415b2..c7dca5d80c 100644 --- a/mteb/models/model_implementations/codefuse_models.py +++ b/mteb/models/model_implementations/codefuse_models.py @@ -142,6 +142,7 @@ def instruction_template( max_seq_length=8192, ), name="codefuse-ai/F2LLM-0.6B", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="36416618b83d4bd84a8ca30c2ee01ed518f9f2e7", @@ -171,6 +172,7 @@ def instruction_template( max_seq_length=8192, ), name="codefuse-ai/F2LLM-1.7B", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="fdce0e09655f42cea26f7f66f5a70cd4507ea45c", @@ -200,6 +202,7 @@ def instruction_template( max_seq_length=8192, ), name="codefuse-ai/F2LLM-4B", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="9fe95901ed2b6b59dd7673d6e93c9d76766a1e25", diff --git a/mteb/models/model_implementations/codesage_models.py b/mteb/models/model_implementations/codesage_models.py index 07250163a5..8e4c8d80ee 100644 --- a/mteb/models/model_implementations/codesage_models.py +++ 
b/mteb/models/model_implementations/codesage_models.py @@ -22,6 +22,7 @@ codesage_large = ModelMeta( loader=sentence_transformers_loader, name="codesage/codesage-large-v2", + model_type=["dense"], languages=codesage_languages, revision="6e5d6dc15db3e310c37c6dbac072409f95ffa5c5", release_date="2024-02-03", @@ -48,6 +49,7 @@ codesage_base = ModelMeta( loader=sentence_transformers_loader, name="codesage/codesage-base-v2", + model_type=["dense"], languages=codesage_languages, revision="92eac4f44c8674638f039f1b0d8280f2539cb4c7", release_date="2024-02-03", @@ -74,6 +76,7 @@ codesage_small = ModelMeta( loader=sentence_transformers_loader, name="codesage/codesage-small-v2", + model_type=["dense"], languages=codesage_languages, revision="4844c2f24b25e181aa43ca058cc73dd2622565c1", release_date="2024-02-03", diff --git a/mteb/models/model_implementations/cohere_models.py b/mteb/models/model_implementations/cohere_models.py index 59441aefe9..8758086c7a 100644 --- a/mteb/models/model_implementations/cohere_models.py +++ b/mteb/models/model_implementations/cohere_models.py @@ -380,6 +380,7 @@ def encode( model_prompts=model_prompts, ), name="Cohere/Cohere-embed-multilingual-v3.0", + model_type=["dense"], languages=supported_languages, open_weights=False, revision="1", @@ -404,6 +405,7 @@ def encode( model_prompts=model_prompts, ), name="Cohere/Cohere-embed-english-v3.0", + model_type=["dense"], languages=["eng-Latn"], open_weights=False, reference="https://cohere.com/blog/introducing-embed-v3", @@ -428,6 +430,7 @@ def encode( model_prompts=model_prompts, ), name="Cohere/Cohere-embed-multilingual-light-v3.0", + model_type=["dense"], languages=supported_languages, open_weights=False, revision="1", @@ -452,6 +455,7 @@ def encode( model_prompts=model_prompts, ), name="Cohere/Cohere-embed-english-light-v3.0", + model_type=["dense"], languages=["eng-Latn"], open_weights=False, reference="https://cohere.com/blog/introducing-embed-v3", diff --git 
a/mteb/models/model_implementations/cohere_v.py b/mteb/models/model_implementations/cohere_v.py index e6b5e3aea3..ff22c79ee3 100644 --- a/mteb/models/model_implementations/cohere_v.py +++ b/mteb/models/model_implementations/cohere_v.py @@ -381,6 +381,7 @@ def encode( loader=cohere_v_loader, # type: ignore loader_kwargs={"model_name": "embed-multilingual-v3.0"}, name="cohere/embed-multilingual-v3.0", + model_type=["dense"], languages=[], # Unknown, but support >100 languages revision="1", release_date="2024-10-24", @@ -404,6 +405,7 @@ def encode( loader=cohere_v_loader, # type: ignore loader_kwargs={"model_name": "embed-english-v3.0"}, name="cohere/embed-english-v3.0", + model_type=["dense"], languages=["eng-Latn"], revision="1", release_date="2024-10-24", @@ -426,6 +428,7 @@ def encode( cohere_embed_v4_multimodal = ModelMeta( loader=cohere_v_loader, loader_kwargs=dict(model_name="embed-v4.0"), + model_type=["dense"], name="Cohere/Cohere-embed-v4.0", languages=all_languages, revision="1", @@ -450,6 +453,7 @@ def encode( loader=cohere_v_loader, loader_kwargs=dict(embedding_type="binary"), name="Cohere/Cohere-embed-v4.0 (output_dtype=binary)", + model_type=["dense"], languages=all_languages, revision="1", release_date="2024-12-01", @@ -474,6 +478,7 @@ def encode( loader=cohere_v_loader, loader_kwargs=dict(embedding_type="int8"), name="Cohere/Cohere-embed-v4.0 (output_dtype=int8)", + model_type=["dense"], languages=all_languages, revision="1", release_date="2024-12-01", diff --git a/mteb/models/model_implementations/colpali_models.py b/mteb/models/model_implementations/colpali_models.py index e8a52ffe29..71ff3b8c93 100644 --- a/mteb/models/model_implementations/colpali_models.py +++ b/mteb/models/model_implementations/colpali_models.py @@ -213,6 +213,7 @@ def __init__( torch_dtype=torch.float16, ), name="vidore/colpali-v1.1", + model_type=["late-interaction"], languages=["eng-Latn"], revision="a0f15e3bcf97110e7ac1bb4be4bcd30eeb31992a", release_date="2024-08-21", @@ 
-239,6 +240,7 @@ def __init__( torch_dtype=torch.float16, ), name="vidore/colpali-v1.2", + model_type=["late-interaction"], languages=["eng-Latn"], revision="6b89bc63c16809af4d111bfe412e2ac6bc3c9451", release_date="2024-08-26", @@ -265,6 +267,7 @@ def __init__( torch_dtype=torch.float16, ), name="vidore/colpali-v1.3", + model_type=["late-interaction"], languages=["eng-Latn"], revision="1b5c8929330df1a66de441a9b5409a878f0de5b0", release_date="2024-11-01", diff --git a/mteb/models/model_implementations/colqwen_models.py b/mteb/models/model_implementations/colqwen_models.py index a8a81bf25a..31254b0c93 100644 --- a/mteb/models/model_implementations/colqwen_models.py +++ b/mteb/models/model_implementations/colqwen_models.py @@ -213,6 +213,7 @@ def similarity(self, a, b): torch_dtype=torch.float16, ), name="vidore/colqwen2-v1.0", + model_type=["late-interaction"], languages=["eng-Latn"], revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", release_date="2025-11-03", @@ -239,6 +240,7 @@ def similarity(self, a, b): torch_dtype=torch.float16, ), name="vidore/colqwen2.5-v0.2", + model_type=["late-interaction"], languages=["eng-Latn"], revision="6f6fcdfd1a114dfe365f529701b33d66b9349014", release_date="2025-01-31", @@ -282,6 +284,7 @@ def similarity(self, a, b): colqwen3_8b = ModelMeta( loader=ColQwen3Wrapper, name="TomoroAI/tomoro-colqwen3-embed-8b", + model_type=["late-interaction"], languages=["eng-Latn"], revision="0b9fe28142910e209bbac15b1efe85507c27644f", release_date="2025-11-26", @@ -305,6 +308,7 @@ def similarity(self, a, b): colqwen3_4b = ModelMeta( loader=ColQwen3Wrapper, name="TomoroAI/tomoro-colqwen3-embed-4b", + model_type=["late-interaction"], languages=["eng-Latn"], revision="6a32fb68598730bf5620fbf18d832c784235c59c", release_date="2025-11-26", @@ -331,6 +335,7 @@ def similarity(self, a, b): torch_dtype=torch.float16, ), name="nomic-ai/colnomic-embed-multimodal-7b", + model_type=["late-interaction"], languages=["eng-Latn"], 
revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", release_date="2025-03-31", @@ -375,6 +380,7 @@ def similarity(self, a, b): torch_dtype=torch.float16, attn_implementation="flash_attention_2" ), name="nomic-ai/colnomic-embed-multimodal-3b", + model_type=["late-interaction"], languages=COLNOMIC_LANGUAGES, revision="86627b4a9b0cade577851a70afa469084f9863a4", release_date="2025-03-31", @@ -401,6 +407,7 @@ def similarity(self, a, b): torch_dtype=torch.float16, ), name="nomic-ai/colnomic-embed-multimodal-7b", + model_type=["late-interaction"], languages=COLNOMIC_LANGUAGES, revision="09dbc9502b66605d5be56d2226019b49c9fd3293", release_date="2025-03-31", @@ -438,6 +445,7 @@ def similarity(self, a, b): torch_dtype=torch.float16, attn_implementation="flash_attention_2" ), name="ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-3B-v1", + model_type=["late-interaction"], languages=["eng-Latn"], revision="aeacaa2775f2758d82721eb1cf2f5daf1a392da9", release_date="2025-11-04", @@ -463,6 +471,7 @@ def similarity(self, a, b): torch_dtype=torch.float16, attn_implementation="flash_attention_2" ), name="ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-7B-v1", + model_type=["late-interaction"], languages=["eng-Latn"], revision="8952ac6ee0e7de2e9211b165921518caf9202110", release_date="2025-11-04", diff --git a/mteb/models/model_implementations/colsmol_models.py b/mteb/models/model_implementations/colsmol_models.py index 04b77c6456..f249915e5c 100644 --- a/mteb/models/model_implementations/colsmol_models.py +++ b/mteb/models/model_implementations/colsmol_models.py @@ -54,6 +54,7 @@ def __init__( torch_dtype=torch.float16, ), name="vidore/colSmol-256M", + model_type=["late-interaction"], languages=["eng-Latn"], revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f", release_date="2025-01-22", @@ -80,6 +81,7 @@ def __init__( torch_dtype=torch.float16, attn_implementation="flash_attention_2" ), name="vidore/colSmol-500M", + model_type=["late-interaction"], languages=["eng-Latn"], 
revision="1aa9325cba7ed2b3b9b97ede4d55026322504902", release_date="2025-01-22", diff --git a/mteb/models/model_implementations/conan_models.py b/mteb/models/model_implementations/conan_models.py index e9f5be1d92..0f1472cf69 100644 --- a/mteb/models/model_implementations/conan_models.py +++ b/mteb/models/model_implementations/conan_models.py @@ -190,6 +190,7 @@ def encode( Conan_embedding_v2 = ModelMeta( name="TencentBAC/Conan-embedding-v2", + model_type=["dense"], revision="e5c87c63889630bca87486f6a2645ed97c5ddb17", release_date="2025-04-10", languages=[ diff --git a/mteb/models/model_implementations/dino_models.py b/mteb/models/model_implementations/dino_models.py index a7ae824951..f122da25b2 100644 --- a/mteb/models/model_implementations/dino_models.py +++ b/mteb/models/model_implementations/dino_models.py @@ -106,6 +106,7 @@ def encode( dinov2_small = ModelMeta( loader=DINOModel, # type: ignore name="facebook/dinov2-small", + model_type=["dense"], languages=["eng-Latn"], revision="ed25f3a31f01632728cabb09d1542f84ab7b0056", release_date="2023-07-18", @@ -128,6 +129,7 @@ def encode( dinov2_base = ModelMeta( loader=DINOModel, # type: ignore name="facebook/dinov2-base", + model_type=["dense"], languages=["eng-Latn"], revision="f9e44c814b77203eaa57a6bdbbd535f21ede1415", release_date="2023-07-18", @@ -150,6 +152,7 @@ def encode( dinov2_large = ModelMeta( loader=DINOModel, # type: ignore name="facebook/dinov2-large", + model_type=["dense"], languages=["eng-Latn"], revision="47b73eefe95e8d44ec3623f8890bd894b6ea2d6c", release_date="2023-07-18", @@ -172,6 +175,7 @@ def encode( dinov2_giant = ModelMeta( loader=DINOModel, # type: ignore name="facebook/dinov2-giant", + model_type=["dense"], languages=["eng-Latn"], revision="611a9d42f2335e0f921f1e313ad3c1b7178d206d", release_date="2023-07-18", @@ -198,6 +202,7 @@ def encode( webssl_dino300m_full2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino300m-full2b-224", + model_type=["dense"], languages=["eng-Latn"], 
revision="8529cdb3fb75014932af3b896455fc21c386168e", release_date="2025-04-24", @@ -220,6 +225,7 @@ def encode( webssl_dino1b_full2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino1b-full2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="d3bf033d9c8cc62ea9e73c40956642cad2ec568a", release_date="2025-04-24", @@ -242,6 +248,7 @@ def encode( webssl_dino2b_full2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino2b-full2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="cd5893e3fd2e988eb716792049b3dd53b3f1b68b", release_date="2025-04-24", @@ -264,6 +271,7 @@ def encode( webssl_dino3b_full2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino3b-full2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="2d015c340b16bc47bc6557fcb4e6c83a9d4aa1d3", release_date="2025-04-24", @@ -286,6 +294,7 @@ def encode( webssl_dino5b_full2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino5b-full2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="88006b18b9af369f6c611db7a64d908bde3714e0", release_date="2025-04-24", @@ -308,6 +317,7 @@ def encode( webssl_dino7b_full8b_224 = ModelMeta( loader=DINOModel, name="facebook/webssl-dino7b-full8b-224", + model_type=["dense"], languages=["eng-Latn"], revision="c6085463ea680043042a80c6d41db2c65e85f466", release_date="2025-04-24", @@ -330,6 +340,7 @@ def encode( webssl_dino7b_full8b_378 = ModelMeta( loader=DINOModel, name="facebook/webssl-dino7b-full8b-378", + model_type=["dense"], languages=["eng-Latn"], revision="53c8c5b43070bd2ddb3f66161140408ce832301f", release_date="2025-04-24", @@ -352,6 +363,7 @@ def encode( webssl_dino7b_full8b_518 = ModelMeta( loader=DINOModel, name="facebook/webssl-dino7b-full8b-518", + model_type=["dense"], languages=["eng-Latn"], revision="aee350d2c5e3e5fdb7ee6985291d808ea5eef431", release_date="2025-04-24", @@ -375,6 +387,7 @@ def encode( webssl_dino2b_light2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino2b-light2b-224", + 
model_type=["dense"], languages=["eng-Latn"], revision="633a663f304e63cc3cbec3f7f9ca2fbc94736128", release_date="2025-04-24", @@ -397,6 +410,7 @@ def encode( webssl_dino2b_heavy2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino2b-heavy2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="9f46eb0c0129656a1ef195fde072e3765abdb7c6", release_date="2025-04-24", @@ -419,6 +433,7 @@ def encode( webssl_dino3b_light2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino3b-light2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="4d0160f60673805431f4ad14983e712ed88be5b8", release_date="2025-04-24", @@ -441,6 +456,7 @@ def encode( webssl_dino3b_heavy2b = ModelMeta( loader=DINOModel, name="facebook/webssl-dino3b-heavy2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="dd39c2910747561b332285d96c4dce0bdb240775", release_date="2025-04-24", @@ -463,6 +479,7 @@ def encode( webssl_mae300m_full2b = ModelMeta( loader=DINOModel, name="facebook/webssl-mae300m-full2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="4655a0ac1726c206ba14d5ccb26758c62a4d03b0", release_date="2025-04-24", @@ -485,6 +502,7 @@ def encode( webssl_mae700m_full2b = ModelMeta( loader=DINOModel, name="facebook/webssl-mae700m-full2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="c32be382e757d73a178de1ead62c27391d4b4280", release_date="2025-04-24", @@ -507,6 +525,7 @@ def encode( webssl_mae1b_full2b = ModelMeta( loader=DINOModel, name="facebook/webssl-mae1b-full2b-224", + model_type=["dense"], languages=["eng-Latn"], revision="5880aefedbad8db0f44d27358f6f08e8576f70fc", release_date="2025-04-24", diff --git a/mteb/models/model_implementations/e5_instruct.py b/mteb/models/model_implementations/e5_instruct.py index 6ff8c078aa..2bef567c59 100644 --- a/mteb/models/model_implementations/e5_instruct.py +++ b/mteb/models/model_implementations/e5_instruct.py @@ -40,6 +40,7 @@ normalized=True, ), 
name="intfloat/multilingual-e5-large-instruct", + model_type=["dense"], languages=XLMR_LANGUAGES, open_weights=True, revision="baa7be480a7de1539afce709c8f13f833a510e0a", @@ -78,6 +79,7 @@ normalized=True, ), name="intfloat/e5-mistral-7b-instruct", + model_type=["dense"], languages=MISTRAL_LANGUAGES, open_weights=True, revision="07163b72af1488142a360786df853f237b1a3ca1", @@ -125,6 +127,7 @@ normalized=True, ), name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + model_type=["dense"], revision="c791d37474fa6a5c72eb3a2522be346bc21fbfc3", release_date="2024-08-30", languages=["eng-Latn"], @@ -201,6 +204,7 @@ tokenizer_kwargs={"pad_token": ""}, ), name="BeastyZ/e5-R-mistral-7b", + model_type=["dense"], revision="3f810a6a7fd220369ad248e3705cf13d71803602", release_date="2024-06-28", languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/e5_models.py b/mteb/models/model_implementations/e5_models.py index c52d4c44ca..74ca25c585 100644 --- a/mteb/models/model_implementations/e5_models.py +++ b/mteb/models/model_implementations/e5_models.py @@ -70,6 +70,7 @@ model_prompts=model_prompts, ), name="intfloat/multilingual-e5-small", + model_type=["dense"], languages=XLMR_LANGUAGES, open_weights=True, revision="fd1525a9fd15316a2d503bf26ab031a61d056e98", @@ -96,6 +97,7 @@ model_prompts=model_prompts, ), name="intfloat/multilingual-e5-base", + model_type=["dense"], languages=XLMR_LANGUAGES, open_weights=True, revision="d13f1b27baf31030b7fd040960d60d909913633f", @@ -122,6 +124,7 @@ model_prompts=model_prompts, ), name="intfloat/multilingual-e5-large", + model_type=["dense"], languages=XLMR_LANGUAGES, open_weights=True, revision="ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb", @@ -148,6 +151,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-small-v2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="dca8b1a9dae0d4575df2bf423a5edb485a431236", @@ -174,6 +178,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-small", + model_type=["dense"], 
languages=["eng-Latn"], open_weights=True, revision="e272f3049e853b47cb5ca3952268c6662abda68f", @@ -200,6 +205,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-base-v2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="1c644c92ad3ba1efdad3f1451a637716616a20e8", @@ -227,6 +233,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-large-v2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="b322e09026e4ea05f42beadf4d661fb4e101d311", @@ -254,6 +261,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-large", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="4dc6d853a804b9c8886ede6dda8a073b7dc08a81", @@ -281,6 +289,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-base", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="b533fe4636f4a2507c08ddab40644d20b0006d6a", diff --git a/mteb/models/model_implementations/e5_v.py b/mteb/models/model_implementations/e5_v.py index cc1a99f4b8..634b90cef6 100644 --- a/mteb/models/model_implementations/e5_v.py +++ b/mteb/models/model_implementations/e5_v.py @@ -160,6 +160,7 @@ def encode( device_map="auto", ), name="royokong/e5-v", + model_type=["dense"], languages=["eng-Latn"], revision="0c1f22679417b3ae925d779442221c40cd1861ab", release_date="2024-07-17", diff --git a/mteb/models/model_implementations/eagerworks_models.py b/mteb/models/model_implementations/eagerworks_models.py index b6be00ec24..fbdee8631a 100644 --- a/mteb/models/model_implementations/eagerworks_models.py +++ b/mteb/models/model_implementations/eagerworks_models.py @@ -141,6 +141,7 @@ def encode( image_size=784, ), name="eagerworks/eager-embed-v1", + model_type=["dense"], languages=["fra-Latn", "spa-Latn", "eng-Latn", "deu-Latn"], revision="a6bec272729c5056e2c26618ce085205c82a3b3c", release_date="2025-11-20", diff --git a/mteb/models/model_implementations/emillykkejensen_models.py b/mteb/models/model_implementations/emillykkejensen_models.py index 
212a5b4a96..11ae32026e 100644 --- a/mteb/models/model_implementations/emillykkejensen_models.py +++ b/mteb/models/model_implementations/emillykkejensen_models.py @@ -4,6 +4,7 @@ embedding_gemma_300m_scandi = ModelMeta( loader=sentence_transformers_loader, # type: ignore name="emillykkejensen/EmbeddingGemma-Scandi-300m", + model_type=["dense"], languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], open_weights=True, revision="9f3307b9f601db564a9190cb475324d128dcfe86", @@ -27,6 +28,7 @@ qwen_scandi = ModelMeta( loader=sentence_transformers_loader, # type: ignore name="emillykkejensen/Qwen3-Embedding-Scandi-0.6B", + model_type=["dense"], languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], open_weights=True, revision="cf1e7ba36ebd3d605549d8f02930a18e17b54513", @@ -50,6 +52,7 @@ mmbert_scandi = ModelMeta( loader=sentence_transformers_loader, # type: ignore name="emillykkejensen/mmBERTscandi-base-embedding", + model_type=["dense"], languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"], open_weights=True, revision="82d74c7a5d8e1ddf31b132865df2d16b2b0294ee", diff --git a/mteb/models/model_implementations/en_code_retriever.py b/mteb/models/model_implementations/en_code_retriever.py index 399f2b5763..8418faa8e2 100644 --- a/mteb/models/model_implementations/en_code_retriever.py +++ b/mteb/models/model_implementations/en_code_retriever.py @@ -12,6 +12,7 @@ }, ), name="fyaronskiy/english_code_retriever", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="be653fab7d27a7348a0c2c3d16b9f92a7f10cb0c", diff --git a/mteb/models/model_implementations/euler_models.py b/mteb/models/model_implementations/euler_models.py index 35fc4718be..1a0aa17ac8 100644 --- a/mteb/models/model_implementations/euler_models.py +++ b/mteb/models/model_implementations/euler_models.py @@ -4,6 +4,7 @@ Euler_Legal_Embedding_V1 = ModelMeta( loader=sentence_transformers_loader, name="Mira190/Euler-Legal-Embedding-V1", + 
model_type=["dense"], revision="df607ed9e25e569514a99c27cdaaab16e76b6dd4", release_date="2025-11-06", languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/evaclip_models.py b/mteb/models/model_implementations/evaclip_models.py index b88a4ced18..ab61db6e96 100644 --- a/mteb/models/model_implementations/evaclip_models.py +++ b/mteb/models/model_implementations/evaclip_models.py @@ -138,6 +138,7 @@ def encode( EVA02_CLIP_B_16 = ModelMeta( loader=evaclip_loader, name="QuanSun/EVA02-CLIP-B-16", + model_type=["dense"], languages=["eng-Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", @@ -161,6 +162,7 @@ def encode( EVA02_CLIP_L_14 = ModelMeta( loader=evaclip_loader, name="QuanSun/EVA02-CLIP-L-14", + model_type=["dense"], languages=["eng-Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", @@ -184,6 +186,7 @@ def encode( EVA02_CLIP_bigE_14 = ModelMeta( loader=evaclip_loader, name="QuanSun/EVA02-CLIP-bigE-14", + model_type=["dense"], languages=["eng-Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", @@ -208,6 +211,7 @@ def encode( EVA02_CLIP_bigE_14_plus = ModelMeta( loader=evaclip_loader, name="QuanSun/EVA02-CLIP-bigE-14-plus", + model_type=["dense"], languages=["eng-Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", diff --git a/mteb/models/model_implementations/fa_models.py b/mteb/models/model_implementations/fa_models.py index 415becd890..47e0410339 100644 --- a/mteb/models/model_implementations/fa_models.py +++ b/mteb/models/model_implementations/fa_models.py @@ -6,6 +6,7 @@ parsbert = ModelMeta( loader=sentence_transformers_loader, name="HooshvareLab/bert-base-parsbert-uncased", + model_type=["dense"], languages=["fas-Arab"], open_weights=True, revision="d73a0e2c7492c33bd5819bcdb23eba207404dd19", @@ -41,6 +42,7 @@ bert_zwnj = ModelMeta( loader=sentence_transformers_loader, 
name="m3hrdadfi/bert-zwnj-wnli-mean-tokens", + model_type=["dense"], languages=["fas-Arab"], open_weights=True, revision="b9506ddc579ac8c398ae6dae680401ae0a1a5b23", @@ -66,6 +68,7 @@ roberta_zwnj = ModelMeta( loader=sentence_transformers_loader, name="m3hrdadfi/roberta-zwnj-wnli-mean-tokens", + model_type=["dense"], languages=["fas-Arab"], open_weights=True, revision="36f912ac44e22250aee16ea533a4ff8cd848c1a1", @@ -90,6 +93,7 @@ sentence_transformer_parsbert = ModelMeta( loader=sentence_transformers_loader, name="myrkur/sentence-transformer-parsbert-fa", + model_type=["dense"], languages=["fas-Arab"], open_weights=True, revision="72bd0a3557622f0ae08a092f4643609e0b950cdd", @@ -140,6 +144,7 @@ tooka_sbert = ModelMeta( loader=sentence_transformers_loader, name="PartAI/Tooka-SBERT", + model_type=["dense"], languages=["fas-Arab"], open_weights=True, revision="5d07f0c543aca654373b931ae07cd197769110fd", @@ -161,6 +166,7 @@ fa_bert = ModelMeta( loader=sentence_transformers_loader, name="sbunlp/fabert", + model_type=["dense"], languages=["fas-Arab"], open_weights=True, revision="a0e3973064c97768e121b9b95f21adc94e0ca3fb", @@ -185,6 +191,7 @@ tooka_sbert_v2_small = ModelMeta( loader=sentence_transformers_loader, name="PartAI/Tooka-SBERT-V2-Small", + model_type=["dense"], languages=["fas-Arab"], open_weights=True, revision="8bbed87e36669387f71437c061430ba56d1b496f", @@ -206,6 +213,7 @@ tooka_sbert_v2_large = ModelMeta( loader=sentence_transformers_loader, name="PartAI/Tooka-SBERT-V2-Large", + model_type=["dense"], languages=["fas-Arab"], open_weights=True, revision="b59682efa961122cc0e4408296d5852870c82eae", diff --git a/mteb/models/model_implementations/facebookai.py b/mteb/models/model_implementations/facebookai.py index 4347efb096..3e20909dd7 100644 --- a/mteb/models/model_implementations/facebookai.py +++ b/mteb/models/model_implementations/facebookai.py @@ -107,6 +107,7 @@ xlmr_base = ModelMeta( loader=sentence_transformers_loader, # type: ignore[arg-type] 
name="FacebookAI/xlm-roberta-base", + model_type=["dense"], languages=XLMR_LANGUAGES, open_weights=True, revision="e73636d4f797dec63c3081bb6ed5c7b0bb3f2089", @@ -128,6 +129,7 @@ xlmr_large = ModelMeta( loader=sentence_transformers_loader, # type: ignore[arg-type] name="FacebookAI/xlm-roberta-large", + model_type=["dense"], languages=XLMR_LANGUAGES, open_weights=True, revision="c23d21b0620b635a76227c604d44e43a9f0ee389", diff --git a/mteb/models/model_implementations/geogpt_models.py b/mteb/models/model_implementations/geogpt_models.py index 7f946aed38..b61eedbe24 100644 --- a/mteb/models/model_implementations/geogpt_models.py +++ b/mteb/models/model_implementations/geogpt_models.py @@ -7,6 +7,7 @@ geoembedding = ModelMeta( name="GeoGPT-Research-Project/GeoEmbedding", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="29803c28ea7ef6871194a8ebc85ad7bfe174928e", diff --git a/mteb/models/model_implementations/gme_v_models.py b/mteb/models/model_implementations/gme_v_models.py index 81ebd54e67..6db911f298 100644 --- a/mteb/models/model_implementations/gme_v_models.py +++ b/mteb/models/model_implementations/gme_v_models.py @@ -346,6 +346,7 @@ def fetch_image(image: Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Im gme_qwen2vl_2b = ModelMeta( loader=GmeQwen2VL, name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + model_type=["dense"], languages=["eng-Latn", "cmn-Hans"], open_weights=True, revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a", @@ -369,6 +370,7 @@ def fetch_image(image: Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Im gme_qwen2vl_7b = ModelMeta( loader=GmeQwen2VL, name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", + model_type=["dense"], languages=["eng-Latn", "cmn-Hans"], open_weights=True, revision="477027a6480f8630363be77751f169cc3434b673", diff --git a/mteb/models/model_implementations/google_models.py b/mteb/models/model_implementations/google_models.py index bdfdb00074..c9836bd4e8 100644 --- 
a/mteb/models/model_implementations/google_models.py +++ b/mteb/models/model_implementations/google_models.py @@ -150,6 +150,7 @@ def encode( model_prompts=MODEL_PROMPTS, ), name="google/text-embedding-004", + model_type=["dense"], languages=["eng-Latn"], open_weights=False, revision="1", # revision is intended for implementation @@ -174,6 +175,7 @@ def encode( model_prompts=MODEL_PROMPTS, ), name="google/text-embedding-005", + model_type=["dense"], languages=["eng-Latn"], open_weights=False, revision="1", # revision is intended for implementation @@ -198,6 +200,7 @@ def encode( model_prompts=MODEL_PROMPTS, ), name="google/text-multilingual-embedding-002", + model_type=["dense"], languages=MULTILINGUAL_EVALUATED_LANGUAGES, # From the list of evaluated languages in https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api#supported_text_languages open_weights=False, revision="1", @@ -222,6 +225,7 @@ def encode( model_prompts=MODEL_PROMPTS, ), name="google/gemini-embedding-001", + model_type=["dense"], languages=MULTILINGUAL_EVALUATED_LANGUAGES, open_weights=False, revision="1", @@ -256,6 +260,7 @@ def gemma_embedding_loader(model_name: str, revision: str, **kwargs): embedding_gemma_300m = ModelMeta( loader=gemma_embedding_loader, name="google/embeddinggemma-300m", + model_type=["dense"], languages=MULTILINGUAL_EVALUATED_LANGUAGES, open_weights=True, revision="64614b0b8b64f0c6c1e52b07e4e9a4e8fe4d2da2", diff --git a/mteb/models/model_implementations/granite_vision_embedding_models.py b/mteb/models/model_implementations/granite_vision_embedding_models.py index f39865db56..9cdc2a753a 100644 --- a/mteb/models/model_implementations/granite_vision_embedding_models.py +++ b/mteb/models/model_implementations/granite_vision_embedding_models.py @@ -166,6 +166,7 @@ def similarity(self, a, b): torch_dtype=torch.float16, ), name="ibm-granite/granite-vision-3.3-2b-embedding", + model_type=["dense"], languages=["eng-Latn"], 
revision="cee615db64d89d1552a4ee39c50f25c0fc5c66ca", release_date="2025-06-11", diff --git a/mteb/models/model_implementations/gritlm_models.py b/mteb/models/model_implementations/gritlm_models.py index 8d483b4fa8..08d9801d9d 100644 --- a/mteb/models/model_implementations/gritlm_models.py +++ b/mteb/models/model_implementations/gritlm_models.py @@ -38,6 +38,7 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: torch_dtype="auto", ), name="GritLM/GritLM-7B", + model_type=["dense"], languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"], open_weights=True, revision="13f00a0e36500c80ce12870ea513846a066004af", @@ -66,6 +67,7 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: torch_dtype="auto", ), name="GritLM/GritLM-8x7B", + model_type=["dense"], languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"], open_weights=True, revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", diff --git a/mteb/models/model_implementations/gte_models.py b/mteb/models/model_implementations/gte_models.py index ece89d895b..0a6a730aba 100644 --- a/mteb/models/model_implementations/gte_models.py +++ b/mteb/models/model_implementations/gte_models.py @@ -42,6 +42,7 @@ def instruction_template( embed_eos="<|endoftext|>", ), name="Alibaba-NLP/gte-Qwen2-7B-instruct", + model_type=["dense"], languages=None, open_weights=True, revision="e26182b2122f4435e8b3ebecbf363990f409b45b", @@ -73,6 +74,7 @@ def instruction_template( embed_eos="<|endoftext|>", ), name="Alibaba-NLP/gte-Qwen1.5-7B-instruct", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="07d27e5226328010336563bc1b564a5e3436a298", @@ -103,6 +105,7 @@ def instruction_template( embed_eos="<|endoftext|>", ), name="Alibaba-NLP/gte-Qwen2-1.5B-instruct", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="c6c1b92f4a3e1b92b326ad29dd3c8433457df8dd", @@ -124,6 +127,7 @@ def instruction_template( gte_small_zh = ModelMeta( 
loader=sentence_transformers_loader, name="thenlper/gte-small-zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", @@ -145,6 +149,7 @@ def instruction_template( gte_base_zh = ModelMeta( loader=sentence_transformers_loader, name="thenlper/gte-base-zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", @@ -166,6 +171,7 @@ def instruction_template( gte_large_zh = ModelMeta( loader=sentence_transformers_loader, name="thenlper/gte-large-zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="64c364e579de308104a9b2c170ca009502f4f545", @@ -288,6 +294,7 @@ def instruction_template( gte_multilingual_base = ModelMeta( loader=sentence_transformers_loader, name="Alibaba-NLP/gte-multilingual-base", + model_type=["dense"], languages=gte_multilingual_langs, open_weights=True, revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", @@ -309,6 +316,7 @@ def instruction_template( gte_modernbert_base = ModelMeta( loader=sentence_transformers_loader, name="Alibaba-NLP/gte-modernbert-base", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="7ca8b4ca700621b67618669f5378fe5f5820b8e4", @@ -331,6 +339,7 @@ def instruction_template( gte_base_en_v15 = ModelMeta( loader=sentence_transformers_loader, name="Alibaba-NLP/gte-base-en-v1.5", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="a829fd0e060bb84554da0dfd354d0de0f7712b7f", # can be any diff --git a/mteb/models/model_implementations/hinvec_models.py b/mteb/models/model_implementations/hinvec_models.py index f0529189cc..3ee30b79ae 100644 --- a/mteb/models/model_implementations/hinvec_models.py +++ b/mteb/models/model_implementations/hinvec_models.py @@ -37,6 +37,7 @@ def instruction_template( add_eos_token=True, ), name="Sailesh97/Hinvec", + model_type=["dense"], languages=["eng-Latn", "hin-Deva"], open_weights=True, 
revision="d4fc678720cc1b8c5d18599ce2d9a4d6090c8b6b", diff --git a/mteb/models/model_implementations/human.py b/mteb/models/model_implementations/human.py index b9222d0ee9..8848acfde7 100644 --- a/mteb/models/model_implementations/human.py +++ b/mteb/models/model_implementations/human.py @@ -3,6 +3,7 @@ human = ModelMeta( loader=None, name="Human", + model_type=["dense"], languages=["eng-Latn", "ara-Arab", "rus-Cyrl", "dan-Latn", "nob-Latn"], open_weights=True, revision="2025_09_25", diff --git a/mteb/models/model_implementations/ibm_granite_models.py b/mteb/models/model_implementations/ibm_granite_models.py index 279d97ae30..b815608f93 100644 --- a/mteb/models/model_implementations/ibm_granite_models.py +++ b/mteb/models/model_implementations/ibm_granite_models.py @@ -94,6 +94,7 @@ granite_107m_multilingual = ModelMeta( loader=sentence_transformers_loader, name="ibm-granite/granite-embedding-107m-multilingual", + model_type=["dense"], languages=GRANITE_LANGUAGES, open_weights=True, revision="47db56afe692f731540413c67dd818ff492277e7", @@ -118,6 +119,7 @@ granite_278m_multilingual = ModelMeta( loader=sentence_transformers_loader, name="ibm-granite/granite-embedding-278m-multilingual", + model_type=["dense"], languages=GRANITE_LANGUAGES, open_weights=True, revision="84e3546b88b0cb69f8078608a1df558020bcbf1f", @@ -142,6 +144,7 @@ granite_30m_english = ModelMeta( loader=sentence_transformers_loader, name="ibm-granite/granite-embedding-30m-english", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5", @@ -166,6 +169,7 @@ granite_125m_english = ModelMeta( loader=sentence_transformers_loader, name="ibm-granite/granite-embedding-125m-english", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730", @@ -191,6 +195,7 @@ granite_english_r2 = ModelMeta( loader=sentence_transformers_loader, name="ibm-granite/granite-embedding-english-r2", + 
model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="6e7b8ce0e76270394ac4669ba4bbd7133b60b7f9", @@ -215,6 +220,7 @@ granite_small_english_r2 = ModelMeta( loader=sentence_transformers_loader, name="ibm-granite/granite-embedding-small-english-r2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="54a8d2616a0844355a5164432d3f6dafb37b17a3", diff --git a/mteb/models/model_implementations/inf_models.py b/mteb/models/model_implementations/inf_models.py index ed87aa086a..53461f18c9 100644 --- a/mteb/models/model_implementations/inf_models.py +++ b/mteb/models/model_implementations/inf_models.py @@ -50,6 +50,7 @@ trust_remote_code=True, ), name="infly/inf-retriever-v1", + model_type=["dense"], languages=["eng-Latn", "zho-Hans"], open_weights=True, revision="cb70ca7c31dfa866b2eff2dad229c144d8ddfd91", @@ -76,6 +77,7 @@ trust_remote_code=True, ), name="infly/inf-retriever-v1-1.5b", + model_type=["dense"], languages=["eng-Latn", "zho-Hans"], open_weights=True, revision="c9c05c2dd50707a486966ba81703021ae2094a06", diff --git a/mteb/models/model_implementations/jasper_models.py b/mteb/models/model_implementations/jasper_models.py index 376bd631e3..cd9eeaa8ae 100644 --- a/mteb/models/model_implementations/jasper_models.py +++ b/mteb/models/model_implementations/jasper_models.py @@ -286,6 +286,7 @@ def encode( instruction_template="Instruct: {instruction}\nQuery: ", ), name="NovaSearch/jasper_en_vision_language_v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="d6330ce98f8a0d741e781df845904c9484f00efa", @@ -332,6 +333,7 @@ def encode( loader=InstructSentenceTransformerModel, loader_kwargs=jasper_token_compression_600m_loader_kwargs, name="infgrad/Jasper-Token-Compression-600M", + model_type=["dense"], languages=["eng-Latn", "zho-Hans"], open_weights=True, revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719", diff --git a/mteb/models/model_implementations/jina_clip.py 
b/mteb/models/model_implementations/jina_clip.py index 3c306fb64c..469a0c7b29 100644 --- a/mteb/models/model_implementations/jina_clip.py +++ b/mteb/models/model_implementations/jina_clip.py @@ -123,6 +123,7 @@ def encode( jina_clip_v1 = ModelMeta( loader=JinaCLIPModel, # type: ignore name="jinaai/jina-clip-v1", + model_type=["dense"], languages=["eng-Latn"], revision="06150c7c382d7a4faedc7d5a0d8cdb59308968f4", release_date="2024-05-30", diff --git a/mteb/models/model_implementations/jina_models.py b/mteb/models/model_implementations/jina_models.py index 1e87b78d47..f7d06931ec 100644 --- a/mteb/models/model_implementations/jina_models.py +++ b/mteb/models/model_implementations/jina_models.py @@ -720,6 +720,7 @@ def get_programming_task_override( trust_remote_code=True, ), name="jinaai/jina-reranker-v3", + model_type=["cross-encoder"], languages=multilingual_langs, open_weights=True, revision="050e171c4f75dfec5b648ed8470a2475e5a30f30", @@ -734,7 +735,6 @@ def get_programming_task_override( framework=["PyTorch"], use_instructions=None, reference="https://huggingface.co/jinaai/jina-reranker-v3", - is_cross_encoder=True, public_training_code=None, public_training_data=None, training_datasets=JINARerankerV3_TRAINING_DATA, @@ -763,6 +763,7 @@ def get_programming_task_override( }, ), name="jinaai/jina-embeddings-v4", + model_type=["dense"], languages=XLMR_LANGUAGES, open_weights=True, revision="4a58ca57710c49f51896e4bc820e202fbf64904b", @@ -811,6 +812,7 @@ def get_programming_task_override( }, ), name="jinaai/jina-embeddings-v3", + model_type=["dense"], languages=XLMR_LANGUAGES, open_weights=True, revision="215a6e121fa0183376388ac6b1ae230326bfeaed", @@ -864,6 +866,7 @@ def get_programming_task_override( trust_remote_code=True, ), name="jinaai/jina-embeddings-v2-base-en", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="6e85f575bc273f1fd840a658067d0157933c83f0", @@ -927,6 +930,7 @@ def get_programming_task_override( trust_remote_code=True, ), 
name="jinaai/jina-embeddings-v2-small-en", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="44e7d1d6caec8c883c2d4b207588504d519788d0", @@ -987,6 +991,7 @@ def get_programming_task_override( jina_embedding_b_en_v1 = ModelMeta( loader=SentenceTransformerEncoderWrapper, name="jinaai/jina-embedding-b-en-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="32aa658e5ceb90793454d22a57d8e3a14e699516", @@ -1043,6 +1048,7 @@ def get_programming_task_override( jina_embedding_s_en_v1 = ModelMeta( loader=SentenceTransformerEncoderWrapper, name="jinaai/jina-embedding-s-en-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="5ac6cd473e2324c6d5f9e558a6a9f65abb57143e", diff --git a/mteb/models/model_implementations/kalm_models.py b/mteb/models/model_implementations/kalm_models.py index 49b405a7de..6ee7d22b77 100644 --- a/mteb/models/model_implementations/kalm_models.py +++ b/mteb/models/model_implementations/kalm_models.py @@ -769,6 +769,7 @@ def encode( prompts_dict=KaLM_task_prompts, ), name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + model_type=["dense"], revision="45e42c89990c40aca042659133fc8b13c28634b5", release_date="2024-10-23", languages=["eng-Latn", "zho-Hans"], @@ -793,6 +794,7 @@ def encode( HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta( loader=sentence_transformers_loader, name="HIT-TMG/KaLM-embedding-multilingual-mini-v1", + model_type=["dense"], revision="8a82a0cd2b322b91723e252486f7cce6fd8ac9d3", release_date="2024-08-27", languages=["eng-Latn", "zho-Hans"], @@ -823,6 +825,7 @@ def encode( prompts_dict=KaLM_task_prompts, ), name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", + model_type=["dense"], revision="fcff2f8a54e4cd96b7766fef1ee960a43d42bb3c", release_date="2024-12-26", languages=["eng-Latn", "zho-Hans"], @@ -853,6 +856,7 @@ def encode( prompts_dict=KaLM_v2_task_prompts, ), name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2", + 
model_type=["dense"], revision="d2a21c232dc712ae8230af56d1027cf21b7864bf", release_date="2025-06-25", languages=["eng-Latn", "zho-Hans"], @@ -883,6 +887,7 @@ def encode( prompts_dict=KaLM_v2_task_prompts, ), name="KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5", + model_type=["dense"], revision="6a4cfc1084cb459ebd4729b53a8656a61448c720", release_date="2025-09-30", languages=["eng-Latn", "zho-Hans"], @@ -912,6 +917,7 @@ def encode( prompts_dict=KaLM_Embedding_gemma_3_12b_task_prompts, ), name="tencent/KaLM-Embedding-Gemma3-12B-2511", + model_type=["dense"], revision="edf22f4753f58b05e3f5495818d31f12db63056d", languages=None, open_weights=True, diff --git a/mteb/models/model_implementations/kblab.py b/mteb/models/model_implementations/kblab.py index 76dd98945b..82b7eff0c4 100644 --- a/mteb/models/model_implementations/kblab.py +++ b/mteb/models/model_implementations/kblab.py @@ -4,6 +4,7 @@ sbert_swedish = ModelMeta( loader=sentence_transformers_loader, # type: ignore[arg-type] name="KBLab/sentence-bert-swedish-cased", + model_type=["dense"], languages=["swe-Latn"], open_weights=True, revision="6b5e83cd29c03729cfdc33d13b1423399b0efb5c", diff --git a/mteb/models/model_implementations/kennethenevoldsen_models.py b/mteb/models/model_implementations/kennethenevoldsen_models.py index 9f811fd174..1b9b8ca4e1 100644 --- a/mteb/models/model_implementations/kennethenevoldsen_models.py +++ b/mteb/models/model_implementations/kennethenevoldsen_models.py @@ -6,6 +6,7 @@ dfm_enc_large = ModelMeta( loader=sentence_transformers_loader, # type: ignore name="KennethEnevoldsen/dfm-sentence-encoder-large", + model_type=["dense"], languages=["dan-Latn"], open_weights=True, revision="132c53391e7a780dc6a2f9a03724d0158fe7122c", @@ -40,6 +41,7 @@ dfm_enc_med = ModelMeta( loader=sentence_transformers_loader, # type: ignore name="KennethEnevoldsen/dfm-sentence-encoder-medium", + model_type=["dense"], languages=["dan-Latn"], open_weights=True, 
revision="701bce95d499fa97610d57e8823c54fd1fb79930", diff --git a/mteb/models/model_implementations/kfst.py b/mteb/models/model_implementations/kfst.py index 0d0433518a..361b814947 100644 --- a/mteb/models/model_implementations/kfst.py +++ b/mteb/models/model_implementations/kfst.py @@ -4,6 +4,7 @@ xlmr_scandi = ModelMeta( loader=sentence_transformers_loader, # type: ignore[arg-type] name="KFST/XLMRoberta-en-da-sv-nb", + model_type=["dense"], languages=["swe-Latn", "nob-Latn", "nno-Latn", "dan-Latn", "eng-Latn"], open_weights=True, revision="d40c10ca7b1e68b5a8372f2d112dac9eb3279df1", diff --git a/mteb/models/model_implementations/kowshik24_models.py b/mteb/models/model_implementations/kowshik24_models.py index dfde2ee9e6..1d2c7727ac 100644 --- a/mteb/models/model_implementations/kowshik24_models.py +++ b/mteb/models/model_implementations/kowshik24_models.py @@ -3,6 +3,7 @@ kowshik24_bangla_embedding_model = ModelMeta( loader=sentence_transformers_loader, name="Kowshik24/bangla-sentence-transformer-ft-matryoshka-paraphrase-multilingual-mpnet-base-v2", + model_type=["dense"], languages=["ben-Beng"], # Bengali using Bengali script open_weights=True, revision="6689c21e69be5950596bad084457cbaa138728d8", diff --git a/mteb/models/model_implementations/lens_models.py b/mteb/models/model_implementations/lens_models.py index d7bf5ef06f..613f39aaf3 100644 --- a/mteb/models/model_implementations/lens_models.py +++ b/mteb/models/model_implementations/lens_models.py @@ -12,6 +12,7 @@ lens_d4000 = ModelMeta( loader=None, name="yibinlei/LENS-d4000", + model_type=["dense"], languages=None, open_weights=True, revision="e473b33364e6c48a324796fd1411d3b93670c6fe", @@ -34,6 +35,7 @@ lens_d8000 = ModelMeta( loader=None, name="yibinlei/LENS-d8000", + model_type=["dense"], languages=None, open_weights=True, revision="a0b87bd91cb27b6f2f0b0fe22c28026da1d464ef", diff --git a/mteb/models/model_implementations/lgai_embedding_models.py b/mteb/models/model_implementations/lgai_embedding_models.py 
index c2b5536922..824345a6bf 100644 --- a/mteb/models/model_implementations/lgai_embedding_models.py +++ b/mteb/models/model_implementations/lgai_embedding_models.py @@ -44,6 +44,7 @@ lgai_embedding_en = ModelMeta( loader=sentence_transformers_loader, name="annamodels/LGAI-Embedding-Preview", + model_type=["dense"], languages=[ "eng-Latn", ], diff --git a/mteb/models/model_implementations/linq_models.py b/mteb/models/model_implementations/linq_models.py index 38073c2d47..63e6dd7a42 100644 --- a/mteb/models/model_implementations/linq_models.py +++ b/mteb/models/model_implementations/linq_models.py @@ -32,6 +32,7 @@ def instruction_template( normalized=True, ), name="Linq-AI-Research/Linq-Embed-Mistral", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="0c1a0b0589177079acc552433cad51d7c9132379", diff --git a/mteb/models/model_implementations/listconranker.py b/mteb/models/model_implementations/listconranker.py index d8345ca309..d7b09f7e0b 100644 --- a/mteb/models/model_implementations/listconranker.py +++ b/mteb/models/model_implementations/listconranker.py @@ -112,6 +112,7 @@ def predict( fp_options="float16", ), name="ByteDance/ListConRanker", + model_type=["cross-encoder"], languages=["zho-Hans"], open_weights=True, revision="95ae6a5f422a916bc36520f0f3e198e7d91520a0", @@ -128,6 +129,5 @@ def predict( use_instructions=False, public_training_code=None, public_training_data=None, - is_cross_encoder=True, citation=LISTCONRANKER_CITATION, ) diff --git a/mteb/models/model_implementations/llm2clip_models.py b/mteb/models/model_implementations/llm2clip_models.py index f2123c600e..fe66d5812e 100644 --- a/mteb/models/model_implementations/llm2clip_models.py +++ b/mteb/models/model_implementations/llm2clip_models.py @@ -183,6 +183,7 @@ def encode( llm2clip_openai_l_14_336 = ModelMeta( loader=llm2clip_loader, # type: ignore name="microsoft/LLM2CLIP-Openai-L-14-336", + model_type=["dense"], languages=["eng-Latn"], 
revision="92512331f393a003c3d98404677f991c188162c9", release_date="2024-11-07", @@ -207,6 +208,7 @@ def encode( llm2clip_openai_l_14_224 = ModelMeta( loader=llm2clip_loader, # type: ignore name="microsoft/LLM2CLIP-Openai-L-14-224", + model_type=["dense"], languages=["eng-Latn"], revision="6b8a11a94ff380fa220dfefe73ac9293d2677575", release_date="2024-11-07", @@ -230,6 +232,7 @@ def encode( llm2clip_openai_b_16 = ModelMeta( loader=llm2clip_loader, # type: ignore name="microsoft/LLM2CLIP-Openai-B-16", + model_type=["dense"], languages=["eng-Latn"], revision="ecfb347eb3dcfeb2fbc2a2eae7de6ac5a001aaf8", release_date="2024-11-07", diff --git a/mteb/models/model_implementations/llm2vec_models.py b/mteb/models/model_implementations/llm2vec_models.py index 4370eece1f..e5683a4755 100644 --- a/mteb/models/model_implementations/llm2vec_models.py +++ b/mteb/models/model_implementations/llm2vec_models.py @@ -132,6 +132,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="baa8ebf04a1c2500e61288e7dad65e8ae42601a7", @@ -161,6 +162,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="1cb7b735326d13a8541db8f57f35da5373f5e9c6", @@ -189,6 +191,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="0ae69bdd5816105778b971c3138e8f8a18eaa3ae", @@ -217,6 +220,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse", + model_type=["dense"], languages=["eng-Latn"], 
open_weights=True, revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8", @@ -245,6 +249,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8", @@ -273,6 +278,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="a76944871d169ebe7c97eb921764cd063afed785", @@ -301,6 +307,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="a5943d406c6b016fef3f07906aac183cf1a0b47d", @@ -329,6 +336,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="a5943d406c6b016fef3f07906aac183cf1a0b47d", diff --git a/mteb/models/model_implementations/mcinext_models.py b/mteb/models/model_implementations/mcinext_models.py index bfeb9c0e83..507b708115 100644 --- a/mteb/models/model_implementations/mcinext_models.py +++ b/mteb/models/model_implementations/mcinext_models.py @@ -344,6 +344,7 @@ def encode( loader=HakimModelWrapper, loader_kwargs=dict( api_model_name="hakim", ), name="MCINext/Hakim", + model_type=["dense"], languages=["fas-Arab"], @@ -411,6 +412,7 @@ def encode( loader=HakimModelWrapper, loader_kwargs=dict( api_model_name="hakim-small", ), name="MCINext/Hakim-small", + model_type=["dense"], languages=["fas-Arab"], @@ -477,6 +479,7 @@ def encode( loader=HakimModelWrapper, loader_kwargs=dict( api_model_name="hakim-unsup", ),
name="MCINext/Hakim-unsup", + model_type=["dense"], languages=["fas-Arab"], diff --git a/mteb/models/model_implementations/mdbr_models.py b/mteb/models/model_implementations/mdbr_models.py index 9da791a10b..b4790db698 100644 --- a/mteb/models/model_implementations/mdbr_models.py +++ b/mteb/models/model_implementations/mdbr_models.py @@ -30,6 +30,7 @@ model_prompts=model_prompts, ), name="MongoDB/mdbr-leaf-ir", + model_type=["dense"], revision="2e46f5aac796e621d51f678c306a66ede4712ecb", release_date="2025-08-27", languages=["eng-Latn"], @@ -57,6 +58,7 @@ model_prompts=model_prompts, ), name="MongoDB/mdbr-leaf-mt", + model_type=["dense"], revision="66c47ba6d753efc208d54412b5af6c744a39a4df", release_date="2025-08-27", languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/misc_models.py b/mteb/models/model_implementations/misc_models.py index a8e9912b01..89deabbb2c 100644 --- a/mteb/models/model_implementations/misc_models.py +++ b/mteb/models/model_implementations/misc_models.py @@ -13,6 +13,7 @@ Haon_Chen__speed_embedding_7b_instruct = ModelMeta( loader=sentence_transformers_loader, name="Haon-Chen/speed-embedding-7b-instruct", + model_type=["dense"], revision="c167e9a8144b397622ce47b85d9edcdeecef3d3f", release_date="2024-10-31", languages=["eng-Latn"], @@ -40,6 +41,7 @@ ) Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta( name="Gameselo/STS-multilingual-mpnet-base-v2", + model_type=["dense"], revision="449f917af30f590fc31f9ffb226c94f21a2f47b8", release_date="2024-06-07", languages=[], @@ -131,6 +133,7 @@ Hum_Works__lodestone_base_4096_v1 = ModelMeta( name="Hum-Works/lodestone-base-4096-v1", + model_type=["dense"], revision="9bbc2d0b57dd2198aea029404b0f976712a7d966", release_date="2023-08-25", languages=["eng-Latn"], @@ -197,6 +200,7 @@ ) Jaume__gemma_2b_embeddings = ModelMeta( name="Jaume/gemma-2b-embeddings", + model_type=["dense"], revision="86431f65d7c3f66b2af096c61e614a2958f191f1", release_date="2024-06-29", languages=[], @@ -228,6 +232,7 @@
Lajavaness__bilingual_embedding_base = ModelMeta( name="Lajavaness/bilingual-embedding-base", + model_type=["dense"], revision="0bfc54bb2aa2666dd84715289c7ef58a95eb4d8d", release_date="2024-06-26", languages=None, @@ -253,6 +258,7 @@ ) Lajavaness__bilingual_embedding_large = ModelMeta( name="Lajavaness/bilingual-embedding-large", + model_type=["dense"], revision="e83179d7a66e8aed1b3015e98bb5ae234ed89598", release_date="2024-06-24", languages=["fra-Latn", "eng-Latn"], @@ -278,6 +284,7 @@ ) Lajavaness__bilingual_embedding_small = ModelMeta( name="Lajavaness/bilingual-embedding-small", + model_type=["dense"], revision="ed4a1dd814de0db81d4a4e287c296a03194463e3", release_date="2024-07-17", languages=["fra-Latn", "eng-Latn"], @@ -303,6 +310,7 @@ ) Mihaiii__Bulbasaur = ModelMeta( name="Mihaiii/Bulbasaur", + model_type=["dense"], revision="6876f839e18ae36224049a41194a431953f08747", release_date="2024-04-27", languages=None, @@ -326,6 +334,7 @@ ) Mihaiii__Ivysaur = ModelMeta( name="Mihaiii/Ivysaur", + model_type=["dense"], revision="65914d976f45beb4bda7485c39d88865b4ce6554", release_date="2024-04-27", languages=None, @@ -349,6 +358,7 @@ ) Mihaiii__Squirtle = ModelMeta( name="Mihaiii/Squirtle", + model_type=["dense"], revision="5b991da48a9286637a256d4a35aab87a1a57b78a", release_date="2024-04-30", languages=None, @@ -372,6 +382,7 @@ ) Mihaiii__Venusaur = ModelMeta( name="Mihaiii/Venusaur", + model_type=["dense"], revision="0dc817f0addbb7bab8feeeeaded538f9ffeb3419", release_date="2024-04-29", languages=None, @@ -395,6 +406,7 @@ ) Mihaiii__Wartortle = ModelMeta( name="Mihaiii/Wartortle", + model_type=["dense"], revision="14caca5253414d38a7d28b62d1b7c30ef3293a87", release_date="2024-04-30", languages=None, @@ -418,6 +430,7 @@ ) Mihaiii__gte_micro = ModelMeta( name="Mihaiii/gte-micro", + model_type=["dense"], revision="6fd2397cb9dfa7c901aedf9a2a44d3c888ccafdd", release_date="2024-04-21", languages=None, @@ -440,6 +453,7 @@ ) Mihaiii__gte_micro_v4 = ModelMeta( 
name="Mihaiii/gte-micro-v4", + model_type=["dense"], revision="78e1a4b348f8524c3ab2e3e3475788f5adb8c98f", release_date="2024-04-22", languages=None, @@ -462,6 +476,7 @@ ) OrdalieTech__Solon_embeddings_large_0_1 = ModelMeta( name="OrdalieTech/Solon-embeddings-large-0.1", + model_type=["dense"], revision="9f6465f6ea2f6d10c6294bc15d84edf87d47cdef", release_date="2023-12-09", languages=["fra-Latn"], @@ -484,6 +499,7 @@ ) Omartificial_Intelligence_Space__Arabert_all_nli_triplet_Matryoshka = ModelMeta( name="Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", + model_type=["dense"], revision="d0361a36f6fe69febfc8550d0918abab174f6f30", release_date="2024-06-16", languages=["ara-Arab"], @@ -506,6 +522,7 @@ ) Omartificial_Intelligence_Space__Arabic_MiniLM_L12_v2_all_nli_triplet = ModelMeta( name="Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", + model_type=["dense"], revision="6916465c43b984e955aa6dc72851474f0128f428", release_date="2024-06-25", languages=["ara-Arab"], @@ -530,6 +547,7 @@ ) Omartificial_Intelligence_Space__Arabic_all_nli_triplet_Matryoshka = ModelMeta( name="Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", + model_type=["dense"], revision="1ca467cc576bd76666a4d21b24ee43afa914dd10", release_date="2024-06-14", languages=["ara-Arab"], @@ -554,6 +572,7 @@ ) Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta( name="Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", + model_type=["dense"], revision="ee6d5e33c78ed582ade47fd452a74ea52aa5bfe2", release_date="2024-06-16", languages=["ara-Arab"], @@ -578,6 +597,7 @@ ) Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta( name="Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", + model_type=["dense"], revision="2628cb641e040f44328195fadcdfb58e6d5cffa7", release_date="2024-06-15", languages=["ara-Arab"], @@ -602,6 +622,7 @@ ) Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = 
ModelMeta( name="Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", + model_type=["dense"], revision="ecf3274e164f057c4a3dd70691cae0265d87a9d0", release_date="2024-06-17", languages=["ara-Arab"], @@ -624,6 +645,7 @@ ) consciousai__cai_lunaris_text_embeddings = ModelMeta( name="consciousAI/cai-lunaris-text-embeddings", + model_type=["dense"], revision="8332c464d13505968ff7a6e2213f36fd8730b4c7", release_date="2023-06-22", languages=None, @@ -646,6 +668,7 @@ ) consciousai__cai_stellaris_text_embeddings = ModelMeta( name="consciousAI/cai-stellaris-text-embeddings", + model_type=["dense"], revision="c000ec4b29588daf0f4a0b2ad4e72ee807d8efc0", release_date="2023-06-23", languages=None, @@ -677,6 +700,7 @@ } manu__sentence_croissant_alpha_v0_2 = ModelMeta( name="manu/sentence_croissant_alpha_v0.2", + model_type=["dense"], revision="4610b8cea65d7dd59e0b04af50753933fe5b29b2", release_date="2024-03-15", languages=None, @@ -699,6 +723,7 @@ ) manu__sentence_croissant_alpha_v0_3 = ModelMeta( name="manu/sentence_croissant_alpha_v0.3", + model_type=["dense"], revision="4ac16754f3d81aba76cc32955dc9ee4122df96eb", release_date="2024-04-26", languages=None, @@ -721,6 +746,7 @@ ) manu__sentence_croissant_alpha_v0_4 = ModelMeta( name="manu/sentence_croissant_alpha_v0.4", + model_type=["dense"], revision="0ce6372e6a3c21134dcf26dcde13cca869c767fc", release_date="2024-04-27", languages=["fra-Latn", "eng-Latn"], @@ -744,6 +770,7 @@ ) thenlper__gte_base = ModelMeta( name="thenlper/gte-base", + model_type=["dense"], revision="c078288308d8dee004ab72c6191778064285ec0c", release_date="2023-07-27", languages=["eng-Latn"], @@ -766,6 +793,7 @@ ) thenlper__gte_large = ModelMeta( name="thenlper/gte-large", + model_type=["dense"], revision="4bef63f39fcc5e2d6b0aae83089f307af4970164", release_date="2023-07-27", languages=["eng-Latn"], @@ -788,6 +816,7 @@ ) thenlper__gte_small = ModelMeta( name="thenlper/gte-small", + model_type=["dense"], 
revision="17e1f347d17fe144873b1201da91788898c639cd", release_date="2023-07-27", languages=["eng-Latn"], @@ -810,6 +839,7 @@ ) OrlikB__KartonBERT_USE_base_v1 = ModelMeta( name="OrlikB/KartonBERT-USE-base-v1", + model_type=["dense"], revision="1f59dd58fe57995c0e867d5e29f03763eae99645", release_date="2024-09-30", languages=["pol-Latn"], @@ -832,6 +862,7 @@ ) OrlikB__st_polish_kartonberta_base_alpha_v1 = ModelMeta( name="OrlikB/st-polish-kartonberta-base-alpha-v1", + model_type=["dense"], revision="5590a0e2d7bb43674e44d7076b3ff157f7d4a1cb", release_date="2023-11-12", languages=["pol-Latn"], @@ -854,6 +885,7 @@ ) sdadas__mmlw_e5_base = ModelMeta( name="sdadas/mmlw-e5-base", + model_type=["dense"], revision="f10628ed55b5ec400502aff439bd714a6da0af30", release_date="2023-11-17", languages=["pol-Latn"], @@ -876,6 +908,7 @@ ) dwzhu__e5_base_4k = ModelMeta( name="dwzhu/e5-base-4k", + model_type=["dense"], revision="1b5664b8cb2bccd8c309429b7bfe5864402e8fbc", release_date="2024-03-28", languages=["eng-Latn"], @@ -898,6 +931,7 @@ ) sdadas__mmlw_e5_large = ModelMeta( name="sdadas/mmlw-e5-large", + model_type=["dense"], revision="5c143fb045ebed664fd85b43fc45155999eb110f", release_date="2023-11-17", languages=["pol-Latn"], @@ -920,6 +954,7 @@ ) sdadas__mmlw_e5_small = ModelMeta( name="sdadas/mmlw-e5-small", + model_type=["dense"], revision="ff1298cb6d997f18b794d2f3d73cad2ba2ad739a", release_date="2023-11-17", languages=["pol-Latn"], @@ -942,6 +977,7 @@ ) sdadas__mmlw_roberta_base = ModelMeta( name="sdadas/mmlw-roberta-base", + model_type=["dense"], revision="0ac7f23f6c96af601fa6a17852bd08d5136d6365", release_date="2023-11-17", languages=["pol-Latn"], @@ -964,6 +1000,7 @@ ) sdadas__mmlw_roberta_large = ModelMeta( name="sdadas/mmlw-roberta-large", + model_type=["dense"], revision="b8058066a8de32d0737b3cd82d8b4f4108745af9", release_date="2023-11-17", languages=["pol-Latn"], @@ -1041,6 +1078,7 @@ izhx__udever_bloom_1b1 = ModelMeta( name="izhx/udever-bloom-1b1", + model_type=["dense"], 
revision="7bf1ee29878cb040b2708a691aa4b61f27eaa252", release_date="2023-10-24", languages=udever_languages, @@ -1063,6 +1101,7 @@ ) izhx__udever_bloom_3b = ModelMeta( name="izhx/udever-bloom-3b", + model_type=["dense"], revision="4edd8affe80ca89ba0f6b6ba4103fc7f25fc57b2", release_date="2023-10-24", languages=udever_languages, @@ -1085,6 +1124,7 @@ ) izhx__udever_bloom_560m = ModelMeta( name="izhx/udever-bloom-560m", + model_type=["dense"], revision="b2a723e355946ec5a5c5fbed3459766627ded2bb", release_date="2023-10-24", languages=udever_languages, @@ -1107,6 +1147,7 @@ ) izhx__udever_bloom_7b1 = ModelMeta( name="izhx/udever-bloom-7b1", + model_type=["dense"], revision="18e8d3e6dbd94868584877f2e72a105a17df22ef", release_date="2023-10-24", languages=udever_languages, @@ -1129,6 +1170,7 @@ ) avsolatorio__gist_embedding_v0 = ModelMeta( name="avsolatorio/GIST-Embedding-v0", + model_type=["dense"], revision="bf6b2e55e92f510a570ad4d7d2da2ec8cd22590c", release_date="2024-01-31", languages=["eng-Latn"], @@ -1168,6 +1210,7 @@ ) avsolatorio__gist_all_minilm_l6_v2 = ModelMeta( name="avsolatorio/GIST-all-MiniLM-L6-v2", + model_type=["dense"], revision="ea89dfad053bba14677bb784a4269898abbdce44", release_date="2024-02-03", languages=["eng-Latn"], @@ -1207,6 +1250,7 @@ ) avsolatorio__gist_large_embedding_v0 = ModelMeta( name="avsolatorio/GIST-large-Embedding-v0", + model_type=["dense"], revision="7831200e2f7819b994490c091cf3258a2b821f0c", release_date="2024-02-14", languages=["eng-Latn"], @@ -1246,6 +1290,7 @@ ) avsolatorio__gist_small_embedding_v0 = ModelMeta( name="avsolatorio/GIST-small-Embedding-v0", + model_type=["dense"], revision="d6c4190f9e01b9994dc7cac99cf2f2b85cfb57bc", release_date="2024-02-03", languages=["eng-Latn"], @@ -1285,6 +1330,7 @@ ) bigscience__sgpt_bloom_7b1_msmarco = ModelMeta( name="bigscience/sgpt-bloom-7b1-msmarco", + model_type=["dense"], revision="dc579f3d2d5a0795eba2049e16c3e36c74007ad3", release_date="2022-08-26", languages=None, @@ -1307,6 +1353,7 @@ ) 
aari1995__german_semantic_sts_v2 = ModelMeta( name="aari1995/German_Semantic_STS_V2", + model_type=["dense"], revision="22912542b0ec7a7ef369837e28ffe6352a27afc9", release_date="2022-11-17", languages=["deu-Latn"], @@ -1330,6 +1377,7 @@ ) abhinand__medembed_small_v0_1 = ModelMeta( name="abhinand/MedEmbed-small-v0.1", + model_type=["dense"], revision="40a5850d046cfdb56154e332b4d7099b63e8d50e", release_date="2024-10-20", languages=["eng-Latn"], @@ -1361,6 +1409,7 @@ ) avsolatorio__noinstruct_small_embedding_v0 = ModelMeta( name="avsolatorio/NoInstruct-small-Embedding-v0", + model_type=["dense"], revision="b38747000553d8268915c95a55fc87e707c9aadd", release_date="2024-05-01", languages=["eng-Latn"], @@ -1383,6 +1432,7 @@ ) brahmairesearch__slx_v0_1 = ModelMeta( name="brahmairesearch/slx-v0.1", + model_type=["dense"], revision="688c83fd1a7f34b25575a2bc26cfd87c11b4ce71", release_date="2024-08-13", languages=["eng-Latn"], @@ -1405,6 +1455,7 @@ ) deepfile__embedder_100p = ModelMeta( name="deepfile/embedder-100p", + model_type=["dense"], revision="aa02f08f11517977fbcdc94dc9dbf9a1ca152d9b", release_date="2023-07-24", languages=None, @@ -1427,6 +1478,7 @@ ) infgrad__stella_base_en_v2 = ModelMeta( name="infgrad/stella-base-en-v2", + model_type=["dense"], revision="c9e80ff9892d80b39dc54e30a7873f91ea161034", release_date="2023-10-19", languages=["eng-Latn"], @@ -1449,6 +1501,7 @@ ) malenia1__ternary_weight_embedding = ModelMeta( name="malenia1/ternary-weight-embedding", + model_type=["dense"], revision="a1208fb7f646647bb62639fd2e1eb6cc2ef3738e", release_date="2024-10-23", languages=None, @@ -1471,6 +1524,7 @@ ) omarelshehy__arabic_english_sts_matryoshka = ModelMeta( name="omarelshehy/arabic-english-sts-matryoshka", + model_type=["dense"], revision="763d116fbe8bf7883c64635c862feeaa3768bb64", release_date="2024-10-13", languages=["ara-Arab", "eng-Latn"], @@ -1502,6 +1556,7 @@ # 
https://huggingface.co/openbmb/MiniCPM-Embedding/blob/c0cb2de33fb366e17c30f9d53142ff11bc18e049/README.md?code=true#L405 ), name="openbmb/MiniCPM-Embedding", + model_type=["dense"], revision="c0cb2de33fb366e17c30f9d53142ff11bc18e049", release_date="2024-09-04", languages=["zho-Hans", "eng-Latn"], @@ -1524,6 +1579,7 @@ silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta( name="silma-ai/silma-embeddding-matryoshka-v0.1", + model_type=["dense"], revision="a520977a9542ebdb8a7206df6b7ff6977f1886ea", release_date="2024-10-12", languages=["ara-Arab", "eng-Latn"], @@ -1547,6 +1603,7 @@ sbert_chinese_general_v1 = ModelMeta( name="DMetaSoul/sbert-chinese-general-v1", + model_type=["dense"], revision="bd27765956bcc2fcf682de0097819947ac10037e", release_date="2022-03-25", languages=["zho-Hans"], @@ -1574,6 +1631,7 @@ dmeta_embedding_zh_small = ModelMeta( name="DMetaSoul/Dmeta-embedding-zh-small", + model_type=["dense"], revision="2050d3439a2f68999dd648c1697471acaac37a29", release_date="2024-03-25", languages=["zho-Hans"], @@ -1596,6 +1654,7 @@ xiaobu_embedding = ModelMeta( name="lier007/xiaobu-embedding", + model_type=["dense"], revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", release_date="2024-01-09", languages=["zho-Hans"], @@ -1619,6 +1678,7 @@ xiaobu_embedding_v2 = ModelMeta( name="lier007/xiaobu-embedding-v2", + model_type=["dense"], revision="1912f2e59a5c2ef802a471d735a38702a5c9485e", release_date="2024-06-30", languages=["zho-Hans"], @@ -1642,6 +1702,7 @@ yinka_embedding = ModelMeta( name="Classical/Yinka", + model_type=["dense"], revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", release_date="2024-01-09", languages=["zho-Hans"], @@ -1664,6 +1725,7 @@ ) conan_embedding = ModelMeta( name="TencentBAC/Conan-embedding-v1", + model_type=["dense"], revision="bb9749a57d4f02fd71722386f8d0f5a9398d7eeb", release_date="2024-08-22", languages=["zho-Hans"], @@ -1688,6 +1750,7 @@ ember_v1 = ModelMeta( loader=sentence_transformers_loader, name="llmrails/ember-v1", + 
model_type=["dense"], revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d", release_date="2023-10-10", languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/mme5_models.py b/mteb/models/model_implementations/mme5_models.py index 16895ad246..8a40673f22 100644 --- a/mteb/models/model_implementations/mme5_models.py +++ b/mteb/models/model_implementations/mme5_models.py @@ -12,6 +12,7 @@ "trust_remote_code": True, }, name="intfloat/mmE5-mllama-11b-instruct", + model_type=["dense"], revision="cbb328b9bf9ff5362c852c3166931903226d46f1", release_date="2025-02-12", languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/moco_models.py b/mteb/models/model_implementations/moco_models.py index 761df8bf95..236fca0fd3 100644 --- a/mteb/models/model_implementations/moco_models.py +++ b/mteb/models/model_implementations/moco_models.py @@ -119,6 +119,7 @@ def encode( mocov3_vit_base = ModelMeta( loader=mocov3_loader, # type: ignore name="nyu-visionx/moco-v3-vit-b", + model_type=["dense"], languages=["eng-Latn"], revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d", release_date="2024-06-03", @@ -142,6 +143,7 @@ def encode( mocov3_vit_large = ModelMeta( loader=mocov3_loader, # type: ignore name="nyu-visionx/moco-v3-vit-l", + model_type=["dense"], languages=["eng-Latn"], revision="7bf75358d616f39b9716148bf4e3425f3bd35b47", release_date="2024-06-03", diff --git a/mteb/models/model_implementations/model2vec_models.py b/mteb/models/model_implementations/model2vec_models.py index f0ce608aa3..0f45674aac 100644 --- a/mteb/models/model_implementations/model2vec_models.py +++ b/mteb/models/model_implementations/model2vec_models.py @@ -161,6 +161,7 @@ def encode( m2v_base_glove_subword = ModelMeta( loader=Model2VecModel, name="minishlab/M2V_base_glove_subword", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="5f4f5ca159b7321a8b39739bba0794fa0debddf4", @@ -186,6 +187,7 @@ def encode( m2v_base_glove = ModelMeta( loader=Model2VecModel, 
name="minishlab/M2V_base_glove", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2b", @@ -210,6 +212,7 @@ def encode( m2v_base_output = ModelMeta( loader=Model2VecModel, name="minishlab/M2V_base_output", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="02460ae401a22b09d2c6652e23371398329551e2", @@ -234,6 +237,7 @@ def encode( m2v_multilingual_output = ModelMeta( loader=Model2VecModel, name="minishlab/M2V_multilingual_output", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="2cf4ec4e1f51aeca6c55cf9b93097d00711a6305", @@ -258,6 +262,7 @@ def encode( potion_base_2m = ModelMeta( loader=Model2VecModel, name="minishlab/potion-base-2M", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="86db093558fbced2072b929eb1690bce5272bd4b", @@ -282,6 +287,7 @@ def encode( potion_base_4m = ModelMeta( loader=Model2VecModel, name="minishlab/potion-base-4M", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="81b1802ada41afcd0987a37dc15e569c9fa76f04", @@ -306,6 +312,7 @@ def encode( potion_base_8m = ModelMeta( loader=Model2VecModel, name="minishlab/potion-base-8M", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="dcbec7aa2d52fc76754ac6291803feedd8c619ce", @@ -330,6 +337,7 @@ def encode( potion_multilingual_128m = ModelMeta( loader=Model2VecModel, name="minishlab/potion-multilingual-128M", + model_type=["dense"], languages=_POTION_MULTILINGUAL_128M_LANGUAGES, open_weights=True, revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2a", @@ -354,6 +362,7 @@ def encode( pubmed_bert_100k = ModelMeta( loader=Model2VecModel, name="NeuML/pubmedbert-base-embeddings-100K", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="bac5e3b12fb8c650e92a19c41b436732c4f16e9e", @@ -377,6 +386,7 @@ def encode( pubmed_bert_500k = ModelMeta( loader=Model2VecModel, 
name="NeuML/pubmedbert-base-embeddings-500K", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="34ba71e35c393fdad7ed695113f653feb407b16b", @@ -400,6 +410,7 @@ def encode( pubmed_bert_1m = ModelMeta( loader=Model2VecModel, name="NeuML/pubmedbert-base-embeddings-1M", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="2b7fed222594708da6d88bcda92ae9b434b7ddd1", @@ -423,6 +434,7 @@ def encode( pubmed_bert_2m = ModelMeta( loader=Model2VecModel, name="NeuML/pubmedbert-base-embeddings-2M", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="1d7bbe04d6713e425161146bfdc71473cbed498a", @@ -446,6 +458,7 @@ def encode( pubmed_bert_8m = ModelMeta( loader=Model2VecModel, name="NeuML/pubmedbert-base-embeddings-8M", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="387d350015e963744f4fafe56a574b7cd48646c9", diff --git a/mteb/models/model_implementations/moka_models.py b/mteb/models/model_implementations/moka_models.py index 1c45cc25fc..bbf7036077 100644 --- a/mteb/models/model_implementations/moka_models.py +++ b/mteb/models/model_implementations/moka_models.py @@ -91,6 +91,7 @@ m3e_base = ModelMeta( loader=sentence_transformers_loader, name="moka-ai/m3e-base", + model_type=["dense"], languages=["zho-Hans", "eng-Latn"], open_weights=True, revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c", @@ -116,6 +117,7 @@ m3e_small = ModelMeta( loader=sentence_transformers_loader, name="moka-ai/m3e-small", + model_type=["dense"], languages=["zho-Hans", "eng-Latn"], open_weights=True, revision="44c696631b2a8c200220aaaad5f987f096e986df", @@ -141,6 +143,7 @@ m3e_large = ModelMeta( loader=sentence_transformers_loader, name="moka-ai/m3e-large", + model_type=["dense"], languages=["zho-Hans", "eng-Latn"], open_weights=True, revision="12900375086c37ba5d83d1e417b21dc7d1d1f388", diff --git a/mteb/models/model_implementations/mxbai_models.py b/mteb/models/model_implementations/mxbai_models.py index 
5c8fc8430b..d19036fc13 100644 --- a/mteb/models/model_implementations/mxbai_models.py +++ b/mteb/models/model_implementations/mxbai_models.py @@ -21,6 +21,7 @@ }, ), name="mixedbread-ai/mxbai-embed-large-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="990580e27d329c7408b3741ecff85876e128e203", @@ -57,6 +58,7 @@ mxbai_embed_2d_large_v1 = ModelMeta( loader=sentence_transformers_loader, name="mixedbread-ai/mxbai-embed-2d-large-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="7e639ca8e344af398876ead3b19ec3c0b9068f49", @@ -81,6 +83,7 @@ mxbai_embed_xsmall_v1 = ModelMeta( loader=sentence_transformers_loader, name="mixedbread-ai/mxbai-embed-xsmall-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="2f741ec33328bb57e4704e1238fc59a4a5745705", diff --git a/mteb/models/model_implementations/nbailab.py b/mteb/models/model_implementations/nbailab.py index 17385668a9..a8036f56e7 100644 --- a/mteb/models/model_implementations/nbailab.py +++ b/mteb/models/model_implementations/nbailab.py @@ -6,6 +6,7 @@ nb_sbert = ModelMeta( loader=SentenceTransformerEncoderWrapper, # type: ignore[arg-type] name="NbAiLab/nb-sbert-base", + model_type=["dense"], languages=["nno-Latn", "nob-Latn", "swe-Latn", "dan-Latn"], open_weights=True, revision="b95656350a076aeafd2d23763660f80655408cc6", @@ -27,6 +28,7 @@ nb_bert_large = ModelMeta( loader=SentenceTransformerEncoderWrapper, # type: ignore[arg-type] name="NbAiLab/nb-bert-large", + model_type=["dense"], languages=["nno-Latn", "nob-Latn"], open_weights=True, revision="f9d0fc184adab4dc354d85e1854b7634540d7550", @@ -48,6 +50,7 @@ nb_bert_base = ModelMeta( loader=SentenceTransformerEncoderWrapper, # type: ignore[arg-type] name="NbAiLab/nb-bert-base", + model_type=["dense"], languages=["nno-Latn", "nob-Latn"], open_weights=True, revision="9417c3f62a3adc99f17ff92bff446f35d011f994", diff --git a/mteb/models/model_implementations/no_instruct_sentence_models.py 
b/mteb/models/model_implementations/no_instruct_sentence_models.py index e4b3823cf0..eaed149431 100644 --- a/mteb/models/model_implementations/no_instruct_sentence_models.py +++ b/mteb/models/model_implementations/no_instruct_sentence_models.py @@ -97,6 +97,7 @@ def encode( no_instruct_small_v0 = ModelMeta( loader=NoInstructModel, name="avsolatorio/NoInstruct-small-Embedding-v0", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="b38747000553d8268915c95a55fc87e707c9aadd", diff --git a/mteb/models/model_implementations/nomic_models.py b/mteb/models/model_implementations/nomic_models.py index 98e05a8d78..bf62c46647 100644 --- a/mteb/models/model_implementations/nomic_models.py +++ b/mteb/models/model_implementations/nomic_models.py @@ -199,6 +199,7 @@ def encode( model_prompts=model_prompts, ), name="nomic-ai/nomic-embed-text-v1.5", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="b0753ae76394dd36bcfb912a46018088bca48be0", @@ -227,6 +228,7 @@ def encode( model_prompts=model_prompts, ), name="nomic-ai/nomic-embed-text-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="0759316f275aa0cb93a5b830973843ca66babcf5", @@ -255,6 +257,7 @@ def encode( model_prompts=model_prompts, ), name="nomic-ai/nomic-embed-text-v1-ablated", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="7d948905c5d5d3874fa55a925d68e49dbf411e5f", @@ -282,6 +285,7 @@ def encode( model_prompts=model_prompts, ), name="nomic-ai/nomic-embed-text-v1-unsupervised", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="b53d557b15ae63852847c222d336c1609eced93c", @@ -309,6 +313,7 @@ def encode( model_prompts=model_prompts, ), name="nomic-ai/modernbert-embed-base", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="5960f1566fb7cb1adf1eb6e816639cf4646d9b12", @@ -439,6 +444,7 @@ def encode( model_prompts=model_prompts, ), 
name="nomic-ai/nomic-embed-text-v2-moe", + model_type=["dense"], languages=m_languages, open_weights=True, revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85", diff --git a/mteb/models/model_implementations/nomic_models_vision.py b/mteb/models/model_implementations/nomic_models_vision.py index 730b6ca397..c19dcb3e5a 100644 --- a/mteb/models/model_implementations/nomic_models_vision.py +++ b/mteb/models/model_implementations/nomic_models_vision.py @@ -168,6 +168,7 @@ def encode( "text_model_revision": "a03db6748c80237063eb0546ac6b627eca2318cb", }, name="nomic-ai/nomic-embed-vision-v1.5", + model_type=["dense"], languages=["eng-Latn"], revision="af2246fffdab78d8458418480e4886a8e48b70a7", release_date="2024-06-08", diff --git a/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py b/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py index 72c31a9253..388d0cfc04 100644 --- a/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +++ b/mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py @@ -144,6 +144,7 @@ def encode( trust_remote_code=True, ), name="nvidia/llama-nemoretriever-colembed-1b-v1", + model_type=["late-interaction"], languages=["eng-Latn"], revision="1f0fdea7f5b19532a750be109b19072d719b8177", release_date="2025-06-27", @@ -170,6 +171,7 @@ def encode( trust_remote_code=True, ), name="nvidia/llama-nemoretriever-colembed-3b-v1", + model_type=["late-interaction"], languages=["eng-Latn"], revision="50c36f4d5271c6851aa08bd26d69f6e7ca8b870c", release_date="2025-06-27", diff --git a/mteb/models/model_implementations/nvidia_models.py b/mteb/models/model_implementations/nvidia_models.py index b7c232e791..91a7c2d904 100644 --- a/mteb/models/model_implementations/nvidia_models.py +++ b/mteb/models/model_implementations/nvidia_models.py @@ -111,6 +111,7 @@ def instruction_template( add_eos_token=True, ), name="nvidia/NV-Embed-v2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, 
revision="7604d305b621f14095a1aa23d351674c2859553a", @@ -141,6 +142,7 @@ def instruction_template( add_eos_token=True, ), name="nvidia/NV-Embed-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c", @@ -528,6 +530,7 @@ def _extract_embeddings( llama_embed_nemotron_8b = ModelMeta( loader=LlamaEmbedNemotron, name="nvidia/llama-embed-nemotron-8b", + model_type=["dense"], languages=llama_embed_nemotron_evaluated_languages, open_weights=True, revision="84a375593d27d3528beb4e104822515659e093b4", diff --git a/mteb/models/model_implementations/openai_models.py b/mteb/models/model_implementations/openai_models.py index 53781c4c6f..fc10683921 100644 --- a/mteb/models/model_implementations/openai_models.py +++ b/mteb/models/model_implementations/openai_models.py @@ -167,6 +167,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: text_embedding_3_small = ModelMeta( name="openai/text-embedding-3-small", + model_type=["dense"], revision="3", release_date="2024-01-25", languages=None, # supported languages not specified @@ -191,6 +192,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: ) text_embedding_3_large = ModelMeta( name="openai/text-embedding-3-large", + model_type=["dense"], revision="3", release_date="2024-01-25", languages=None, # supported languages not specified @@ -215,6 +217,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: ) text_embedding_ada_002 = ModelMeta( name="openai/text-embedding-ada-002", + model_type=["dense"], revision="3", release_date="2022-12-15", languages=None, # supported languages not specified @@ -240,6 +243,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: text_embedding_3_small_512 = ModelMeta( name="openai/text-embedding-3-small (embed_dim=512)", + model_type=["dense"], revision="3", release_date="2024-01-25", languages=None, # supported languages not specified @@ -266,6 +270,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: 
text_embedding_3_large_512 = ModelMeta( name="openai/text-embedding-3-large (embed_dim=512)", + model_type=["dense"], revision="3", release_date="2024-01-25", languages=None, # supported languages not specified diff --git a/mteb/models/model_implementations/openclip_models.py b/mteb/models/model_implementations/openclip_models.py index 1fa695b3cc..6c05bb7457 100644 --- a/mteb/models/model_implementations/openclip_models.py +++ b/mteb/models/model_implementations/openclip_models.py @@ -122,6 +122,7 @@ def encode( CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta( loader=openclip_loader, # type: ignore name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K", + model_type=["dense"], languages=["eng-Latn"], revision="84c9828e63dc9a9351d1fe637c346d4c1c4db341", release_date="2023-04-26", @@ -147,6 +148,7 @@ def encode( CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta( loader=openclip_loader, # type: ignore name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K", + model_type=["dense"], languages=["eng-Latn"], revision="f0e2ffa09cbadab3db6a261ec1ec56407ce42912", release_date="2023-04-26", @@ -172,6 +174,7 @@ def encode( CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta( loader=openclip_loader, # type: ignore name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K", + model_type=["dense"], languages=["eng-Latn"], revision="d110532e8d4ff91c574ee60a342323f28468b287", release_date="2023-04-26", @@ -197,6 +200,7 @@ def encode( CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta( loader=openclip_loader, # type: ignore name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", + model_type=["dense"], languages=["eng-Latn"], revision="bc7788f151930d91b58474715fdce5524ad9a189", release_date="2023-01-23", @@ -222,6 +226,7 @@ def encode( CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta( loader=openclip_loader, # type: ignore name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K", + model_type=["dense"], languages=["eng-Latn"], revision="15efd0f6ac0c40c0f9da7becca03c974d7012604", release_date="2023-03-06", @@ -247,6 +252,7 @@ def encode( 
CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta( loader=openclip_loader, # type: ignore name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K", + model_type=["dense"], languages=["eng-Latn"], revision="de081ac0a0ca8dc9d1533eed1ae884bb8ae1404b", release_date="2022-09-15", @@ -272,6 +278,7 @@ def encode( CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta( loader=openclip_loader, # type: ignore name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K", + model_type=["dense"], languages=["eng-Latn"], revision="1627032197142fbe2a7cfec626f4ced3ae60d07a", release_date="2022-09-15", @@ -297,6 +304,7 @@ def encode( CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta( loader=openclip_loader, name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K", + model_type=["dense"], languages=["eng-Latn"], revision="08f73555f1b2fb7c82058aebbd492887a94968ef", release_date="2022-09-15", diff --git a/mteb/models/model_implementations/opensearch_neural_sparse_models.py b/mteb/models/model_implementations/opensearch_neural_sparse_models.py index 47d5b33fe6..02a20a1617 100644 --- a/mteb/models/model_implementations/opensearch_neural_sparse_models.py +++ b/mteb/models/model_implementations/opensearch_neural_sparse_models.py @@ -128,6 +128,7 @@ def encode( opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta( name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="a8abaa916125ee512a7a8f4d706d07eb0128a8e6", @@ -153,6 +154,7 @@ def encode( opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta( name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="babf71f3c48695e2e53a978208e8aba48335e3c0", @@ -174,6 +176,7 @@ def encode( opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta( name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, 
revision="8921a26c78b8559d6604eb1f5c0b74c079bee38f", @@ -196,6 +199,7 @@ def encode( opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta( name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="4af867a426867dfdd744097531046f4289a32fdd", @@ -217,6 +221,7 @@ def encode( opensearch_neural_sparse_encoding_doc_v1 = ModelMeta( name="opensearch-project/opensearch-neural-sparse-encoding-doc-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="98cdcbd72867c547f72f2b7b7bed9cdf9f09922d", diff --git a/mteb/models/model_implementations/ops_moa_models.py b/mteb/models/model_implementations/ops_moa_models.py index 5fb56a51b4..2aff73fa2c 100644 --- a/mteb/models/model_implementations/ops_moa_models.py +++ b/mteb/models/model_implementations/ops_moa_models.py @@ -22,6 +22,7 @@ def encode(self, sentences: list[str], **kwargs) -> np.ndarray: ops_moa_conan_embedding = ModelMeta( name="OpenSearch-AI/Ops-MoA-Conan-embedding-v1", + model_type=["dense"], revision="46dcd58753f3daa920c66f89e47086a534089350", release_date="2025-03-26", languages=["zho-Hans"], @@ -53,6 +54,7 @@ def encode(self, sentences: list[str], **kwargs) -> np.ndarray: ops_moa_yuan_embedding = ModelMeta( name="OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0", + model_type=["dense"], revision="23712d0766417b0eb88a2513c6e212a58b543268", release_date="2025-03-26", languages=["zho-Hans"], diff --git a/mteb/models/model_implementations/pawan_models.py b/mteb/models/model_implementations/pawan_models.py index fb6b1468d1..d6c2e16337 100644 --- a/mteb/models/model_implementations/pawan_models.py +++ b/mteb/models/model_implementations/pawan_models.py @@ -14,6 +14,7 @@ pawan_embd_68m = ModelMeta( loader=sentence_transformers_loader, name="dmedhi/PawanEmbd-68M", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="32f295145802bdbd65699ad65fd27d2a5b69a909", diff --git 
a/mteb/models/model_implementations/piccolo_models.py b/mteb/models/model_implementations/piccolo_models.py index 569b0bd701..9e9f3e80f3 100644 --- a/mteb/models/model_implementations/piccolo_models.py +++ b/mteb/models/model_implementations/piccolo_models.py @@ -6,6 +6,7 @@ piccolo_base_zh = ModelMeta( loader=sentence_transformers_loader, name="sensenova/piccolo-base-zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="47c0a63b8f667c3482e05b2fd45577bb19252196", @@ -29,6 +30,7 @@ piccolo_large_zh_v2 = ModelMeta( loader=sentence_transformers_loader, name="sensenova/piccolo-large-zh-v2", + model_type=["dense"], languages=["zho-Hans"], open_weights=False, # They "temporarily" removed it in may last year # "Due to certain internal company considerations" diff --git a/mteb/models/model_implementations/promptriever_models.py b/mteb/models/model_implementations/promptriever_models.py index 65af1a41c8..8bb35120f7 100644 --- a/mteb/models/model_implementations/promptriever_models.py +++ b/mteb/models/model_implementations/promptriever_models.py @@ -75,6 +75,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: model_prompts=model_prompts, ), name="samaya-ai/promptriever-llama2-7b-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision @@ -106,6 +107,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: model_prompts=model_prompts, ), name="samaya-ai/promptriever-llama3.1-8b-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="48d6d0fc4e02fb1269b36940650a1b7233035cbb-2ead22cfb1b0e0c519c371c63c2ab90ffc511b8a", # base-peft revision @@ -138,6 +140,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: model_prompts=model_prompts, ), name="samaya-ai/promptriever-llama3.1-8b-instruct-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, 
revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision @@ -170,6 +173,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: model_prompts=model_prompts, ), name="samaya-ai/promptriever-mistral-v0.1-7b-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5", # base-peft revision diff --git a/mteb/models/model_implementations/pylate_models.py b/mteb/models/model_implementations/pylate_models.py index b0b15209f0..3fd5421d31 100644 --- a/mteb/models/model_implementations/pylate_models.py +++ b/mteb/models/model_implementations/pylate_models.py @@ -337,6 +337,7 @@ def encode( colbert_v2 = ModelMeta( loader=MultiVectorModel, name="colbert-ir/colbertv2.0", + model_type=["late-interaction"], languages=["eng-Latn"], open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", @@ -369,6 +370,7 @@ def encode( trust_remote_code=True, ), name="jinaai/jina-colbert-v2", + model_type=["late-interaction"], languages=[ "ara-Arab", "ben-Beng", @@ -421,6 +423,7 @@ def encode( lightonai__gte_moderncolbert_v1 = ModelMeta( loader=MultiVectorModel, name="lightonai/GTE-ModernColBERT-v1", + model_type=["late-interaction"], languages=[ "eng-Latn", ], diff --git a/mteb/models/model_implementations/qodo_models.py b/mteb/models/model_implementations/qodo_models.py index 183bb53606..fb39ac18b9 100644 --- a/mteb/models/model_implementations/qodo_models.py +++ b/mteb/models/model_implementations/qodo_models.py @@ -30,6 +30,7 @@ Qodo_Embed_1_1_5B = ModelMeta( loader=sentence_transformers_loader, name="Qodo/Qodo-Embed-1-1.5B", + model_type=["dense"], languages=qodo_languages, open_weights=True, revision="84bbef079b32e8823ec226d4e9e92902706b0eb6", @@ -52,6 +53,7 @@ Qodo_Embed_1_7B = ModelMeta( loader=sentence_transformers_loader, name="Qodo/Qodo-Embed-1-7B", + model_type=["dense"], languages=qodo_languages, 
open_weights=True, revision="f9edd9bf7f687c0e832424058e265120f603cd81", diff --git a/mteb/models/model_implementations/qtack_models.py b/mteb/models/model_implementations/qtack_models.py index fdb1e4adf1..6ad8b5f110 100644 --- a/mteb/models/model_implementations/qtack_models.py +++ b/mteb/models/model_implementations/qtack_models.py @@ -25,6 +25,7 @@ mini_gte = ModelMeta( loader=sentence_transformers_loader, name="prdev/mini-gte", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="7fbe6f9b4cc42615e0747299f837ad7769025492", diff --git a/mteb/models/model_implementations/qwen3_models.py b/mteb/models/model_implementations/qwen3_models.py index 91802ac965..929b94993f 100644 --- a/mteb/models/model_implementations/qwen3_models.py +++ b/mteb/models/model_implementations/qwen3_models.py @@ -134,6 +134,7 @@ def q3e_instruct_loader( Qwen3_Embedding_0B6 = ModelMeta( loader=q3e_instruct_loader, name="Qwen/Qwen3-Embedding-0.6B", + model_type=["dense"], languages=multilingual_langs, open_weights=True, revision="b22da495047858cce924d27d76261e96be6febc0", # Commit of @tomaarsen @@ -156,6 +157,7 @@ def q3e_instruct_loader( Qwen3_Embedding_4B = ModelMeta( loader=q3e_instruct_loader, name="Qwen/Qwen3-Embedding-4B", + model_type=["dense"], languages=multilingual_langs, open_weights=True, revision="636cd9bf47d976946cdbb2b0c3ca0cb2f8eea5ff", # Commit of @tomaarsen @@ -178,6 +180,7 @@ def q3e_instruct_loader( Qwen3_Embedding_8B = ModelMeta( loader=q3e_instruct_loader, name="Qwen/Qwen3-Embedding-8B", + model_type=["dense"], languages=multilingual_langs, open_weights=True, revision="4e423935c619ae4df87b646a3ce949610c66241c", # Commit of @tomaarsen diff --git a/mteb/models/model_implementations/qzhou_models.py b/mteb/models/model_implementations/qzhou_models.py index 65557c9641..4e7c08689c 100644 --- a/mteb/models/model_implementations/qzhou_models.py +++ b/mteb/models/model_implementations/qzhou_models.py @@ -58,6 +58,7 @@ def instruction_template( 
apply_instruction_to_passages=False, ), name="Kingsoft-LLM/QZhou-Embedding", + model_type=["dense"], languages=["eng-Latn", "zho-Hans"], open_weights=True, revision="f1e6c03ee3882e7b9fa5cec91217715272e433b8", @@ -91,6 +92,7 @@ def instruction_template( apply_instruction_to_passages=False, ), name="Kingsoft-LLM/QZhou-Embedding-Zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="0321ccb126413d1e49c5ce908e802b63d35f18e2", diff --git a/mteb/models/model_implementations/random_baseline.py b/mteb/models/model_implementations/random_baseline.py index 46247c4a2e..6ae502844e 100644 --- a/mteb/models/model_implementations/random_baseline.py +++ b/mteb/models/model_implementations/random_baseline.py @@ -189,6 +189,7 @@ def similarity_pairwise( random_encoder_baseline = ModelMeta( loader=RandomEncoderBaseline, # type: ignore name="baseline/random-encoder-baseline", + model_type=["dense"], modalities=["text", "image"], **_common_mock_metadata, ) @@ -233,7 +234,7 @@ def predict( random_cross_encoder_baseline = ModelMeta( loader=RandomCrossEncoderBaseline, # type: ignore name="baseline/random-cross-encoder-baseline", + model_type=["cross-encoder"], modalities=["text", "image"], - is_cross_encoder=True, **_common_mock_metadata, ) diff --git a/mteb/models/model_implementations/rasgaard_models.py b/mteb/models/model_implementations/rasgaard_models.py index 00e84130c4..8e9b237d57 100644 --- a/mteb/models/model_implementations/rasgaard_models.py +++ b/mteb/models/model_implementations/rasgaard_models.py @@ -6,6 +6,7 @@ potion_base_8m = ModelMeta( loader=Model2VecModel, # type: ignore name="rasgaard/m2v-dfm-large", + model_type=["dense"], languages=["dan-Latn"], open_weights=True, revision="387897cfb09992e6d45ea9cd7b28b9fcf119e23a", diff --git a/mteb/models/model_implementations/reasonir_model.py b/mteb/models/model_implementations/reasonir_model.py index 17e1eb5a9a..2f8d6d138f 100644 --- a/mteb/models/model_implementations/reasonir_model.py +++ 
b/mteb/models/model_implementations/reasonir_model.py @@ -44,6 +44,7 @@ def instruction_template( trust_remote_code=True, ), name="ReasonIR/ReasonIR-8B", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="c3d0690370ff4a8c3d3882d8dfa85c43650034fa", diff --git a/mteb/models/model_implementations/repllama_models.py b/mteb/models/model_implementations/repllama_models.py index 6695265756..179f3c4757 100644 --- a/mteb/models/model_implementations/repllama_models.py +++ b/mteb/models/model_implementations/repllama_models.py @@ -162,6 +162,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: model_prompts=model_prompts, ), name="castorini/repllama-v1-7b-lora-passage", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9-6097554dfe6e7d93e92f55010b678bcca1e233a8", # base-peft revision @@ -194,6 +195,7 @@ def loader_inner(**kwargs: Any) -> EncoderProtocol: model_prompts=model_prompts, ), name="samaya-ai/RepLLaMA-reproduced", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9-ad5c1d0938a1e02954bcafb4d811ba2f34052e71", # base-peft revision diff --git a/mteb/models/model_implementations/rerankers_custom.py b/mteb/models/model_implementations/rerankers_custom.py index 1272e35de5..cebcfdc2c6 100644 --- a/mteb/models/model_implementations/rerankers_custom.py +++ b/mteb/models/model_implementations/rerankers_custom.py @@ -219,6 +219,7 @@ def predict( fp_options="float16", ), name="castorini/monobert-large-msmarco", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="0a97706f3827389da43b83348d5d18c9d53876fa", @@ -234,7 +235,6 @@ def predict( use_instructions=None, training_datasets=None, framework=["Sentence Transformers", "PyTorch"], - is_cross_encoder=True, ) # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28 @@ -244,6 +244,7 @@ def predict( 
fp_options="float16", ), name="jinaai/jina-reranker-v2-base-multilingual", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="126747772a932960028d9f4dc93bd5d9c4869be4", @@ -259,7 +260,6 @@ def predict( use_instructions=None, training_datasets=None, framework=["Sentence Transformers", "PyTorch"], - is_cross_encoder=True, ) bge_reranker_v2_m3 = ModelMeta( @@ -268,6 +268,7 @@ def predict( fp_options="float16", ), name="BAAI/bge-reranker-v2-m3", + model_type=["cross-encoder"], languages=[ "eng-Latn", "ara-Arab", @@ -316,7 +317,6 @@ def predict( use_instructions=None, training_datasets=bge_m3_training_data, framework=["Sentence Transformers", "PyTorch"], - is_cross_encoder=True, citation=""" @misc{li2023making, title={Making Large Language Models A Better Foundation For Dense Retrieval}, diff --git a/mteb/models/model_implementations/rerankers_monot5_based.py b/mteb/models/model_implementations/rerankers_monot5_based.py index f51b544714..00a44b8e8a 100644 --- a/mteb/models/model_implementations/rerankers_monot5_based.py +++ b/mteb/models/model_implementations/rerankers_monot5_based.py @@ -315,6 +315,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="castorini/monot5-small-msmarco-10k", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="77f8e3f7b1eb1afe353aa21a7c3a2fc8feca702e", @@ -330,7 +331,6 @@ def get_prediction_tokens(self, *args, **kwargs): use_instructions=None, training_datasets=None, framework=["PyTorch"], - is_cross_encoder=True, citation="""@misc{rosa2022parameterleftbehinddistillation, title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval}, author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira}, @@ -348,6 +348,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="castorini/monot5-base-msmarco-10k", 
+ model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="f15657ab3d2a5dd0b9a30c8c0b6a0a73c9cb5884", @@ -372,7 +373,6 @@ def get_prediction_tokens(self, *args, **kwargs): use_instructions=None, training_datasets=None, framework=["PyTorch"], - is_cross_encoder=True, ) monot5_large = ModelMeta( @@ -381,6 +381,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="castorini/monot5-large-msmarco-10k", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="48cfad1d8dd587670393f27ee8ec41fde63e3d98", @@ -396,7 +397,6 @@ def get_prediction_tokens(self, *args, **kwargs): use_instructions=None, training_datasets=None, framework=["PyTorch"], - is_cross_encoder=True, citation="""@misc{rosa2022parameterleftbehinddistillation, title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval}, author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira}, @@ -414,6 +414,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="castorini/monot5-3b-msmarco-10k", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="bc0c419a438c81f592f878ce32430a1823f5db6c", @@ -429,7 +430,6 @@ def get_prediction_tokens(self, *args, **kwargs): use_instructions=None, training_datasets=None, framework=["PyTorch"], - is_cross_encoder=True, citation="""@misc{rosa2022parameterleftbehinddistillation, title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval}, author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira}, @@ -447,6 +447,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="google/flan-t5-base", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, 
revision="7bcac572ce56db69c1ea7c8af255c5d7c9672fc2", @@ -484,7 +485,6 @@ def get_prediction_tokens(self, *args, **kwargs): similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], - is_cross_encoder=True, ) flant5_large = ModelMeta( @@ -493,6 +493,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="google/flan-t5-large", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="0613663d0d48ea86ba8cb3d7a44f0f65dc596a2a", @@ -530,7 +531,6 @@ def get_prediction_tokens(self, *args, **kwargs): similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], - is_cross_encoder=True, ) flant5_xl = ModelMeta( @@ -539,6 +539,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="google/flan-t5-xl", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="7d6315df2c2fb742f0f5b556879d730926ca9001", @@ -576,7 +577,6 @@ def get_prediction_tokens(self, *args, **kwargs): similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], - is_cross_encoder=True, ) flant5_xxl = ModelMeta( @@ -585,6 +585,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="google/flan-t5-xxl", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="ae7c9136adc7555eeccc78cdd960dfd60fb346ce", @@ -622,7 +623,6 @@ def get_prediction_tokens(self, *args, **kwargs): similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], - is_cross_encoder=True, ) @@ -632,6 +632,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="meta-llama/Llama-2-7b-hf", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9", @@ -656,7 +657,6 @@ def get_prediction_tokens(self, *args, **kwargs): primaryClass={cs.CL}, url={https://arxiv.org/abs/2307.09288}, }""", - is_cross_encoder=True, ) llama2_7b_chat = ModelMeta( 
@@ -665,6 +665,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="meta-llama/Llama-2-7b-chat-hf", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="f5db02db724555f92da89c216ac04704f23d4590", @@ -689,7 +690,6 @@ def get_prediction_tokens(self, *args, **kwargs): use_instructions=None, training_datasets=None, framework=["PyTorch"], - is_cross_encoder=True, ) mistral_7b = ModelMeta( @@ -698,6 +698,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="mistralai/Mistral-7B-Instruct-v0.2", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="3ad372fc79158a2148299e3318516c786aeded6c", @@ -722,7 +723,6 @@ def get_prediction_tokens(self, *args, **kwargs): primaryClass={cs.CL}, url={https://arxiv.org/abs/2310.06825}, }""", - is_cross_encoder=True, ) followir_7b = ModelMeta( @@ -731,6 +731,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="jhu-clsp/FollowIR-7B", + model_type=["cross-encoder"], languages=["eng-Latn"], open_weights=True, revision="4d25d437e38b510c01852070c0731e8f6e1875d1", @@ -758,7 +759,6 @@ def get_prediction_tokens(self, *args, **kwargs): primaryClass={cs.IR} } """, - is_cross_encoder=True, ) @@ -874,6 +874,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="unicamp-dl/mt5-base-mmarco-v2", + model_type=["cross-encoder"], languages=mt5_languages, open_weights=True, revision="cc0a949b9f21efcaba45c8cabb998ad02ce8d4e7", @@ -898,7 +899,6 @@ def get_prediction_tokens(self, *args, **kwargs): similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], - is_cross_encoder=True, ) mt5_13b_mmarco_100k = ModelMeta( @@ -907,6 +907,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="unicamp-dl/mt5-13b-mmarco-100k", + model_type=["cross-encoder"], languages=mt5_languages, open_weights=True, 
revision="e1a4317e102a525ea9e16745ad21394a4f1bffbc", @@ -922,5 +923,4 @@ def get_prediction_tokens(self, *args, **kwargs): use_instructions=None, training_datasets=None, framework=["PyTorch"], - is_cross_encoder=True, ) diff --git a/mteb/models/model_implementations/richinfoai_models.py b/mteb/models/model_implementations/richinfoai_models.py index fff3f48565..ece1932bcc 100644 --- a/mteb/models/model_implementations/richinfoai_models.py +++ b/mteb/models/model_implementations/richinfoai_models.py @@ -9,6 +9,7 @@ ritrieve_zh_v1 = ModelMeta( loader=SentenceTransformerEncoderWrapper, name="richinfoai/ritrieve_zh_v1", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="f8d5a707656c55705027678e311f9202c8ced12c", diff --git a/mteb/models/model_implementations/ru_sentence_models.py b/mteb/models/model_implementations/ru_sentence_models.py index 904ef7956a..8ada77a8f1 100644 --- a/mteb/models/model_implementations/ru_sentence_models.py +++ b/mteb/models/model_implementations/ru_sentence_models.py @@ -238,6 +238,7 @@ rubert_tiny = ModelMeta( loader=sentence_transformers_loader, name="cointegrated/rubert-tiny", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", @@ -263,6 +264,7 @@ rubert_tiny2 = ModelMeta( loader=sentence_transformers_loader, name="cointegrated/rubert-tiny2", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", @@ -289,6 +291,7 @@ sbert_large_nlu_ru = ModelMeta( loader=sentence_transformers_loader, name="ai-forever/sbert_large_nlu_ru", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="af977d5dfa46a3635e29bf0ef383f2df2a08d47a", @@ -314,6 +317,7 @@ sbert_large_mt_nlu_ru = ModelMeta( loader=sentence_transformers_loader, name="ai-forever/sbert_large_mt_nlu_ru", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, 
revision="05300876c2b83f46d3ddd422a7f17e45cf633bb0", @@ -341,6 +345,7 @@ model_prompts={"query": "query: ", "document": "passage: "}, ), name="deepvk/USER-base", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="436a489a2087d61aa670b3496a9915f84e46c861", @@ -401,6 +406,7 @@ user_bge_m3 = ModelMeta( loader=sentence_transformers_loader, name="deepvk/USER-bge-m3", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="0cc6cfe48e260fb0474c753087a69369e88709ae", @@ -444,6 +450,7 @@ deberta_v1_ru = ModelMeta( loader=sentence_transformers_loader, name="deepvk/deberta-v1-base", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="bdd30b0e19757e6940c92c7aff19e8fc0a60dff4", @@ -474,6 +481,7 @@ rubert_base_cased = ModelMeta( loader=sentence_transformers_loader, name="DeepPavlov/rubert-base-cased", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="4036cab694767a299f2b9e6492909664d9414229", @@ -509,6 +517,7 @@ distilrubert_small_cased_conversational = ModelMeta( loader=sentence_transformers_loader, name="DeepPavlov/distilrubert-small-cased-conversational", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="e348066b4a7279b97138038299bddc6580a9169a", @@ -543,6 +552,7 @@ rubert_base_cased_sentence = ModelMeta( loader=sentence_transformers_loader, name="DeepPavlov/rubert-base-cased-sentence", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="78b5122d6365337dd4114281b0d08cd1edbb3bc8", @@ -567,6 +577,7 @@ labse_en_ru = ModelMeta( loader=sentence_transformers_loader, name="cointegrated/LaBSE-en-ru", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="cf0714e606d4af551e14ad69a7929cd6b0da7f7e", @@ -594,6 +605,7 @@ rubert_tiny_turbo = ModelMeta( loader=sentence_transformers_loader, name="sergeyzh/rubert-tiny-turbo", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, 
revision="8ce0cf757446ce9bb2d5f5a4ac8103c7a1049054", @@ -616,6 +628,7 @@ rubert_mini_frida = ModelMeta( loader=sentence_transformers_loader, name="sergeyzh/rubert-mini-frida", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="19b279b78afd945b5ccae78f63e284909814adc2", @@ -643,6 +656,7 @@ labse_ru_turbo = ModelMeta( loader=sentence_transformers_loader, name="sergeyzh/LaBSE-ru-turbo", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="1940b046c6b5e125df11722b899130329d0a46da", @@ -691,6 +705,7 @@ model_prompts=rosberta_prompts, ), name="ai-forever/ru-en-RoSBERTa", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="89fb1651989adbb1cfcfdedafd7d102951ad0555", @@ -856,6 +871,7 @@ model_prompts=frida_prompts, ), name="ai-forever/FRIDA", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="7292217af9a9e6dbf07048f76b434ad1e2aa8b76", @@ -888,6 +904,7 @@ }, ), name="ai-sage/Giga-Embeddings-instruct", + model_type=["dense"], languages=["eng-Latn", "rus-Cyrl"], open_weights=True, revision="0ad5b29bfecd806cecc9d66b927d828a736594dc", @@ -919,6 +936,7 @@ berta = ModelMeta( loader=sentence_transformers_loader, name="sergeyzh/BERTA", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="914c8c8aed14042ed890fc2c662d5e9e66b2faa7", @@ -991,6 +1009,7 @@ model_prompts=user2_prompts, ), name="deepvk/USER2-small", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="23f65b34cf7632032061f5cc66c14714e6d4cee4", @@ -1016,6 +1035,7 @@ model_prompts=user2_prompts, ), name="deepvk/USER2-base", + model_type=["dense"], languages=["rus-Cyrl"], open_weights=True, revision="0942cf96909b6d52e61f79a01e2d30c7be640b27", diff --git a/mteb/models/model_implementations/ruri_models.py b/mteb/models/model_implementations/ruri_models.py index 1490d397e3..27d2d60e22 100644 --- a/mteb/models/model_implementations/ruri_models.py +++ 
b/mteb/models/model_implementations/ruri_models.py @@ -32,6 +32,7 @@ model_prompts=RURI_V3_PROMPTS, ), name="cl-nagoya/ruri-v3-30m", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="24899e5de370b56d179604a007c0d727bf144504", @@ -62,6 +63,7 @@ model_prompts=RURI_V3_PROMPTS, ), name="cl-nagoya/ruri-v3-70m", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="07a8b0aba47d29d2ca21f89b915c1efe2c23d1cc", @@ -90,6 +92,7 @@ model_prompts=RURI_V3_PROMPTS, ), name="cl-nagoya/ruri-v3-130m", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="e3114c6ee10dbab8b4b235fbc6dcf9dd4d5ac1a6", @@ -118,6 +121,7 @@ model_prompts=RURI_V3_PROMPTS, ), name="cl-nagoya/ruri-v3-310m", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="18b60fb8c2b9df296fb4212bb7d23ef94e579cd3", @@ -147,6 +151,7 @@ trust_remote_code=True, ), name="cl-nagoya/ruri-small-v2", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="db18646e673b713cd0518a5bb0fefdce21e77cd9", @@ -175,6 +180,7 @@ model_prompts=RURI_V1_V2_PROMPTS, ), name="cl-nagoya/ruri-base-v2", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="8ce03882903668a01c83ca3b8111ac025a3bc734", @@ -203,6 +209,7 @@ model_prompts=RURI_V1_V2_PROMPTS, ), name="cl-nagoya/ruri-large-v2", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="42898ef34a5574977380ebf0dfd28cbfbd36438b", @@ -232,6 +239,7 @@ trust_remote_code=True, ), name="cl-nagoya/ruri-small", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="bc56ce90cd7a979f6eb199fc52dfe700bfd94bc3", @@ -260,6 +268,7 @@ model_prompts=RURI_V1_V2_PROMPTS, ), name="cl-nagoya/ruri-base", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="1ae40b8b6c78518a499425086bab8fc16c2e4b0e", @@ -289,6 +298,7 @@ model_prompts=RURI_V1_V2_PROMPTS, ), name="cl-nagoya/ruri-large", + model_type=["dense"], 
languages=["jpn-Jpan"], open_weights=True, revision="a011c39b13e8bc137ee13c6bc82191ece46c414c", diff --git a/mteb/models/model_implementations/salesforce_models.py b/mteb/models/model_implementations/salesforce_models.py index 7a7785b55d..b215cb37e4 100644 --- a/mteb/models/model_implementations/salesforce_models.py +++ b/mteb/models/model_implementations/salesforce_models.py @@ -46,6 +46,7 @@ def instruction_template( normalized=True, ), name="Salesforce/SFR-Embedding-2_R", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="91762139d94ed4371a9fa31db5551272e0b83818", @@ -83,6 +84,7 @@ def instruction_template( normalized=True, ), name="Salesforce/SFR-Embedding-Code-2B_R", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="c73d8631a005876ed5abde34db514b1fb6566973", @@ -120,6 +122,7 @@ def instruction_template( normalized=True, ), name="Salesforce/SFR-Embedding-Mistral", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="938c560d1c236aa563b2dbdf084f28ab28bccb11", diff --git a/mteb/models/model_implementations/samilpwc_models.py b/mteb/models/model_implementations/samilpwc_models.py index 3d425abf33..b5e804a184 100644 --- a/mteb/models/model_implementations/samilpwc_models.py +++ b/mteb/models/model_implementations/samilpwc_models.py @@ -43,6 +43,7 @@ def instruct_loader(*args, **kwargs): apply_instruction_to_passages=False, ), name="SamilPwC-AXNode-GenAI/PwC-Embedding_expr", + model_type=["dense"], languages=[ "kor-Hang", ], diff --git a/mteb/models/model_implementations/sarashina_embedding_models.py b/mteb/models/model_implementations/sarashina_embedding_models.py index acbcaaf8e6..75320cc110 100644 --- a/mteb/models/model_implementations/sarashina_embedding_models.py +++ b/mteb/models/model_implementations/sarashina_embedding_models.py @@ -118,6 +118,7 @@ def sarashina_instruction_template( max_seq_length=8192, ), name="sbintuitions/sarashina-embedding-v2-1b", + model_type=["dense"], 
languages=["jpn-Jpan"], open_weights=True, revision="1f3408afaa7b617e3445d891310a9c26dd0c68a5", @@ -143,6 +144,7 @@ def sarashina_instruction_template( sbintuitions_sarashina_embedding_v1_1b = ModelMeta( loader=sentence_transformers_loader, name="sbintuitions/sarashina-embedding-v1-1b", + model_type=["dense"], languages=["jpn-Jpan"], open_weights=True, revision="d060fcd8984075071e7fad81baff035cbb3b6c7e", diff --git a/mteb/models/model_implementations/searchmap_models.py b/mteb/models/model_implementations/searchmap_models.py index c3b1060532..782f24c57b 100644 --- a/mteb/models/model_implementations/searchmap_models.py +++ b/mteb/models/model_implementations/searchmap_models.py @@ -20,6 +20,7 @@ "model_prompts": task_instructions, }, name="VPLabs/SearchMap_Preview", + model_type=["dense"], revision="69de17ef48278ed08ba1a4e65ead8179912b696e", languages=["eng-Latn"], open_weights=True, diff --git a/mteb/models/model_implementations/seed_1_6_embedding_models.py b/mteb/models/model_implementations/seed_1_6_embedding_models.py index 25d4970245..5b56594f4a 100644 --- a/mteb/models/model_implementations/seed_1_6_embedding_models.py +++ b/mteb/models/model_implementations/seed_1_6_embedding_models.py @@ -413,6 +413,7 @@ def encode( seed_embedding = ModelMeta( name="Bytedance/Seed1.6-embedding", + model_type=["dense"], revision="1", release_date="2025-06-18", languages=[ diff --git a/mteb/models/model_implementations/seed_models.py b/mteb/models/model_implementations/seed_models.py index c01bd658bd..7d7b1e0125 100644 --- a/mteb/models/model_implementations/seed_models.py +++ b/mteb/models/model_implementations/seed_models.py @@ -236,6 +236,7 @@ def encode( seed_embedding = ModelMeta( name="ByteDance-Seed/Seed1.5-Embedding", + model_type=["dense"], revision="4", release_date="2025-04-25", languages=[ diff --git a/mteb/models/model_implementations/sentence_transformers_models.py b/mteb/models/model_implementations/sentence_transformers_models.py index 890c2599c6..29cc827455 
100644 --- a/mteb/models/model_implementations/sentence_transformers_models.py +++ b/mteb/models/model_implementations/sentence_transformers_models.py @@ -113,6 +113,7 @@ all_minilm_l6_v2 = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/all-MiniLM-L6-v2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="8b3219a92973c328a8e22fadcfa821b5dc75636a", @@ -137,6 +138,7 @@ all_minilm_l12_v2 = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/all-MiniLM-L12-v2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", @@ -161,6 +163,7 @@ paraphrase_multilingual_minilm_l12_v2 = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + model_type=["dense"], languages=paraphrase_langs, open_weights=True, revision="bf3bf13ab40c3157080a7ab344c831b9ad18b5eb", @@ -185,6 +188,7 @@ paraphrase_multilingual_mpnet_base_v2 = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", + model_type=["dense"], languages=paraphrase_langs, open_weights=True, revision="79f2382ceacceacdf38563d7c5d16b9ff8d725d6", @@ -220,6 +224,7 @@ labse = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/LaBSE", + model_type=["dense"], languages=paraphrase_langs, open_weights=True, revision="e34fab64a3011d2176c99545a93d5cbddc9a91b7", @@ -257,6 +262,7 @@ multi_qa_minilm_l6_cos_v1 = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="b207367332321f8e44f96e224ef15bc607f4dbf0", @@ -281,6 +287,7 @@ all_mpnet_base_v2 = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/all-mpnet-base-v2", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, 
revision="9a3225965996d404b775526de6dbfe85d3368642", @@ -380,6 +387,7 @@ static_similarity_mrl_multilingual_v1 = ModelMeta( name="sentence-transformers/static-similarity-mrl-multilingual-v1", + model_type=["dense"], loader=SentenceTransformerEncoderWrapper, loader_kwargs=dict( device="cpu", # CPU is just as quick, if not quicker @@ -407,6 +415,7 @@ contriever = ModelMeta( loader=SentenceTransformerEncoderWrapper, name="facebook/contriever-msmarco", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="abe8c1493371369031bcb1e02acb754cf4e162fa", @@ -436,6 +445,7 @@ microllama_text_embedding = ModelMeta( loader=sentence_transformers_loader, name="keeeeenw/MicroLlama-text-embedding", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="98f70f14cdf12d7ea217ed2fd4e808b0195f1e7e", @@ -470,6 +480,7 @@ sentence_t5_base = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/sentence-t5-base", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="50c53e206f8b01c9621484a3c0aafce4e55efebf", @@ -491,6 +502,7 @@ sentence_t5_large = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/sentence-t5-large", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="1fc08ea477205aa54a3e5b13f0971ae16b86410a", @@ -512,6 +524,7 @@ sentence_t5_xl = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/sentence-t5-xl", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="2965d31b368fb14117688e0bde77cbd720e91f53", @@ -533,6 +546,7 @@ sentence_t5_xxl = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/sentence-t5-xxl", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="4d122282ba80e807e9e6eb8c358269e92796365d", @@ -553,6 +567,7 @@ gtr_t5_large = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/gtr-t5-large", + model_type=["dense"], 
languages=["eng-Latn"], # in format eng-Latn open_weights=True, revision="a2c8ac47f998531948d4cbe32a0b577a7037a5e3", @@ -586,6 +601,7 @@ gtr_t5_xl = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/gtr-t5-xl", + model_type=["dense"], languages=["eng-Latn"], # in format eng-Latn open_weights=True, revision="23a8d667a1ad2578af181ce762867003c498d1bf", @@ -618,6 +634,7 @@ gtr_t5_xxl = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/gtr-t5-xxl", + model_type=["dense"], languages=["eng-Latn"], # in format eng-Latn open_weights=True, revision="73f2a9156a3dcc2194dfdb2bf201cd7d17e17884", @@ -651,6 +668,7 @@ gtr_t5_base = ModelMeta( loader=sentence_transformers_loader, name="sentence-transformers/gtr-t5-base", + model_type=["dense"], languages=["eng-Latn"], # in format eng-Latn open_weights=True, revision="7027e9594267928589816394bdd295273ddc0739", diff --git a/mteb/models/model_implementations/shuu_model.py b/mteb/models/model_implementations/shuu_model.py index dfc2f9891f..bc3f130f55 100644 --- a/mteb/models/model_implementations/shuu_model.py +++ b/mteb/models/model_implementations/shuu_model.py @@ -1,31 +1,32 @@ -from mteb.models.model_meta import ModelMeta -from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader - -codemodernbert_crow_meta = ModelMeta( - loader=sentence_transformers_loader, - name="Shuu12121/CodeSearch-ModernBERT-Crow-Plus", - languages=["eng-Latn"], - open_weights=True, - revision="044a7a4b552f86e284817234c336bccf16f895ce", - release_date="2025-04-21", - n_parameters=151668480, - memory_usage_mb=607, - embed_dim=768, - license="apache-2.0", - max_tokens=1024, - reference="https://huggingface.co/Shuu12121/CodeSearch-ModernBERT-Crow-Plus", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets={ - "CodeSearchNetRetrieval", - # 
"code-search-net/code_search_net", - # "Shuu12121/python-codesearch-filtered", - # "Shuu12121/java-codesearch-filtered", - # "Shuu12121/javascript-codesearch-filtered", - # "Shuu12121/ruby-codesearch-filtered", - # "Shuu12121/rust-codesearch-filtered", - }, -) +from mteb.models.model_meta import ModelMeta +from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader + +codemodernbert_crow_meta = ModelMeta( + loader=sentence_transformers_loader, + name="Shuu12121/CodeSearch-ModernBERT-Crow-Plus", + model_type=["dense"], + languages=["eng-Latn"], + open_weights=True, + revision="044a7a4b552f86e284817234c336bccf16f895ce", + release_date="2025-04-21", + n_parameters=151668480, + memory_usage_mb=607, + embed_dim=768, + license="apache-2.0", + max_tokens=1024, + reference="https://huggingface.co/Shuu12121/CodeSearch-ModernBERT-Crow-Plus", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + "CodeSearchNetRetrieval", + # "code-search-net/code_search_net", + # "Shuu12121/python-codesearch-filtered", + # "Shuu12121/java-codesearch-filtered", + # "Shuu12121/javascript-codesearch-filtered", + # "Shuu12121/ruby-codesearch-filtered", + # "Shuu12121/rust-codesearch-filtered", + }, +) diff --git a/mteb/models/model_implementations/siglip_models.py b/mteb/models/model_implementations/siglip_models.py index 82b716ce9e..b295ba6bd8 100644 --- a/mteb/models/model_implementations/siglip_models.py +++ b/mteb/models/model_implementations/siglip_models.py @@ -125,6 +125,7 @@ def encode( siglip_so400m_patch14_224 = ModelMeta( loader=SiglipModelWrapper, # type: ignore name="google/siglip-so400m-patch14-224", + model_type=["dense"], languages=["eng-Latn"], revision="d04cf29fca7b6374f74d8bea1969314492266b5e", release_date="2024-01-08", @@ -148,6 +149,7 @@ def encode( siglip_so400m_patch14_384 = ModelMeta( loader=SiglipModelWrapper, # 
type: ignore name="google/siglip-so400m-patch14-384", + model_type=["dense"], languages=["eng-Latn"], revision="9fdffc58afc957d1a03a25b10dba0329ab15c2a3", release_date="2024-01-08", @@ -171,6 +173,7 @@ def encode( siglip_so400m_patch16_256_i18n = ModelMeta( loader=SiglipModelWrapper, # type: ignore name="google/siglip-so400m-patch16-256-i18n", + model_type=["dense"], languages=["eng-Latn"], revision="365d321c0cfdea96bc28e3a29787a11a062681a1", release_date="2024-01-08", @@ -194,6 +197,7 @@ def encode( siglip_base_patch16_256_multilingual = ModelMeta( loader=SiglipModelWrapper, # type: ignore name="google/siglip-base-patch16-256-multilingual", + model_type=["dense"], languages=["eng-Latn"], revision="8952a4eafcde3cb7ab46b1dd629b33f8784ca9c6", release_date="2024-01-08", @@ -217,6 +221,7 @@ def encode( siglip_base_patch16_256 = ModelMeta( loader=SiglipModelWrapper, # type: ignore name="google/siglip-base-patch16-256", + model_type=["dense"], languages=["eng-Latn"], revision="b078df89e446d623010d890864d4207fe6399f61", release_date="2024-01-08", @@ -240,6 +245,7 @@ def encode( siglip_base_patch16_512 = ModelMeta( loader=SiglipModelWrapper, # type: ignore name="google/siglip-base-patch16-512", + model_type=["dense"], languages=["eng-Latn"], revision="753a949581523b60257d93e18391e8c27f72eb22", release_date="2024-01-08", @@ -263,6 +269,7 @@ def encode( siglip_base_patch16_384 = ModelMeta( loader=SiglipModelWrapper, # type: ignore name="google/siglip-base-patch16-384", + model_type=["dense"], languages=["eng-Latn"], revision="41aec1c83b32e0a6fca20ad88ba058aa5b5ea394", release_date="2024-01-08", @@ -286,6 +293,7 @@ def encode( siglip_base_patch16_224 = ModelMeta( loader=SiglipModelWrapper, # type: ignore name="google/siglip-base-patch16-224", + model_type=["dense"], languages=["eng-Latn"], revision="7fd15f0689c79d79e38b1c2e2e2370a7bf2761ed", release_date="2024-01-08", @@ -309,6 +317,7 @@ def encode( siglip_large_patch16_256 = ModelMeta( loader=SiglipModelWrapper, # type: 
ignore name="google/siglip-large-patch16-256", + model_type=["dense"], languages=["eng-Latn"], revision="d0da9f876e7d66b4e250cd2450c3ba2ce735e447", release_date="2024-01-08", @@ -332,6 +341,7 @@ def encode( siglip_large_patch16_384 = ModelMeta( loader=SiglipModelWrapper, # type: ignore name="google/siglip-large-patch16-384", + model_type=["dense"], languages=["eng-Latn"], revision="ce005573a40965dfd21fd937fbdeeebf2439fc35", release_date="2024-01-08", diff --git a/mteb/models/model_implementations/sonar_models.py b/mteb/models/model_implementations/sonar_models.py index f67cdc1f12..09478ea7c3 100644 --- a/mteb/models/model_implementations/sonar_models.py +++ b/mteb/models/model_implementations/sonar_models.py @@ -218,6 +218,7 @@ sonar = ModelMeta( loader=None, name="facebook/SONAR", + model_type=["dense"], languages=sonar_langs, open_weights=True, use_instructions=False, # it does take a language code as input diff --git a/mteb/models/model_implementations/spartan8806_atles_champion.py b/mteb/models/model_implementations/spartan8806_atles_champion.py index 28b18758a7..2c29d31329 100644 --- a/mteb/models/model_implementations/spartan8806_atles_champion.py +++ b/mteb/models/model_implementations/spartan8806_atles_champion.py @@ -6,6 +6,7 @@ spartan8806_atles_champion_embedding = ModelMeta( loader=sentence_transformers_loader, name="spartan8806/atles-champion-embedding", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="d4c74d7000bbd25f3597fc0f2dcde59ef1386e8f", diff --git a/mteb/models/model_implementations/stella_models.py b/mteb/models/model_implementations/stella_models.py index 90715c98d0..0df08d836a 100644 --- a/mteb/models/model_implementations/stella_models.py +++ b/mteb/models/model_implementations/stella_models.py @@ -59,6 +59,7 @@ torch_dtype="auto", ), name="NovaSearch/stella_en_400M_v5", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, use_instructions=True, @@ -87,6 +88,7 @@ torch_dtype="auto", ), 
name="NovaSearch/stella_en_1.5B_v5", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, use_instructions=True, @@ -109,6 +111,7 @@ stella_large_zh_v3_1792d = ModelMeta( loader=sentence_transformers_loader, name="dunzhang/stella-large-zh-v3-1792d", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="d5d39eb8cd11c80a63df53314e59997074469f09", @@ -135,6 +138,7 @@ stella_base_zh_v3_1792d = ModelMeta( loader=sentence_transformers_loader, name="infgrad/stella-base-zh-v3-1792d", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="82254892a0fba125aa2abf3a4800d2dd12821343", @@ -162,6 +166,7 @@ stella_mrl_large_zh_v3_5_1792d = ModelMeta( loader=sentence_transformers_loader, name="dunzhang/stella-mrl-large-zh-v3.5-1792d", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe", @@ -185,6 +190,7 @@ zpoint_large_embedding_zh = ModelMeta( loader=sentence_transformers_loader, name="iampanda/zpoint_large_embedding_zh", + model_type=["dense"], languages=["zho-Hans"], open_weights=True, revision="b1075144f440ab4409c05622c1179130ebd57d03", diff --git a/mteb/models/model_implementations/tarka_models.py b/mteb/models/model_implementations/tarka_models.py index 4c584b2625..bee50dfc2d 100644 --- a/mteb/models/model_implementations/tarka_models.py +++ b/mteb/models/model_implementations/tarka_models.py @@ -321,6 +321,7 @@ tarka_embedding_150m_v1 = ModelMeta( loader=gemma_embedding_loader, name="Tarka-AIR/Tarka-Embedding-150M-V1", + model_type=["dense"], languages=MULTILINGUAL_EVALUATED_LANGUAGES, open_weights=True, revision="b0ffecc4ef0d873e517507ed080e43b88b2704b9", @@ -354,6 +355,7 @@ loader=InstructSentenceTransformerModel, loader_kwargs=tark_embedding_350_v1_kwargs, name="Tarka-AIR/Tarka-Embedding-350M-V1", + model_type=["dense"], languages=MULTILINGUAL_EVALUATED_LANGUAGES, open_weights=True, revision="a850d6a329145474727424fed6b12b62096b8ba3", diff --git 
a/mteb/models/model_implementations/ua_sentence_models.py b/mteb/models/model_implementations/ua_sentence_models.py index bb617813b6..63921a96cc 100644 --- a/mteb/models/model_implementations/ua_sentence_models.py +++ b/mteb/models/model_implementations/ua_sentence_models.py @@ -5,6 +5,7 @@ xlm_roberta_ua_distilled = ModelMeta( name="panalexeu/xlm-roberta-ua-distilled", + model_type=["dense"], loader=sentence_transformers_loader, n_parameters=278_000_000, memory_usage_mb=1061, diff --git a/mteb/models/model_implementations/uae_models.py b/mteb/models/model_implementations/uae_models.py index ebc5358e90..7f9d83f4dd 100644 --- a/mteb/models/model_implementations/uae_models.py +++ b/mteb/models/model_implementations/uae_models.py @@ -61,6 +61,7 @@ def encode( }, ), name="WhereIsAI/UAE-Large-V1", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="369c368f70f16a613f19f5598d4f12d9f44235d4", diff --git a/mteb/models/model_implementations/vdr_models.py b/mteb/models/model_implementations/vdr_models.py index 2b30bca9be..0b63abcc8d 100644 --- a/mteb/models/model_implementations/vdr_models.py +++ b/mteb/models/model_implementations/vdr_models.py @@ -25,6 +25,7 @@ def instruction_template( apply_instruction_to_passages=True, ), name="llamaindex/vdr-2b-multi-v1", + model_type=["dense"], languages=vdr_languages, open_weights=True, revision="2c4e54c8db4071cc61fc3c62f4490124e40c37db", diff --git a/mteb/models/model_implementations/vi_vn_models.py b/mteb/models/model_implementations/vi_vn_models.py index 31cfbdec37..8ceb595589 100644 --- a/mteb/models/model_implementations/vi_vn_models.py +++ b/mteb/models/model_implementations/vi_vn_models.py @@ -7,6 +7,7 @@ greennode_embedding_large_vn_v1 = ModelMeta( name="GreenNode/GreenNode-Embedding-Large-VN-V1", + model_type=["dense"], revision="660def1f6e1c8ecdf39f6f9c95829e3cf0cef837", release_date="2024-04-11", languages=[ @@ -31,6 +32,7 @@ greennode_embedding_large_vn_mixed_v1 = ModelMeta( 
name="GreenNode/GreenNode-Embedding-Large-VN-Mixed-V1", + model_type=["dense"], revision="1d3dddb3862292dab4bd3eddf0664c0335ad5843", release_date="2024-04-11", languages=[ @@ -55,6 +57,7 @@ aiteamvn_vietnamese_embeddings = ModelMeta( name="AITeamVN/Vietnamese_Embedding", + model_type=["dense"], revision="fcbbb905e6c3757d421aaa5db6fd7c53d038f6fb", release_date="2024-03-17", languages=[ @@ -79,6 +82,7 @@ hiieu_halong_embedding = ModelMeta( name="hiieu/halong_embedding", + model_type=["dense"], revision="b57776031035f70ed2030d2e35ecc533eb0f8f71", release_date="2024-07-06", languages=[ @@ -103,6 +107,7 @@ sup_simcse_vietnamese_phobert_base_ = ModelMeta( name="VoVanPhuc/sup-SimCSE-VietNamese-phobert-base", + model_type=["dense"], revision="608779b86741a8acd8c8d38132974ff04086b138", release_date="2021-05-26", languages=[ @@ -126,6 +131,7 @@ bkai_foundation_models_vietnamese_bi_encoder = ModelMeta( name="bkai-foundation-models/vietnamese-bi-encoder", + model_type=["dense"], revision="84f9d9ada0d1a3c37557398b9ae9fcedcdf40be0", release_date="2023-09-09", languages=[ diff --git a/mteb/models/model_implementations/vista_models.py b/mteb/models/model_implementations/vista_models.py index 645f0467e8..4cec94c611 100644 --- a/mteb/models/model_implementations/vista_models.py +++ b/mteb/models/model_implementations/vista_models.py @@ -247,6 +247,7 @@ def encode( image_tokens_num=196, ), name="BAAI/bge-visualized-base", + model_type=["dense"], languages=["eng-Latn"], revision="98db10b10d22620010d06f11733346e1c98c34aa", release_date="2024-06-06", @@ -274,6 +275,7 @@ def encode( image_tokens_num=256, ), name="BAAI/bge-visualized-m3", + model_type=["dense"], languages=["eng-Latn"], revision="98db10b10d22620010d06f11733346e1c98c34aa", release_date="2024-06-06", diff --git a/mteb/models/model_implementations/vlm2vec_models.py b/mteb/models/model_implementations/vlm2vec_models.py index 55273fa2ce..8b7bd0929d 100644 --- a/mteb/models/model_implementations/vlm2vec_models.py +++ 
b/mteb/models/model_implementations/vlm2vec_models.py @@ -269,6 +269,7 @@ def encode( vlm2vec_lora = ModelMeta( loader=VLM2VecWrapper, name="TIGER-Lab/VLM2Vec-LoRA", + model_type=["dense"], languages=["eng-Latn"], revision="7403b6327958071c1e33c822c7453adadccc7298", release_date="2024-10-08", @@ -292,6 +293,7 @@ def encode( vlm2vec_full = ModelMeta( loader=VLM2VecWrapper, name="TIGER-Lab/VLM2Vec-Full", + model_type=["dense"], languages=["eng-Latn"], revision="e9afa98002097ac2471827ba23ea1f2ddd229480", release_date="2024-10-08", diff --git a/mteb/models/model_implementations/voyage_models.py b/mteb/models/model_implementations/voyage_models.py index 6f74713477..45e6498d43 100644 --- a/mteb/models/model_implementations/voyage_models.py +++ b/mteb/models/model_implementations/voyage_models.py @@ -208,6 +208,7 @@ def _batched_encode( voyage_3_large = ModelMeta( name="voyageai/voyage-3-large", # Date of publication of this post https://blog.voyageai.com/2025/01/07/voyage-3-large/ + model_type=["dense"], revision="1", release_date="2025-01-07", languages=None, # supported languages not specified @@ -234,6 +235,7 @@ def _batched_encode( voyage_3_5 = ModelMeta( name="voyageai/voyage-3.5", + model_type=["dense"], revision="1", release_date="2025-01-21", languages=None, # supported languages not specified @@ -259,6 +261,7 @@ def _batched_encode( voyage_3_5_int8 = ModelMeta( name="voyageai/voyage-3.5 (output_dtype=int8)", + model_type=["dense"], revision="1", release_date="2025-01-21", languages=None, # supported languages not specified @@ -285,6 +288,7 @@ def _batched_encode( voyage_3_5_binary = ModelMeta( name="voyageai/voyage-3.5 (output_dtype=binary)", + model_type=["dense"], revision="1", release_date="2025-01-21", languages=None, # supported languages not specified @@ -311,6 +315,7 @@ def _batched_encode( voyage_large_2_instruct = ModelMeta( name="voyageai/voyage-large-2-instruct", + model_type=["dense"], revision="1", release_date="2024-05-05", languages=None, # 
supported languages not specified @@ -336,6 +341,7 @@ def _batched_encode( voyage_finance_2 = ModelMeta( name="voyageai/voyage-finance-2", + model_type=["dense"], revision="1", release_date="2024-05-30", languages=None, # supported languages not specified @@ -361,6 +367,7 @@ def _batched_encode( voyage_law_2 = ModelMeta( name="voyageai/voyage-law-2", + model_type=["dense"], revision="1", release_date="2024-04-15", languages=None, # supported languages not specified @@ -386,6 +393,7 @@ def _batched_encode( voyage_code_2 = ModelMeta( name="voyageai/voyage-code-2", + model_type=["dense"], revision="1", release_date="2024-01-23", languages=None, # supported languages not specified @@ -411,6 +419,7 @@ def _batched_encode( voyage_code_3 = ModelMeta( name="voyageai/voyage-code-3", + model_type=["dense"], revision="1", release_date="2024-12-04", languages=None, # supported languages not specified @@ -437,6 +446,7 @@ def _batched_encode( voyage_large_2 = ModelMeta( name="voyageai/voyage-large-2", # Date of publication of this post https://blog.voyageai.com/2023/10/29/voyage-embeddings/ + model_type=["dense"], revision="1", release_date="2023-10-29", languages=None, # supported languages not specified @@ -462,6 +472,7 @@ def _batched_encode( voyage_2 = ModelMeta( name="voyageai/voyage-2", + model_type=["dense"], revision="1", release_date="2023-10-29", languages=None, # supported languages not specified @@ -486,6 +497,7 @@ def _batched_encode( ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", + model_type=["dense"], revision="1", release_date="2024-06-10", languages=None, # supported languages not specified @@ -511,6 +523,7 @@ def _batched_encode( voyage_3 = ModelMeta( name="voyageai/voyage-3", + model_type=["dense"], revision="1", release_date="2024-09-18", languages=None, # supported languages not specified @@ -536,6 +549,7 @@ def _batched_encode( voyage_3_lite = ModelMeta( name="voyageai/voyage-3-lite", + model_type=["dense"], revision="1", 
release_date="2024-09-18", languages=None, # supported languages not specified @@ -561,6 +575,7 @@ def _batched_encode( voyage_3_exp = ModelMeta( name="voyageai/voyage-3-m-exp", + model_type=["dense"], revision="1", release_date="2025-01-08", languages=["eng-Latn"], diff --git a/mteb/models/model_implementations/voyage_v.py b/mteb/models/model_implementations/voyage_v.py index 6386bc2d06..5f7e2f1f4b 100644 --- a/mteb/models/model_implementations/voyage_v.py +++ b/mteb/models/model_implementations/voyage_v.py @@ -204,6 +204,7 @@ def encode( voyage_v = ModelMeta( loader=voyage_v_loader, # type: ignore name="voyageai/voyage-multimodal-3", + model_type=["dense"], languages=[], # Unknown revision="1", release_date="2024-11-10", diff --git a/mteb/models/model_implementations/xyz_models.py b/mteb/models/model_implementations/xyz_models.py index 2999d3a308..4d9c7a7823 100644 --- a/mteb/models/model_implementations/xyz_models.py +++ b/mteb/models/model_implementations/xyz_models.py @@ -24,6 +24,7 @@ xyz_embedding = ModelMeta( name="fangxq/XYZ-embedding", + model_type=["dense"], languages=["zho-Hans"], loader=sentence_transformers_loader, open_weights=True, diff --git a/mteb/models/model_implementations/youtu_models.py b/mteb/models/model_implementations/youtu_models.py index fbce101089..309e5ba72a 100644 --- a/mteb/models/model_implementations/youtu_models.py +++ b/mteb/models/model_implementations/youtu_models.py @@ -115,6 +115,7 @@ def instruction_template( max_seq_length=8192, ), name="tencent/Youtu-Embedding", + model_type=["dense"], languages=["zho-Hans"], revision="32e04afc24817c187a8422e7bdbb493b19796d47", release_date="2025-09-28", diff --git a/mteb/models/model_implementations/yuan_models.py b/mteb/models/model_implementations/yuan_models.py index 0e18d038a5..27ed6abb80 100644 --- a/mteb/models/model_implementations/yuan_models.py +++ b/mteb/models/model_implementations/yuan_models.py @@ -13,6 +13,7 @@ yuan_embedding_2_zh = ModelMeta( 
name="IEITYuan/Yuan-embedding-2.0-zh", + model_type=["dense"], loader=sentence_transformers_loader, languages=["zho-Hans"], open_weights=True, diff --git a/mteb/models/model_implementations/yuan_models_en.py b/mteb/models/model_implementations/yuan_models_en.py index a099c2159e..6ac935797d 100644 --- a/mteb/models/model_implementations/yuan_models_en.py +++ b/mteb/models/model_implementations/yuan_models_en.py @@ -37,6 +37,7 @@ def instruction_template( apply_instruction_to_passages=False, ), name="IEITYuan/Yuan-embedding-2.0-en", + model_type=["dense"], languages=["eng-Latn"], open_weights=True, revision="b2fd15da3bcae3473c8529593825c15068f09fce", diff --git a/mteb/models/model_meta.py b/mteb/models/model_meta.py index a4a657a843..66ee2ebb21 100644 --- a/mteb/models/model_meta.py +++ b/mteb/models/model_meta.py @@ -26,7 +26,7 @@ RepositoryNotFoundError, SafetensorsParsingError, ) -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict, field_validator, model_validator from transformers import AutoConfig from typing_extensions import Self @@ -57,6 +57,8 @@ "ColPali", ] +MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction"] + class ScoringFunction(HelpfulStrEnum): """The scoring function used by the models.""" @@ -114,7 +116,7 @@ class ModelMeta(BaseModel): a benchmark as well as mark dataset contaminations. adapted_from: Name of the model from which this model is adapted. For quantizations, fine-tunes, long doc extensions, etc. superseded_by: Name of the model that supersedes this model, e.g., nvidia/NV-Embed-v2 supersedes v1. - is_cross_encoder: Whether the model can act as a cross-encoder or not. + model_type: A list of strings representing the type of model. modalities: A list of strings representing the modalities the model supports. Default is ["text"]. contacts: The people to contact in case of a problem in the model, preferably a GitHub handle. 
""" @@ -144,10 +146,49 @@ class ModelMeta(BaseModel): adapted_from: str | None = None superseded_by: str | None = None modalities: list[Modalities] = ["text"] - is_cross_encoder: bool | None = None + model_type: list[MODEL_TYPES] = ["dense"] citation: str | None = None contacts: list[str] | None = None + @model_validator(mode="before") + @classmethod + def handle_legacy_is_cross_encoder(cls, data: Any) -> Any: + """Handle legacy is_cross_encoder field by converting it to model_type. + + This validator handles backward compatibility for the deprecated is_cross_encoder field. + If is_cross_encoder=True is provided, it adds "cross_encoder" to model_type. + """ + if isinstance(data, dict) and "is_cross_encoder" in data: + is_cross_encoder_value = data.pop("is_cross_encoder") + + if is_cross_encoder_value is not None: + warnings.warn( + "is_cross_encoder is deprecated and will be removed in a future version. " + "Use model_type=['cross-encoder'] instead.", + DeprecationWarning, + stacklevel=2, + ) + + model_type = data.get("model_type", ["dense"]) + + if is_cross_encoder_value: + if "cross-encoder" not in model_type: + data["model_type"] = ["cross-encoder"] + else: + if "cross-encoder" in model_type: + model_type = [t for t in model_type if t != "cross-encoder"] + data["model_type"] = model_type if model_type else ["dense"] + + return data + + @property + def is_cross_encoder(self) -> bool: + """Returns True if the model is a cross-encoder. + + Derived from model_type field. A model is considered a cross-encoder if "cross-encoder" is in its model_type list. 
+ """ + return "cross-encoder" in self.model_type + @field_validator("similarity_fn_name", mode="before") @classmethod def _validate_similarity_fn_name(cls, value: str) -> ScoringFunction | None: @@ -183,6 +224,7 @@ def to_dict(self): else dict_repr["training_datasets"] ) dict_repr["loader"] = _get_loader_name(loader) + dict_repr["is_cross_encoder"] = self.is_cross_encoder return dict_repr @field_validator("languages") @@ -425,6 +467,7 @@ def from_cross_encoder( meta.loader = CrossEncoderWrapper meta.embed_dim = None meta.modalities = ["text"] + meta.model_type = ["cross-encoder"] return meta def is_zero_shot_on(self, tasks: Sequence[AbsTask] | Sequence[str]) -> bool | None: diff --git a/tests/test_models/test_model_meta.py b/tests/test_models/test_model_meta.py index 03c589aed5..336097f368 100644 --- a/tests/test_models/test_model_meta.py +++ b/tests/test_models/test_model_meta.py @@ -153,7 +153,7 @@ def test_model_to_python(): adapted_from=None, superseded_by=None, modalities=['text'], - is_cross_encoder=None, + model_type=['dense'], citation=\'@inproceedings{reimers-2019-sentence-bert,\\n title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",\\n author = "Reimers, Nils and Gurevych, Iryna",\\n booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",\\n month = "11",\\n year = "2019",\\n publisher = "Association for Computational Linguistics",\\n url = "http://arxiv.org/abs/1908.10084",\\n}\\n\', contacts=None, )"""