diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 5e8fcae0ac..4b2611eca6 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -1708,3 +1708,91 @@ training_datasets=None, # They don't specify superseded_by=None, ) +xiaobu_embedding = ModelMeta( + name="lier007/xiaobu-embedding", + revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", + release_date="2024-01-09", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + memory_usage=None, + max_tokens=512, + embed_dim=1024, + license="not specified", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/lier007/xiaobu-embedding", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Finetuned from GTE, none of them disclose training data + superseded_by=None, + adapted_from="thenlper/gte-large-zh", +) +xiaobu_embedding_v2 = ModelMeta( + name="lier007/xiaobu-embedding-v2", + revision="1912f2e59a5c2ef802a471d735a38702a5c9485e", + release_date="2024-06-30", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + memory_usage=None, + max_tokens=512, + embed_dim=768, + license="not specified", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/lier007/xiaobu-embedding-v2", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Finetuned from piccolo-embedding, none of them say + superseded_by=None, + adapted_from="sensenova/piccolo-base-zh", +) +yinka_embedding = ModelMeta( + name="Classical/Yinka", + revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", + release_date="2024-01-09", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + memory_usage=None, + max_tokens=512, + embed_dim=1024, + license="not specified", + open_weights=True, + 
public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Classical/Yinka", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Not disclosed + superseded_by=None, + adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", +) +conan_embedding = ModelMeta( + name="TencentBAC/Conan-embedding-v1", + revision="bb9749a57d4f02fd71722386f8d0f5a9398d7eeb", + release_date="2024-08-22", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + memory_usage=None, + max_tokens=512, + embed_dim=768, + license="cc-by-nc-4.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/TencentBAC/Conan-embedding-v1", + similarity_fn_name="cosine", + use_instructions=None, + # source: https://arxiv.org/pdf/2408.15710 + training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage + superseded_by=None, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 634530089f..ea0fa1524c 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -36,6 +36,7 @@ nomic_models, nvidia_models, openai_models, + piccolo_models, promptriever_models, repllama_models, rerankers_custom, @@ -44,6 +45,7 @@ salesforce_models, sentence_transformers_models, stella_models, + text2vec_models, uae_models, voyage_models, ) @@ -69,11 +71,13 @@ llm2vec_models, mxbai_models, model2vec_models, + moka_models, misc_models, nomic_models, no_instruct_sentence_models, nvidia_models, openai_models, + piccolo_models, promptriever_models, repllama_models, rerankers_custom, @@ -88,6 +92,7 @@ jina_models, jasper_models, uae_models, + text2vec_models, stella_models, uae_models, voyage_models, diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py new file mode 100644 index 0000000000..e26108e0ae --- /dev/null +++ 
b/mteb/models/text2vec_models.py @@ -0,0 +1,103 @@ +"""Implementation of Text2Vec models""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +# I couldn't find the large model on HF for some reason +text2vec_base_chinese = ModelMeta( + name="shibing624/text2vec-base-chinese", + languages=["zho-Hans"], + open_weights=True, + revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e", + release_date="2022-01-23", + n_parameters=102 * 1e6, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/shibing624/text2vec-base-chinese", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Couldn't find it + public_training_data=True, + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-chinese + # Not in MTEB + # - shibing624/nli-zh-all/text2vec-base-chinese-sentence-dataset + # (Could have overlaps I'm not aware of) + }, +) + +text2vec_base_chinese_paraphrase = ModelMeta( + name="shibing624/text2vec-base-chinese-paraphrase", + languages=["zho-Hans"], + open_weights=True, + revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd", + release_date="2023-06-19", + n_parameters=118 * 1e6, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Couldn't find it + public_training_data=True, + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase + # Not in MTEB + # - shibing624/nli-zh-all/text2vec-base-chinese-paraphrase + # (Could have overlaps I'm not aware of) + }, +) + + +text2vec_multi_langs = [ + "deu-Latn", # German (de) + 
"eng-Latn", # English (en) + "spa-Latn", # Spanish (es) + "fra-Latn", # French (fr) + "ita-Latn", # Italian (it) + "nld-Latn", # Dutch (nl) + "pol-Latn", # Polish (pl) + "por-Latn", # Portuguese (pt) + "rus-Cyrl", # Russian (ru) + "zho-Hans", # Chinese (Simplified, zh) +] +text2vec_base_multilingual = ModelMeta( + name="shibing624/text2vec-base-multilingual", + languages=text2vec_multi_langs, + open_weights=True, + revision="6633dc49e554de7105458f8f2e96445c6598e9d1", + release_date="2023-06-22", + # While it can be loaded with SBERT, it has one suspicious file according to huggingface + # So probably best not to. + loader=None, + n_parameters=118 * 1e6, + memory_usage=None, + embed_dim=384, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/shibing624/text2vec-base-multilingual", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + public_training_code=False, # Couldn't find it + public_training_data=True, + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-multilingual + # Not in MTEB + # - shibing624/nli-zh-all/tree/main/text2vec-base-multilingual-dataset + # (Could have overlaps I'm not aware of) + }, +)