diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index 740405fe97..33f32cd4d1 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -13,6 +13,7 @@
 from mteb.model_meta import ModelMeta
 from mteb.models import (
     align_models,
+    ara_models,
     arctic_models,
     bedrock_models,
     bge_models,
@@ -70,11 +71,11 @@
     stella_models,
     text2vec_models,
     uae_models,
+    vdr_models,
     vista_models,
     vlm2vec_models,
     voyage_models,
     voyage_v,
-    ara_models,
 )
 
 logger = logging.getLogger(__name__)
@@ -141,6 +142,7 @@
     text2vec_models,
     uae_models,
     voyage_models,
+    vdr_models,
     fa_models,
     ara_models,
 ]
diff --git a/mteb/models/vdr_models.py b/mteb/models/vdr_models.py
new file mode 100644
index 0000000000..bc1cd66c83
--- /dev/null
+++ b/mteb/models/vdr_models.py
@@ -0,0 +1,71 @@
+"""Model metadata for llamaindex/vdr-2b-multi-v1."""
+
+from __future__ import annotations
+
+from functools import partial
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta
+from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper
+
+
+def instruction_template(
+    instruction: str, prompt_type: PromptType | None = None
+) -> str:
+    """Return the task instruction unchanged.
+
+    Args:
+        instruction: Task instruction text.
+        prompt_type: Unused; kept so the callable still accepts the
+            two-argument form it is declared with.
+
+    Returns:
+        ``instruction`` exactly as given.
+    """
+    # Returning the literal "{instruction}" would discard the argument and
+    # hand the placeholder text itself back to the caller.
+    # NOTE(review): assumes the wrapper uses this return value directly
+    # rather than calling str.format on it - confirm in instruct_wrapper.
+    return instruction
+
+
+# Language codes reported in the model metadata.
+languages = [
+    "eng_Latn",
+    "ita_Latn",
+    "fra_Latn",
+    "deu_Latn",
+    "spa_Latn",
+]
+
+# Metadata entry; `partial` pre-binds the loader arguments without calling it.
+vdr_2b_multi_v1 = ModelMeta(
+    loader=partial(
+        InstructSentenceTransformerWrapper,
+        model_name="llamaindex/vdr-2b-multi-v1",
+        instruction_template=instruction_template,
+        max_seq_length=32768,
+        apply_instruction_to_passages=True,
+    ),
+    name="llamaindex/vdr-2b-multi-v1",
+    languages=languages,
+    open_weights=True,
+    revision="2c4e54c8db4071cc61fc3c62f4490124e40c37db",
+    # NOTE(review): confirm the year; 2024-01-08 may be a typo for 2025-01-08.
+    release_date="2024-01-08",
+    modalities=["text"],  # TODO: integrate with image
+    n_parameters=2_000_000_000,
+    memory_usage_mb=4213,
+    max_tokens=32768,
+    embed_dim=1536,
+    license="apache-2.0",
+    reference="https://huggingface.co/llamaindex/vdr-2b-multi-v1",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/llamaindex/vdr-multilingual-train",
+    training_datasets={
+        # llamaindex/vdr-multilingual-train
+    },
+)