diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index b105f301b6..c88326edc6 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -66,7 +66,8 @@ class ModelMeta(BaseModel):
release_date: The date the model's revision was released.
license: The license under which the model is released. Required if open_weights is True.
open_weights: Whether the model is open source or proprietary.
- public_training_code: Whether the code used to train the model is publicly available.
+ public_training_code: A link to the publicly available training code. If None, it is assumed that the training code is not publicly available.
+ public_training_data: A link to the publicly available training data. If None, it is assumed that the training data is not publicly available.
similarity_fn_name: The distance metric used by the model.
framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`.
reference: A URL to the model's page on huggingface or another source.
@@ -94,6 +95,7 @@ class ModelMeta(BaseModel):
license: str | None
open_weights: bool | None
public_training_code: str | None
+ public_training_data: str | None
framework: list[FRAMEWORKS]
reference: STR_URL | None = None
similarity_fn_name: DISTANCE_METRICS | None
diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
index 66822d41b0..f765b01bff 100644
--- a/mteb/models/arctic_models.py
+++ b/mteb/models/arctic_models.py
@@ -102,7 +102,8 @@
use_instructions=True,
adapted_from="sentence-transformers/all-MiniLM-L6-v2",
superseded_by=None,
- public_training_code=None, # couldn't find
+ public_training_code=None,
+ public_training_data=None,
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -151,7 +152,8 @@
use_instructions=True,
adapted_from="intfloat/e5-small-unsupervised",
superseded_by=None,
- public_training_code=None, # couldn't find
+ public_training_code=None,
+ public_training_data=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -200,7 +202,8 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5",
- public_training_code=None, # couldn't find
+ public_training_code=None,
+ public_training_data=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -249,7 +252,8 @@
use_instructions=True,
adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
- public_training_code=None, # couldn't find
+ public_training_code=None,
+ public_training_data=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -298,7 +302,8 @@
use_instructions=True,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0",
- public_training_code=None, # couldn't find
+ public_training_code=None,
+ public_training_data=None, # couldn't find
training_datasets={
# source: https://arxiv.org/pdf/2405.05374
# splits not specified to assuming everything
@@ -350,6 +355,7 @@
adapted_from=None,
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -375,7 +381,8 @@
use_instructions=True,
adapted_from="Alibaba-NLP/gte-multilingual-base",
superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -423,7 +430,8 @@ use_instructions=True, adapted_from="BAAI/bge-m3-retromae", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index d8270c573b..d9eb64246d 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -372,7 +372,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, ) @@ -397,6 +398,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, ) @@ -421,6 +423,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, ) @@ -444,7 +447,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -468,7 +472,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -492,7 +497,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -516,6 +522,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=bgem3_training_data, ) @@ -549,5 +556,6 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # not disclosed ) diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py index ea56fd432b..6e3d3747d9 100644 --- a/mteb/models/bm25.py +++ b/mteb/models/bm25.py @@ -139,5 +139,6 @@ def encode(self, texts: list[str], **kwargs): framework=[], use_instructions=False, public_training_code="https://github.com/xhluca/bm25s", + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index 8718a2e2a3..60ff63ee81 100644 --- a/mteb/models/cohere_models.py +++ 
b/mteb/models/cohere_models.py @@ -234,7 +234,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -257,7 +258,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -280,7 +282,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -303,6 +306,7 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 87b5fdb93a..89b09de28e 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -153,6 +153,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", public_training_code=None, + public_training_data=None, release_date="2024-09-21", n_parameters=110 * 1e6, max_tokens=180, # Reduced for Benchmarking - see ColBERT paper @@ -205,6 +206,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: open_weights=True, revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1", public_training_code=None, + public_training_data=None, release_date="2024-08-16", n_parameters=559 * 1e6, max_tokens=8192, diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index f4d5909350..c89b64fc72 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -15,6 +15,16 @@ E5_INSTRUCTION = "Instruct: {instruction}\nQuery: " +E5_MISTRAL_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_instruct = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -40,6 +50,7 @@ license="mit", max_tokens=514, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -70,5 +81,6 @@ license="mit", max_tokens=32768, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index ace25ca08d..0ad15e7320 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -126,6 +126,16 @@ "NQ-PL": ["train"], # translation not trained on } +ME5_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -147,7 +157,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, ) e5_mult_base = ModelMeta( @@ -170,7 +181,8 @@ 
framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, ) e5_mult_large = ModelMeta( @@ -194,7 +206,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, ) e5_eng_small_v2 = ModelMeta( @@ -217,6 +230,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -241,6 +255,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -267,6 +282,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -293,6 +309,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -319,6 +336,7 @@ superseded_by="intfloat/e5-large-v2", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -345,5 +363,6 @@ superseded_by="intfloat/e5-base-v2", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py index 08065f7af0..40d316fee7 100644 --- a/mteb/models/google_models.py +++ b/mteb/models/google_models.py @@ -151,7 +151,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -173,7 +174,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -195,6 +197,7 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index a68502b06d..d15c1f4a55 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -11,6 +11,18 @@ logger = logging.getLogger(__name__) +GRIT_LM_TRAINING_DATA = { + **E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # also uses medi2 which contains fever and hotpotqa: + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + + def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: return ( "<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n" @@ -38,9 +50,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, - training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + training_datasets=GRIT_LM_TRAINING_DATA, # section 3.1 "We finetune our final models from 
Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data"
public_training_code="https://github.com/ContextualAI/gritlm",
+ public_training_data=None,
)
gritlm8x7b = ModelMeta(
loader=partial( # type: ignore
@@ -63,7 +76,8 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str:
similarity_fn_name="cosine",
framework=["GritLM", "PyTorch"],
use_instructions=True,
- training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906
+ training_datasets=GRIT_LM_TRAINING_DATA, # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data"
public_training_code="https://github.com/ContextualAI/gritlm",
+ public_training_data=None,
)
diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py
index da265e79c2..4de4b610f2 100644
--- a/mteb/models/gte_models.py
+++ b/mteb/models/gte_models.py
@@ -46,6 +46,7 @@ def instruction_template(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
max_tokens=131072,
)
@@ -77,6 +78,7 @@ def instruction_template(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -107,6 +109,7 @@ def instruction_template(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -130,6 +133,7 @@ def instruction_template(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None, # Not disclosed
)
@@ -153,6 +157,7 @@ def instruction_template(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None, # Not disclosed
)
@@ -176,6 +181,7 @@ def instruction_template(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None, # Not disclosed
)
@@ -291,6 +297,7 @@ def instruction_template(
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
- public_training_code=None, # couldn't find
+ public_training_code=None,
+ public_training_data=None, # couldn't find
training_datasets=gte_multi_training_data,
)
diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py
index 78bad6097f..63679879c2 100644
--- a/mteb/models/ibm_granite_models.py
+++ b/mteb/models/ibm_granite_models.py
@@ -42,6 +42,7 @@
adapted_from=None,
superseded_by=None,
public_training_code=None,
+ public_training_data=None,
use_instructions=False,
training_datasets=None,
)
@@ -67,6 +68,7 @@
adapted_from=None,
superseded_by=None,
public_training_code=None,
+ public_training_data=None,
use_instructions=False,
training_datasets=None,
)
@@ -92,6 +94,7 @@
adapted_from=None,
superseded_by=None,
public_training_code=None,
+ public_training_data=None,
use_instructions=False,
training_datasets=None,
)
@@ -117,6 +120,7 @@
adapted_from=None,
superseded_by=None,
public_training_code=None,
+ public_training_data=None,
use_instructions=False,
training_datasets=None,
)
diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py
index dc31adccd2..0d40ff3ef2 100644
--- a/mteb/models/inf_models.py
+++ b/mteb/models/inf_models.py
@@ -26,5 +26,6 @@
use_instructions=True, adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct", public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 1dc06d5640..dbd1615ad8 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -93,4 +93,5 @@ def encode( training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 265d512371..4f1b58a352 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -224,6 +224,7 @@ def encode( reference="https://huggingface.co/jinaai/jina-embeddings-v3", training_datasets=None, public_training_code=None, + public_training_data=None, ) @@ -245,6 +246,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embeddings_v2_small_en = ModelMeta( @@ -265,6 +267,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embedding_b_en_v1 = ModelMeta( @@ -285,6 +288,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embedding_s_en_v1 = ModelMeta( @@ -305,4 +309,5 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py index 11cfa74ed1..ead10ebf71 100644 --- a/mteb/models/linq_models.py +++ b/mteb/models/linq_models.py @@ -40,5 +40,6 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index a5f1a69a36..28197e5c84 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -126,6 +126,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_llama3_8b_unsupervised = ModelMeta( @@ -151,6 +152,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) @@ -177,6 +179,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -202,6 +205,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -227,6 +231,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, 
public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -252,6 +257,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) llm2vec_sheared_llama_supervised = ModelMeta( @@ -277,6 +283,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -302,4 +309,5 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 5233ecec6b..ba6e3e8163 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -22,6 +22,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Haon-Chen/speed-embedding-7b-instruct", similarity_fn_name="cosine", @@ -42,6 +43,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", similarity_fn_name="cosine", @@ -62,6 +64,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", similarity_fn_name="cosine", @@ -82,6 +85,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", similarity_fn_name="cosine", @@ -102,6 +106,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Hum-Works/lodestone-base-4096-v1", similarity_fn_name="cosine", @@ -164,6 +169,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Jaume/gemma-2b-embeddings", similarity_fn_name="cosine", @@ -184,6 +190,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", @@ -210,6 +217,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", similarity_fn_name="cosine", @@ -235,6 +243,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", 
similarity_fn_name="cosine", @@ -260,6 +269,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", similarity_fn_name="cosine", @@ -280,6 +290,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", @@ -301,6 +312,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", @@ -322,6 +334,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", @@ -343,6 +356,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", @@ -364,6 +378,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", @@ -385,6 +400,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro", similarity_fn_name="cosine", @@ -405,6 +421,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro-v4", similarity_fn_name="cosine", @@ -425,6 +442,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrdalieTech/Solon-embeddings-large-0.1", similarity_fn_name="cosine", @@ -445,6 +463,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -465,6 +484,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", @@ -487,6 +507,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -509,6 +530,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", @@ -531,6 +553,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence 
Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", @@ -553,6 +576,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -573,6 +597,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-lunaris-text-embeddings", similarity_fn_name="cosine", @@ -593,6 +618,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-stellaris-text-embeddings", similarity_fn_name="cosine", @@ -613,6 +639,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/bge-m3-custom-fr", similarity_fn_name="cosine", @@ -633,6 +660,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.2", similarity_fn_name="cosine", @@ -653,6 +681,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.3", similarity_fn_name="cosine", @@ -673,6 +702,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", @@ -694,6 +724,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-base", similarity_fn_name="cosine", @@ -714,6 +745,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-large", similarity_fn_name="cosine", @@ -734,6 +766,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-small", similarity_fn_name="cosine", @@ -754,6 +787,7 @@ license="gpl-3.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1", similarity_fn_name="cosine", @@ -774,6 +808,7 @@ license="lgpl", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1", similarity_fn_name="cosine", @@ -794,6 +829,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-base", similarity_fn_name="cosine", @@ -814,6 +850,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/dwzhu/e5-base-4k", similarity_fn_name="cosine", @@ -834,6 +871,7 @@ license="apache-2.0", 
open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-large", similarity_fn_name="cosine", @@ -854,6 +892,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-small", similarity_fn_name="cosine", @@ -874,6 +913,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-base", similarity_fn_name="cosine", @@ -894,6 +934,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-large", similarity_fn_name="cosine", @@ -960,6 +1001,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-1b1", similarity_fn_name="cosine", @@ -1026,6 +1068,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-3b", similarity_fn_name="cosine", @@ -1092,6 +1135,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-560m", similarity_fn_name="cosine", @@ -1158,6 +1202,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-7b1", similarity_fn_name="cosine", @@ -1178,6 +1223,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0", similarity_fn_name="cosine", @@ -1198,6 +1244,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-all-MiniLM-L6-v2", similarity_fn_name="cosine", @@ -1218,6 +1265,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-large-Embedding-v0", similarity_fn_name="cosine", @@ -1238,6 +1286,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-small-Embedding-v0", similarity_fn_name="cosine", @@ -1258,6 +1307,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/bigscience/sgpt-bloom-7b1-msmarco", similarity_fn_name="cosine", @@ -1278,6 +1328,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", @@ -1299,6 +1350,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/abhinand/MedEmbed-small-v0.1", 
similarity_fn_name="cosine", @@ -1325,6 +1377,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0", similarity_fn_name="cosine", @@ -1345,6 +1398,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/brahmairesearch/slx-v0.1", similarity_fn_name="cosine", @@ -1365,6 +1419,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/deepfile/embedder-100p", similarity_fn_name="cosine", @@ -1385,6 +1440,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", @@ -1416,6 +1472,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/infgrad/stella-base-en-v2", similarity_fn_name="cosine", @@ -1436,6 +1493,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/malenia1/ternary-weight-embedding", similarity_fn_name="cosine", @@ -1456,6 +1514,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/omarelshehy/arabic-english-sts-matryoshka", similarity_fn_name="cosine", @@ -1486,6 +1545,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/openbmb/MiniCPM-Embedding", similarity_fn_name="cosine", @@ -1516,6 +1576,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", @@ -1537,6 +1598,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/silma-ai/silma-embeddding-matryoshka-v0.1", similarity_fn_name="cosine", @@ -1557,6 +1619,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", similarity_fn_name="cosine", @@ -1577,6 +1640,7 @@ license="apache-2", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1", similarity_fn_name="cosine", @@ -1601,6 +1665,7 @@ license="apache-2", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/", similarity_fn_name="cosine", @@ -1620,6 +1685,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding", 
similarity_fn_name="cosine", @@ -1640,6 +1706,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding-v2", similarity_fn_name="cosine", @@ -1660,6 +1727,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", similarity_fn_name="cosine", @@ -1680,6 +1748,7 @@ license="cc-by-nc-4.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", similarity_fn_name="cosine", diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index afbf9df627..33da211c7a 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -75,7 +75,8 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code="https://github.com/MinishLab/model2vec", # + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) @@ -101,6 +102,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_base_output = ModelMeta( @@ -125,6 +127,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_multilingual_output = ModelMeta( @@ -149,6 +152,7 @@ def encode( superseded_by=None, training_datasets=None, public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_2m = ModelMeta( @@ -173,6 +177,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_4m = ModelMeta( @@ -197,6 +202,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_8m = ModelMeta( @@ -221,4 +227,5 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py index d3943d78d7..1504b40789 100644 --- a/mteb/models/moka_models.py +++ b/mteb/models/moka_models.py @@ -96,7 +96,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) @@ -117,7 +118,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) @@ -139,6 +141,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index 04978a190d..921db17871 
100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -27,5 +27,6 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/no_instruct_sentence_models.py b/mteb/models/no_instruct_sentence_models.py index a0596b9bd1..9ff5cf901f 100644 --- a/mteb/models/no_instruct_sentence_models.py +++ b/mteb/models/no_instruct_sentence_models.py @@ -100,5 +100,6 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index 5d9da7b596..772d92902d 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -127,6 +127,7 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -154,6 +155,7 @@ def encode( # type: ignore adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -181,6 +183,7 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -209,6 +212,7 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -238,5 +242,6 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 1f345a62be..1997a85274 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -141,6 +141,7 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, + public_training_data=None, ) NV_embed_v1 = ModelMeta( @@ -164,4 +165,5 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 863c9d7828..079e7c9361 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -135,7 +135,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: similarity_fn_name="cosine", framework=["API"], use_instructions=False, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) text_embedding_3_large = ModelMeta( @@ -156,7 +157,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, license=None, similarity_fn_name=None, @@ -179,7 +181,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, license=None, similarity_fn_name=None, diff --git a/mteb/models/overview.py b/mteb/models/overview.py index ad93efb314..e444b1105f 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -226,6 +226,7 @@ def model_meta_from_hf_hub(model_name: str) -> 
ModelMeta: embed_dim=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, ) except Exception as e: @@ -241,6 +242,7 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=None, open_weights=True, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -273,6 +275,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe license=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, training_datasets=None, ) @@ -291,6 +294,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe license=None, open_weights=True, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py index bb92b55673..d51487b8ba 100644 --- a/mteb/models/piccolo_models.py +++ b/mteb/models/piccolo_models.py @@ -21,6 +21,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=None, # They don't specify ) @@ -42,5 +43,6 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=None, # They don't say ) diff --git a/mteb/models/promptriever_models.py b/mteb/models/promptriever_models.py index a7066817a4..287fd3ef91 100644 --- a/mteb/models/promptriever_models.py +++ b/mteb/models/promptriever_models.py @@ -65,6 +65,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) promptriever_llama3 = ModelMeta( @@ -90,6 +91,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) @@ -116,6 +118,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) promptriever_mistral_v1 = ModelMeta( @@ -141,4 +144,5 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index 5ae4c0d8cb..2c5ef6e446 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -150,6 +150,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) @@ -176,5 +177,6 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index 5609fdf83a..1a0fd1f6ba 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -209,6 +209,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -233,6 +234,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, 
use_instructions=None, training_datasets=None, @@ -289,6 +291,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index 5bc50bad70..c53b364000 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -301,6 +301,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -324,6 +325,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -347,6 +349,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -370,6 +373,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -405,6 +409,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -439,6 +444,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -473,6 +479,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -507,6 +514,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -530,6 +538,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -553,6 +562,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -576,6 +586,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -600,6 +611,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -728,6 +740,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -750,6 +763,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, 
public_training_code=None,
+ public_training_data=None,
similarity_fn_name=None,
use_instructions=None,
training_datasets=None,
diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py
index d8c7e84518..297c7f3142 100644
--- a/mteb/models/ru_sentence_models.py
+++ b/mteb/models/ru_sentence_models.py
@@ -23,6 +23,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -41,6 +42,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -59,6 +61,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -77,6 +80,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -124,6 +128,7 @@
# "CarlBrendt/Summ_Dialog_News": ["train"],
},
public_training_code=None,
+ public_training_data=None,
)
deberta_v1_ru = ModelMeta(
@@ -141,6 +146,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -159,6 +165,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -177,6 +184,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -195,6 +203,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -213,6 +222,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
)
@@ -231,6 +241,7 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
+ public_training_data=None,
training_datasets=None, # source model in unknown
# Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]},
)
@@ -252,6 +263,7 @@
training_datasets=None, # source model in unknown
# not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]},
public_training_code=None,
+ public_training_data=None,
)
@@ -279,6 +291,7 @@
license="mit",
similarity_fn_name="cosine",
public_training_code=None,
+ public_training_data=None,
training_datasets=None,
framework=["Sentence Transformers", "PyTorch"],
)
diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py
index 4d4a60b621..235057a6f8 100644
--- a/mteb/models/salesforce_models.py
+++ b/mteb/models/salesforce_models.py
@@ -6,6 +6,8 @@
from mteb.model_meta import ModelMeta
from mteb.models.instruct_wrapper import instruct_wrapper
+from .e5_instruct import E5_MISTRAL_TRAINING_DATA
+
def instruction_template(
instruction: str, prompt_type: PromptType | None = None
@@ -13,6 +15,19 @@
return f"Instruct: {instruction}\nQuery: " if instruction else ""
+SFR_TRAINING_DATA = { # inherits from e5
+ **E5_MISTRAL_TRAINING_DATA,
+ # From a previously released blog post which has since been taken down:
+ "FiQA2018": ["train"],
+ "FiQA2018-PL": ["train"],
+ "FEVER": ["train"],
+ "FEVERHardNegatives": ["train"],
+ "FEVER-PL": ["train"], #
translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + SFR_Embedding_2_R = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -41,16 +56,8 @@ def instruction_template( use_instructions=True, adapted_from="intfloat/e5-mistral-7b-instruct", public_training_code=None, - training_datasets={ # inherits from e5 - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - }, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, ) @@ -79,14 +86,6 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets={ # inherits from e5 - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - }, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, ) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index fa48ae7ccc..eec65049d5 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -112,6 +112,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, public_training_code=None, + public_training_data=None, ) all_MiniLM_L12_v2 = ModelMeta( @@ -132,6 +133,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, public_training_code=None, + public_training_data=None, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -152,6 +154,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) public_training_code=None, + public_training_data=None, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -183,6 +186,7 @@ # "yahoo-answers": yahoo_answers_train_dataset, # "stack-exchange": stack_exchange_train_dataset, public_training_code=None, + public_training_data=None, ) labse = ModelMeta( @@ -203,6 +207,7 @@ adapted_from=None, training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf public_training_code="https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true", + public_training_data=None, ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -223,6 +228,7 @@ adapted_from="nreimers/MiniLM-L6-H384-uncased", training_datasets=sent_trf_training_dataset, # assumed public_training_code=None, + public_training_data=None, ) all_mpnet_base_v2 = ModelMeta( @@ -243,6 +249,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, public_training_code=None, + public_training_data=None, ) @@ -274,4 +281,5 @@ # "sentence-transformers/natural-questions": ["train"], }, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 44aa1f8604..7210b287cb 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -30,6 +30,7 @@ reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, public_training_code=None, + public_training_data=None, ) 
 
 stella_en_1_5b = ModelMeta(
@@ -56,6 +57,7 @@
     reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5",
     training_datasets=None,
     public_training_code=None,
+    public_training_data=None,
 )
 
 stella_large_zh_v3_1792d = ModelMeta(
@@ -75,6 +77,7 @@
     superseded_by="dunzhang/stella-mrl-large-zh-v3.5-1792d",
     adapted_from=None,
     public_training_code=None,
+    public_training_data=None,
     training_datasets={
         # Not in MTEB:
         # - infgrad/dialogue_rewrite_llm
@@ -99,6 +102,7 @@
     superseded_by=None,
     adapted_from=None,
     public_training_code=None,
+    public_training_data=None,
     training_datasets={
         # Not in MTEB:
         # - infgrad/dialogue_rewrite_llm
@@ -124,6 +128,7 @@
     superseded_by=None,
     adapted_from="dunzhang/stella-large-zh-v3-1792d",
     public_training_code=None,
+    public_training_data=None,
     training_datasets=None,  # Not specified
 )
 
@@ -144,6 +149,7 @@
     superseded_by=None,
     adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d",
     public_training_code=None,
+    public_training_data=None,
     training_datasets={
         # It's a bit unclear what they have trained on to be honest, because they don't list all
         # And they also have some rather cryptic description of their training procedure, but at
diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py
index 12322e69e9..86a9bcca4f 100644
--- a/mteb/models/text2vec_models.py
+++ b/mteb/models/text2vec_models.py
@@ -21,7 +21,8 @@
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
-    public_training_code=None,  # Couldn't find it
+    public_training_code=None,
+    public_training_data=None,  # Couldn't find it
     training_datasets={
         # source: https://huggingface.co/shibing624/text2vec-base-chinese
         # Not in MTEB
@@ -46,7 +47,8 @@
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
-    public_training_code=None,  # Couldn't find it
+    public_training_code=None,
+    public_training_data=None,  # Couldn't find it
     training_datasets={
         # source: https://huggingface.co/shibing624/text2vec-base-chinese
         # Not in MTEB
@@ -87,7 +89,8 @@
     use_instructions=False,
     superseded_by=None,
     adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
-    public_training_code=None,  # Couldn't find it
+    public_training_code=None,
+    public_training_data=None,  # Couldn't find it
     training_datasets={
         # source: https://huggingface.co/shibing624/text2vec-base-chinese
         # Not in MTEB
diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py
index bd8be48693..8d97703ef6 100644
--- a/mteb/models/uae_models.py
+++ b/mteb/models/uae_models.py
@@ -83,4 +83,5 @@ def encode(
         "SNLI": [],
     },
     public_training_code=None,
+    public_training_data=None,
 )
diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py
index a98bc041bc..a637dee36a 100644
--- a/mteb/models/voyage_models.py
+++ b/mteb/models/voyage_models.py
@@ -12,6 +12,11 @@
 from .wrapper import Wrapper
 
 
+VOYAGE_TRAINING_DATA = {
+    # Self-reported (message from VoyageAI member)
+    # synthetic data
+}
+
 
 def token_limit(max_tpm: int, interval: int = 60):
     limit_interval_start_ts = time.time()
@@ -156,8 +161,9 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
+    public_training_data=None,
 )
 
 voyage_finance_2 = ModelMeta(
@@ -179,8 +185,9 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
+    public_training_data=None,
 )
 
 voyage_law_2 = ModelMeta(
@@ -202,8 +209,9 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
+    public_training_data=None,
 )
 
 voyage_code_2 = ModelMeta(
@@ -225,8 +233,9 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
+    public_training_data=None,
 )
 
 voyage_large_2 = ModelMeta(
@@ -248,8 +257,9 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
+    public_training_data=None,
 )
 
 voyage_2 = ModelMeta(
@@ -271,8 +281,9 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
+    public_training_data=None,
 )
 
 voyage_multilingual_2 = ModelMeta(
     name="voyageai/voyage-multilingual-2",
@@ -293,8 +304,9 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
+    public_training_data=None,
 )
 
 voyage_3 = ModelMeta(
@@ -316,8 +328,9 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
     public_training_code=None,
+    public_training_data=None,
 )
 
 voyage_3_lite = ModelMeta(
@@ -339,6 +352,79 @@ def _batched_encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    training_datasets=None,
+    training_datasets=VOYAGE_TRAINING_DATA,
+    public_training_code=None,
+    public_training_data=None,
+)
+
+
+voyage_3_exp = ModelMeta(
+    name="voyageai/voyage-3-m-exp",
+    revision="1",
+    release_date=None,  # not released
+    languages=None,  # supported languages not specified
+    loader=partial(
+        VoyageWrapper,
+        model_name="voyage-3-m-exp",
+        model_prompts=model_prompts,
+    ),
+    max_tokens=32000,
+    embed_dim=512,
+    open_weights=False,
+    n_parameters=None,
+    license=None,
+    reference="https://huggingface.co/voyageai/voyage-3-m-exp",
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=True,
+    training_datasets={
+        # MTEB(eng, classic) training data:
+        "ArguAna": ["train"],
+        "ArguAna-PL": ["train"],
+        "NanoArguAnaRetrieval": ["train"],
+        "HotpotQA": ["train"],
+        "HotpotQA-PL": ["train"],  # translation not trained on
+        "HotpotQAHardNegatives": ["train"],
+        "MSMARCO": ["train"],
+        "MSMARCOHardNegatives": ["train"],
+        "NanoMSMARCORetrieval": ["train"],
+        "MSMARCO-PL": ["train"],  # translation not trained on
+        "NQ": ["train"],
+        "NQHardNegatives": ["train"],
+        "NanoNQRetrieval": ["train"],
+        "NQ-PL": ["train"],  # translation not trained on
+        "FEVER": ["train"],
+        "FEVERHardNegatives": ["train"],
+        "NanoFEVERRetrieval": ["train"],
+        "FiQA2018": ["train"],
+        "FiQA2018-PL": ["train"],  # translation not trained on
+        "STS12": ["train"],
+        "STS22": ["train"],
+        "AmazonReviewsClassification": ["train"],
+        "AmazonCounterfactualClassification": ["train"],
+        "Banking77Classification": ["train"],
+        "EmotionClassification": ["train"],
+        "ImdbClassification": ["train"],
+        "MTOPIntentClassification": ["train"],
+        "ToxicConversationsClassification": ["train"],
+        "TweetSentimentExtractionClassification": ["train"],
+        "ArxivClusteringP2P": ["train"],
"ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on + }, public_training_code=None, + public_training_data=None, ) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index a192fa1341..4ae87fdbca 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -242,6 +242,7 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=None, open_weights=True, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index fb0cf6cf5a..dc65dae905 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -377,6 +377,7 @@ def test_reranker_same_ndcg1(): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, reference=None, similarity_fn_name=None, use_instructions=None,