Skip to content
Merged
147 changes: 138 additions & 9 deletions mteb/models/misc_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,109 @@
adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
superseded_by=None,
)

# Training data for the KaLM embedding models, as listed in the technical report.
#
# Sources named in the report that are not in MTEB (kept for reference only):
#   ExpertQA, MEDI2BGE, OpenOrca, PAQ, PubMedQA, SearchQA, arxiv_qa,
#   rag-dataset-12000, CC-News, SQuAD 2.0, TriviaQA, WebGPT Comparisons,
#   MultiNLI, NLLB, WikiAnswers, SimCSE NLI, SNLI, Aya Dataset, eli5
#
# Every MTEB task named below is mapped to its own ["train"] split list.
kalm_training_data = {
    task_name: ["train"]
    for task_name in (
        # code feedback
        "CodeFeedbackMT",
        "CodeFeedbackST",
        # arxiv clustering
        "ArxivClusteringP2P",
        "ArxivClusteringS2S",
        "ArxivClusteringP2P.v2",
        # retrieval / reranking
        "TRECCOVID",
        "DBPedia",
        "ESCIReranking",
        "FEVER",
        "FiQA2018",
        "FEVERHardNegatives",
        "NanoFEVERRetrieval",
        "FEVER-NL",  # translation not trained on
        "FiQA2018-NL",  # translation not trained on
        "HotpotQA-PL",  # translation not trained on
        "HotpotQA-NL",  # translation not trained on
        "HotpotQAHardNegatives",
        "MultiLongDocRetrieval",
        "MSMARCO",
        "MSMARCOHardNegatives",
        "NanoMSMARCORetrieval",
        "MSMARCO-PL",  # translation not trained on
        "mMARCO-NL",  # translation not trained on
        "MSMARCOv2",
        "NFCorpus",
        "SciFact",
        "NQ",
        "NQHardNegatives",
        "NanoNQRetrieval",
        "NQ-PL",  # translation not trained on
        "NQ-NL",  # translation not trained on
        "YahooAnswersTopicsClassification",
        # ContractNLI (LegalBench) classification tasks
        "ContractNLIConfidentialityOfAgreementLegalBenchClassification",
        "ContractNLIExplicitIdentificationLegalBenchClassification",
        "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification",
        "ContractNLILimitedUseLegalBenchClassification",
        "ContractNLINoLicensingLegalBenchClassification",
        "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification",
        "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification",
        "ContractNLIPermissibleCopyLegalBenchClassification",
        "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification",
        "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification",
        "ContractNLIReturnOfConfidentialInformationLegalBenchClassification",
        "ContractNLISharingWithEmployeesLegalBenchClassification",
        "ContractNLISharingWithThirdPartiesLegalBenchClassification",
        "ContractNLISurvivalOfObligationsLegalBenchClassification",
        # quora
        "QuoraRetrieval",
        "NanoQuoraRetrieval",
        # biorxiv / medrxiv clustering
        "BiorxivClusteringP2P.v2",
        "BiorxivClusteringS2S.v2",
        "MedrxivClusteringP2P.v2",
        "MedrxivClusteringS2S.v2",
        # classification
        "Banking77Classification",
        "AmazonPolarityClassification",
        "ImdbClassification",
        "EmotionClassification",
        "TweetSentimentExtractionClassification",
        "ToxicConversationsClassification",
        # multilingual retrieval / reranking
        "MIRACLRetrieval",
        "MIRACLRetrievalHardNegatives",
        "MIRACLReranking",
        "MrTidyRetrieval",
        # multilingual pair classification / classification
        "PawsXPairClassification",
        "AmazonReviewsClassification",
        "AmazonCounterfactualClassification",
        "MultilingualSentiment",
        "MassiveIntentClassification",
        "MassiveScenarioClassification",
        "MTOPDomainClassification",
        "MTOPIntentClassification",
    )
}

HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = ModelMeta(
name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1",
revision="45e42c89990c40aca042659133fc8b13c28634b5",
Expand All @@ -61,7 +164,7 @@
loader=None,
n_parameters=494032768,
memory_usage_mb=1885,
max_tokens=131072.0,
max_tokens=512,
embed_dim=896,
license="mit",
open_weights=True,
Expand All @@ -71,7 +174,7 @@
reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None,
training_datasets=kalm_training_data,
adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_1022_filtered_v2_inst_3node_g8_1e-5_sin-0.1_mrl",
superseded_by=None,
)
Expand All @@ -83,7 +186,7 @@
loader=None,
n_parameters=494032768,
memory_usage_mb=1885,
max_tokens=131072.0,
max_tokens=512,
embed_dim=896,
license="mit",
open_weights=True,
Expand All @@ -93,7 +196,7 @@
reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None,
training_datasets=kalm_training_data,
adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_0902_filtered_v2_3node_g8_1e-5_sin-0.1",
superseded_by=None,
)
Expand Down Expand Up @@ -208,6 +311,15 @@
adapted_from="intfloat/e5-mistral-7b-instruct",
superseded_by=None,
)

# Training data shared by the Lajavaness bilingual-embedding models.
# Each MTEB task below is mapped to its own ["train"] split list.
# Also trained on, but not in mteb: SNLI
bilingual_embedding_training_data = {
    task_name: ["train"]
    for task_name in (
        "STSBenchmark",
        "STSBenchmarkMultilingualSTS",
        "XNLI",
    )
}

Lajavaness__bilingual_embedding_base = ModelMeta(
name="Lajavaness/bilingual-embedding-base",
revision="0bfc54bb2aa2666dd84715289c7ef58a95eb4d8d",
Expand All @@ -231,7 +343,7 @@
reference="https://huggingface.co/Lajavaness/bilingual-embedding-base",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None,
training_datasets=bilingual_embedding_training_data,
adapted_from="dangvantuan/bilingual_impl",
superseded_by=None,
)
Expand All @@ -258,7 +370,7 @@
reference="https://huggingface.co/Lajavaness/bilingual-embedding-large",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None,
training_datasets=bilingual_embedding_training_data,
adapted_from="dangvantuan/bilingual_impl",
superseded_by=None,
)
Expand All @@ -285,7 +397,7 @@
reference="https://huggingface.co/Lajavaness/bilingual-embedding-small",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None,
training_datasets=bilingual_embedding_training_data,
adapted_from="dangvantuan/bilingual_impl",
superseded_by=None,
)
Expand Down Expand Up @@ -1254,8 +1366,25 @@
reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None,
adapted_from=None,
training_datasets={
**bge_training_data,
# not in mteb:
# MEDI
# all MTEB CLF datasets that have a train split:
"AmazonPolarityClassification": ["train"],
"AmazonReviewsClassification": ["train"],
"EmotionClassification": ["train"],
"ImdbClassification": ["train"],
"MTOPDomainClassification": ["train"],
"MTOPIntentClassification": ["train"],
"MassiveIntentClassification": ["train"],
"MassiveScenarioClassification": ["train"],
"ToxicConversationsClassification": ["train"],
"TweetSentimentExtractionClassification": ["train"],
"Banking77Classification": ["train"],
"AmazonCounterfactualClassification": ["train"],
},
adapted_from="BAAI/bge-large-en-v1.5",
superseded_by=None,
)
avsolatorio__GIST_all_MiniLM_L6_v2 = ModelMeta(
Expand Down
2 changes: 0 additions & 2 deletions mteb/models/nvidia_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def instruction_template(
"FEVERHardNegatives": ["train"],
"NanoFEVERRetrieval": ["train"],
"FiQA2018": ["train"],
"FiQA2018-PL": ["train"], # translation not trained on
"FiQA2018-NL": ["train"], # translation not trained on
"STS12": ["train"],
"STS22": ["train"],
Expand All @@ -56,7 +55,6 @@ def instruction_template(
"ArxivClusteringP2P": ["train"],
"ArxivClusteringP2P.v2": ["train"],
"ArxivClusteringS2S": ["train"],
"ArxivClusteringS2S.v2": ["train"],
"BiorxivClusteringP2P": ["train"],
"BiorxivClusteringP2P.v2": ["train"],
"BiorxivClusteringS2S": ["train"],
Expand Down
1 change: 0 additions & 1 deletion mteb/models/salesforce_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def instruction_template(
**E5_MISTRAL_TRAINING_DATA,
# From previously released blogpost which now have been taken down:
"FiQA2018": ["train"],
"FiQA2018-PL": ["train"],
"FiQA2018-NL": ["train"], # translation not trained on
"FEVER": ["train"],
"FEVERHardNegatives": ["train"],
Expand Down
7 changes: 4 additions & 3 deletions mteb/models/stella_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from mteb.model_meta import ModelMeta
from mteb.models.instruct_wrapper import instruct_wrapper
from mteb.models.nvidia_models import nvidia_training_datasets

stella_en_400M = ModelMeta(
# https://huggingface.co/dunzhang/stella_en_400M_v5/discussions/21#671a6205ac1e2416090f2bf4
Expand Down Expand Up @@ -57,7 +58,7 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch", "GritLM"],
reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5",
training_datasets=None,
training_datasets=nvidia_training_datasets, # also distilled from gte-qwen (but training data is unknown) #2164
public_training_code="https://github.com/NovaSearch-Team/RAG-Retrieval/blob/c40f4638b705eb77d88305d2056901ed550f9f4b/rag_retrieval/train/embedding/README.md",
public_training_data=None,
)
Expand Down Expand Up @@ -121,7 +122,7 @@
open_weights=True,
revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe",
release_date="2024-02-27",
n_parameters=326 * 1e6,
n_parameters=int(326 * 1e6),
memory_usage_mb=1242,
embed_dim=1792,
license="mit",
Expand All @@ -143,7 +144,7 @@
open_weights=True,
revision="b1075144f440ab4409c05622c1179130ebd57d03",
release_date="2024-06-04",
n_parameters=326 * 1e6,
n_parameters=int(326 * 1e6),
memory_usage_mb=1242,
embed_dim=1792,
license="mit",
Expand Down
77 changes: 27 additions & 50 deletions mteb/models/voyage_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,85 +416,62 @@ def _batched_encode(
# MTEB(eng, v1) training data:
"AmazonPolarityClassification": ["train"],
"AmazonReviewsClassification": ["train"],
"ArguAna": ["train"],
"ArxivClusteringP2P": ["train"],
"ArxivClusteringS2S": ["train"],
"AskUbuntuDupQuestions": ["train"],
"BIOSSES": ["train"],
"Banking77Classification": ["train"],
"BiorxivClusteringP2P": ["train"],
"BiorxivClusteringS2S": ["train"],
"CQADupstackRetrieval": ["train"],
"ClimateFEVER": ["train"],
"DBPedia": ["train"],
"EmotionClassification": ["train"],
"FEVER": ["train"],
"FiQA2018": ["train"],
"HotpotQA": ["train"],
"ImdbClassification": ["train"],
"MTOPDomainClassification": ["train"],
"MTOPIntentClassification": ["train"],
"MindSmallReranking": ["train"],
"MassiveIntentClassification": ["train"],
"MassiveScenarioClassification": ["train"],
"MedrxivClusteringP2P": ["train"],
"MedrxivClusteringS2S": ["train"],
"MindSmallReranking": ["train"],
"NFCorpus": ["train"],
"NQ": ["train"],
"QuoraRetrieval": ["train"],
"RedditClustering": ["train"],
"RedditClusteringP2P": ["train"],
"SCIDOCS": ["train"],
"SICK-R": ["train"],
"STS12": ["train"],
"STS13": ["train"],
"STS14": ["train"],
"STS15": ["train"],
"STS16": ["train"],
"STSBenchmark": ["train"],
"SciDocsRR": ["train"],
"SciFact": ["train"],
"SprintDuplicateQuestions": ["train"],
"StackExchangeClustering": ["train"],
"StackExchangeClusteringP2P": ["train"],
"StackOverflowDupQuestions": ["train"],
"SummEval": ["train"],
"TRECCOVID": ["train"],
"Touche2020": ["train"],
"ToxicConversationsClassification": ["train"],
"TweetSentimentExtractionClassification": ["train"],
"TwentyNewsgroupsClustering": ["train"],
"TwitterSemEval2015": ["train"],
"TwitterURLCorpus": ["train"],
"BiorxivClusteringP2P": ["train"],
"BiorxivClusteringS2S": ["train"],
"Banking77Classification": ["train"],
"ArguAna": ["train"],
"ArguAna-PL": ["train"],
"ArguAna-NL": ["train"], # translation not trained on
"NanoArguAnaRetrieval": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQA-NL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"MSMARCO-PL": ["train"], # translation not trained on
"mMARCO-NL": ["train"], # translation not trained on
"STS22": ["train"],
"AmazonCounterfactualClassification": ["train"],
"ArxivClusteringP2P": ["train"],
"ArxivClusteringS2S": ["train"],
"NQ": ["train"],
"SciFact": ["train"],
"QuoraRetrieval": ["train"],
"NanoQuoraRetrieval": ["train"],
"NQHardNegatives": ["train"],
"NanoNQRetrieval": ["train"],
"NQ-PL": ["train"], # translation not trained on
"NQ-NL": ["train"], # translation not trained on
"NFCorpus": ["train"],
"FEVERHardNegatives": ["train"],
"NanoFEVERRetrieval": ["train"],
"FEVER-NL": ["train"], # translation not trained on
"FiQA2018-PL": ["train"], # translation not trained on
"FiQA2018-NL": ["train"], # translation not trained on
"STS22": ["train"],
"AmazonCounterfactualClassification": ["train"],
"ArxivClusteringP2P.v2": ["train"],
"ArxivClusteringS2S.v2": ["train"],
"BiorxivClusteringP2P.v2": ["train"],
"BiorxivClusteringS2S.v2": ["train"],
"MedrxivClusteringP2P.v2": ["train"],
"MedrxivClusteringS2S.v2": ["train"],
"TwentyNewsgroupsClustering.v2": ["train"],
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"MSMARCO-PL": ["train"], # translation not trained on
"mMARCO-NL": ["train"], # translation not trained on
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQA-NL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
"FEVER": ["train"],
"FiQA2018": ["train"],
"DBPedia": ["train"],
"TRECCOVID": ["train"],
"ArxivClusteringP2P.v2": ["train"],
"STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on
},
public_training_code=None,
Expand Down