From 62eb9442e73fb700ae0a5bdadf6e0b17cd4ce729 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Sun, 19 Jan 2025 10:54:47 +0300
Subject: [PATCH 1/6] apply additions from #1794

---
 mteb/models/bge_models.py         | 111 +++++------------------
 mteb/models/colbert_models.py     |  14 ++-
 mteb/models/ru_sentence_models.py | 141 ++++++++++++++++++++++++------
 3 files changed, 149 insertions(+), 117 deletions(-)

diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
index 05547d6a04..17153dfd3c 100644
--- a/mteb/models/bge_models.py
+++ b/mteb/models/bge_models.py
@@ -7,8 +7,8 @@
 model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
 model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"}
 
-bge_m_training_data = {
-    # source: https://arxiv.org/pdf/2402.03216
+bgem3_training_data = {
+    # source: https://arxiv.org/abs/2402.03216
     "MIRACLRetrieval": ["train"],
     "MIRACLRetrievalHardNegatives": ["train"],
     "MIRACLReranking": ["train"],
@@ -28,6 +28,28 @@
     "HotpotQA": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on
     "HotpotQAHardNegatives": ["train"],
+    "T2Retrieval": ["train"],
+    "DuReader": ["train"],
+    "MMarcoReranking": ["train"],
+    "CodeSearchNet": ["train"],
+    # not in mteb
+    # "s2orc"
+    # Wikipedia
+    # "xP3"
+    # "mC4"
+    # "CC-News"
+    # "MTP"
+    # "NLLB"
+    # "CCMatrix"
+    # TriviaQA
+    # COLIEE
+    # PubMedQA
+    # SQuAD
+    # SimCSE
+    # mMARCO-ZH
+    # LawGPT
+    # NLI-zh2, LeCaRDv2,
+    # NLI, MultiLongDoc (their synthetic)
     # + synthetic data
 }
 
@@ -89,38 +111,6 @@
     # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
 }
 
-bgem3_training_data = {
-    # source https://arxiv.org/abs/2402.03216
-    "T2Retrieval": ["train"],
-    "DuReader": ["train"],
-    "MMarcoReranking": ["train"],
-    "CMedQAv2-reranking": ["train"],
-    "HotpotQA": ["train"],
-    "NQ": ["train"],
-    "MSMARCO": ["train"],
-    "MrTidyRetrieval": ["train"],
-    "MIRACLRetrieval": ["train"],
-    "CodeSearchNet": ["train"],
-    # not in mteb
-    # "s2orc"
-    # Wikipedia
-    # "xP3"
-    # "mC4"
-    # "CC-News"
-    # "MTP"
-    # "NLLB"
-    # "CCMatrix"
-    # TriviaQA
-    # COL-IEE
-    # PubMedQA
-    # SQuAD
-    # SimCSE
-    # mMARCO-ZH
-    # LawGPT
-    # NLI-zh2, LeCaRDv2,
-    # NLI, MultiLongDoc (their syntetic)
-}
-
 # https://huggingface.co/BAAI/bge-m3/discussions/29
 bgem3_languages = [
     "afr_Latn",  # af
@@ -298,59 +288,6 @@
     "zho_Hans",  # zh
 ]
 
-bge_m_training_data = {
-    # source: https://arxiv.org/pdf/2402.03216
-    "MIRACLRetrieval": ["train"],
-    "MIRACLRetrievalHardNegatives": ["train"],
-    "MIRACLReranking": ["train"],
-    "LeCaRDv2": ["train"],
-    "CMedQAv1-reranking": ["train"],
-    "CMedQAv2-reranking": ["train"],
-    "MrTidyRetrieval": ["train"],
-    "T2Reranking": ["train"],
-    "MSMARCO": ["train"],
-    "MSMARCOHardNegatives": ["train"],
-    "NanoMSMARCORetrieval": ["train"],
-    "MSMARCO-PL": ["train"],  # translation not trained on
-    "NQ": ["train"],
-    "NQHardNegatives": ["train"],
-    "NanoNQRetrieval": ["train"],
-    "NQ-PL": ["train"],  # translation not trained on
-    "HotpotQA": ["train"],
-    "HotpotQA-PL": ["train"],  # translation not trained on
-    "HotpotQAHardNegatives": ["train"],
-    # + synthetic data
-}
-
-bge_training_data = {
-    # source: https://data.baai.ac.cn/details/BAAI-MTP
-    "NQ": ["test"],
-    "NQHardNegatives": ["test"],
-    "AmazonReviewsClassification": [
-        "validation",
-        "test",
-    ],  # assumed from: amazon_reviews_multi
-    "MLQARetrieval": [
-        "validation",
-        "test",
-    ],  # assumed from mlqa (question, context)
-    # not in mteb
-    # Dataset Pairs
-    # wudao (title, passage)
-    # cmrc2018 
(query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) -} bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 8753791bff..51cd058f11 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -152,7 +152,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: languages=["eng_Latn"], open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", - public_training_code=True, + public_training_code=None, release_date="2024-09-21", n_parameters=110 * 1e6, max_tokens=180, # Reduced for Benchmarking - see ColBERT paper @@ -164,6 +164,10 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, + public_training_data=True, + training_datasets={ + "MSMARCO": ["train"], # dev? + }, ) @@ -203,7 +207,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: ], open_weights=True, revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1", - public_training_code=False, + public_training_code=None, release_date="2024-08-16", n_parameters=559 * 1e6, max_tokens=8192, @@ -215,4 +219,10 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, + public_training_data=True, + training_datasets={ + "MSMARCO": ["train"], + "DuRetrieval": [], + "MIRACL": ["train"], + }, ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 6bca544b11..1fb4a7ce3f 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,40 +6,51 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader -from .bge_models import bge_training_data +from .bge_models import bgem3_training_data -rubert_tiny2 = ModelMeta( - name="cointegrated/rubert-tiny2", +rubert_tiny = ModelMeta( + name="cointegrated/rubert-tiny", languages=["rus_Cyrl"], open_weights=True, - revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", - release_date="2021-10-28", + revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", + release_date="2021-05-24", n_parameters=29_400_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny2", + reference="https://huggingface.co/cointegrated/rubert-tiny", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code="https://gist.github.com/avidale/7bc6350f26196918bf339c01261f5c60", + training_datasets={ + # [Yandex Translate corpus](https://translate.yandex.ru/corpus), [OPUS-100](https://huggingface.co/datasets/opus100) + "Tatoeba": ["train"], + }, + adapted_from="google-bert/bert-base-multilingual-cased", ) -rubert_tiny = ModelMeta( - name="cointegrated/rubert-tiny", +rubert_tiny2 = ModelMeta( + name="cointegrated/rubert-tiny2", 
languages=["rus_Cyrl"], open_weights=True, - revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", - release_date="2021-05-24", + revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", + release_date="2021-10-28", n_parameters=29_400_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny", + reference="https://huggingface.co/cointegrated/rubert-tiny2", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code="https://colab.research.google.com/drive/1mSWfIQ6PIlteLVZ9DKKpcorycgLIKZLf?usp=sharing", + training_datasets={ + # https://huggingface.co/datasets/cointegrated/ru-paraphrase-NMT-Leipzig + # Wikipedia https://huggingface.co/datasets/Madjogger/JamSpell_dataset + # https://huggingface.co/datasets/imvladikon/leipzig_corpora_collection + }, + adapted_from="cointegrated/rubert-tiny", ) sbert_large_nlu_ru = ModelMeta( @@ -49,7 +60,6 @@ revision="af977d5dfa46a3635e29bf0ef383f2df2a08d47a", release_date="2020-11-20", n_parameters=427_000_000, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=512, # best guess @@ -57,6 +67,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) sbert_large_mt_nlu_ru = ModelMeta( @@ -66,7 +78,6 @@ revision="05300876c2b83f46d3ddd422a7f17e45cf633bb0", release_date="2021-05-18", n_parameters=427_000_000, - memory_usage=None, embed_dim=1024, license="Not specified", max_tokens=512, # best guess @@ -74,6 +85,11 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets={ + # SNLI, MNLI + # https://github.com/brmson/dataset-sts + }, ) user_base_ru = ModelMeta( @@ -89,21 +105,75 @@ revision="436a489a2087d61aa670b3496a9915f84e46c861", release_date="2024-06-10", n_parameters=427_000_000, - memory_usage=None, - embed_dim=1024, - license="Not specified", - max_tokens=512, # best guess - reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru", + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/deepvk/USER-base", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/deepvk/deberta-v1-base", use_instructions=True, + training_datasets={ + "BibleNLPBitextMining": ["train"], + # https://github.com/unicamp-dl/mMARCO + # deepvk/ru-HNP + # deepvk/ru-WANLI + # MedNLI + # RCB + "TERRa": ["train"], + # Tapaco + # Opus100 + # BiblePar + # RudetoxifierDataDetox + # RuParadetox + "MIRACL": ["train"], + # MLDR + # Lenta + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + "MrTidyRetrieval": ["train"], + # "Panorama" + # PravoIsrael + # xlsum + # Fialka-v1 + # RussianKeywords + # Gazeta + # Gsm8k-ru + # DSumRu + # SummDialogNews + }, + public_training_code=None, +) + +user_bge_m3 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="deepvk/USER-bge-m3", + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + ), + name="deepvk/USER-bge-m3", + languages=["rus_Cyrl"], + open_weights=True, + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + release_date="2024-07-05", + n_parameters=359_000_000, + embed_dim=1024, + license="apache-2.0", + max_tokens=8194, + 
reference="https://huggingface.co/deepvk/USER-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/BAAI/bge-m3", + use_instructions=False, training_datasets={ "BibleNLPBitextMining": ["train"], "MLSUMClusteringP2P": ["train"], "MLSUMClusteringP2P.v2": ["train"], "MLSUMClusteringS2S": ["train"], "MLSUMClusteringS2S.v2": ["train"], - **bge_training_data, + **bgem3_training_data, # not MTEB: # "deepvk/ru-HNP": ["train"], # "deepvk/ru-WANLI": ["train"], @@ -120,8 +190,10 @@ # "bragovo/dsum_ru": ["train"], # "CarlBrendt/Summ_Dialog_News": ["train"], }, + public_training_code=None, ) + deberta_v1_ru = ModelMeta( name="deepvk/deberta-v1-base", languages=["rus_Cyrl"], @@ -129,7 +201,6 @@ revision="bdd30b0e19757e6940c92c7aff19e8fc0a60dff4", release_date="2023-02-07", n_parameters=124_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -137,6 +208,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + # Wikipedia, Books, Twitter comments, Pikabu, Proza.ru, Film subtitles, News websites, and Social corpus + public_training_code=None, + training_datasets=None, ) rubert_base_cased = ModelMeta( @@ -146,7 +220,6 @@ revision="4036cab694767a299f2b9e6492909664d9414229", release_date="2020-03-04", n_parameters=1280_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, # best guess @@ -154,6 +227,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) distilrubert_small_cased_conversational = ModelMeta( @@ -163,7 +238,6 @@ revision="e348066b4a7279b97138038299bddc6580a9169a", release_date="2022-06-28", n_parameters=107_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -171,6 +245,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_base_cased_sentence = ModelMeta( @@ -180,7 +256,6 @@ revision="78b5122d6365337dd4114281b0d08cd1edbb3bc8", release_date="2020-03-04", n_parameters=107_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -188,6 +263,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) labse_en_ru = ModelMeta( @@ -197,7 +274,6 @@ revision="cf0714e606d4af551e14ad69a7929cd6b0da7f7e", release_date="2021-06-10", n_parameters=129_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -205,6 +281,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_tiny_turbo = ModelMeta( @@ -214,7 +292,6 @@ revision="8ce0cf757446ce9bb2d5f5a4ac8103c7a1049054", release_date="2024-06-21", n_parameters=129_000_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=512, @@ -222,6 +299,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, training_datasets=None, # source model in unknown # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) @@ -233,7 +311,6 @@ revision="1940b046c6b5e125df11722b899130329d0a46da", release_date="2024-06-27", n_parameters=129_000_000, - 
memory_usage=None, embed_dim=312, license="mit", max_tokens=512, @@ -243,6 +320,7 @@ use_instructions=False, training_datasets=None, # source model in unknown # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + public_training_code=None, ) @@ -264,4 +342,11 @@ revision="89fb1651989adbb1cfcfdedafd7d102951ad0555", release_date="2024-07-29", use_instructions=True, + n_parameters=404_000_000, + max_tokens=514, + embed_dim=1024, + license="mit", + similarity_fn_name="cosine", + public_training_code=None, + training_datasets=None, ) From 4b7cbf08ea9b643592b6445697e6ede90127e7e2 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 19 Jan 2025 12:41:58 +0300 Subject: [PATCH 2/6] add annotations for rumodels --- mteb/models/bge_models.py | 4 ++-- mteb/models/misc_models.py | 38 +++---------------------------- mteb/models/ru_sentence_models.py | 37 +++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 45 deletions(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 17153dfd3c..c7517ca36d 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -7,7 +7,7 @@ model_prompts = {"query": "Represent this sentence for searching relevant passages: "} model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"} -bgem3_training_data = { +bge_m3_training_data = { # source: https://arxiv.org/abs/2402.03216 "MIRACLRetrieval": ["train"], "MIRACLRetrievalHardNegatives": ["train"], @@ -467,7 +467,7 @@ use_instructions=False, public_training_data=True, public_training_code=None, - training_datasets=bgem3_training_data, + training_datasets=bge_m3_training_data, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 09e423240e..e6f524dbd8 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -7,7 +7,7 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.e5_models import E5_TRAINING_DATA -from .bge_models import bge_m_training_data, bge_training_data +from .bge_models import bge_m3_training_data, bge_training_data from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( @@ -671,7 +671,7 @@ reference="https://huggingface.co/manu/bge-m3-custom-fr", similarity_fn_name="cosine", use_instructions=None, - training_datasets=None, + training_datasets=bge_m3_training_data, adapted_from="data/bge-m3-custom", superseded_by=None, ) @@ -1483,39 +1483,7 @@ adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", superseded_by=None, ) -deepvk__USER_bge_m3 = ModelMeta( - name="deepvk/USER-bge-m3", - revision="0cc6cfe48e260fb0474c753087a69369e88709ae", - release_date="2024-07-05", - languages=["rus_Cyrl"], - loader=None, - n_parameters=359026688, - memory_usage=None, - max_tokens=8194.0, - embed_dim=1024, - license="apache-2.0", - open_weights=True, - public_training_data=True, - public_training_code=None, - framework=["PyTorch", "Sentence Transformers"], - reference="https://huggingface.co/deepvk/USER-bge-m3", - similarity_fn_name="cosine", - use_instructions=None, - training_datasets=bge_m_training_data, # derived from. 
- # not in MTEB: - # "deepvk/ru-HNP": ["train"], - # "deepvk/ru-WANLI": ["train"], - # "Shitao/bge-m3-data": ["train"], - # "RussianNLP/russian_super_glue": ["train"], - # "reciTAL/mlsum": ["train"], - # "Milana/russian_keywords": ["train"], - # "IlyaGusev/gazeta": ["train"], - # "d0rj/gsm8k-ru": ["train"], - # "bragovo/dsum_ru": ["train"], - # "CarlBrendt/Summ_Dialog_News": ["train"], - adapted_from="USER-bge-m3", - superseded_by=None, -) + infgrad__stella_base_en_v2 = ModelMeta( name="infgrad/stella-base-en-v2", revision="c9e80ff9892d80b39dc54e30a7873f91ea161034", diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 1fb4a7ce3f..d3d68b9a22 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,7 +6,7 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader -from .bge_models import bgem3_training_data +from .bge_models import bge_m3_training_data rubert_tiny = ModelMeta( name="cointegrated/rubert-tiny", @@ -158,7 +158,7 @@ open_weights=True, revision="0cc6cfe48e260fb0474c753087a69369e88709ae", release_date="2024-07-05", - n_parameters=359_000_000, + n_parameters=359_026_688, embed_dim=1024, license="apache-2.0", max_tokens=8194, @@ -173,7 +173,7 @@ "MLSUMClusteringP2P.v2": ["train"], "MLSUMClusteringS2S": ["train"], "MLSUMClusteringS2S.v2": ["train"], - **bgem3_training_data, + **bge_m3_training_data, # not MTEB: # "deepvk/ru-HNP": ["train"], # "deepvk/ru-WANLI": ["train"], @@ -222,7 +222,7 @@ n_parameters=1280_000_000, embed_dim=768, license="Not specified", - max_tokens=512, # best guess + max_tokens=512, reference="https://huggingface.co/DeepPavlov/rubert-base-cased", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], @@ -264,7 +264,10 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, - training_datasets=None, + training_datasets={ + # "SNLI": [], + "XNLI": ["dev"] + }, ) labse_en_ru = ModelMeta( @@ -281,8 +284,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, + public_training_code=True, # https://colab.research.google.com/drive/1dnPRn0-ugj3vZgSpyCC9sgslM2SuSfHy?usp=sharing training_datasets=None, + adapted_from="sentence-transformers/LaBSE", ) rubert_tiny_turbo = ModelMeta( @@ -302,6 +306,7 @@ public_training_code=None, training_datasets=None, # source model in unknown # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + adapted_from="cointegrated/rubert-tiny2", ) labse_ru_turbo = ModelMeta( @@ -321,6 +326,7 @@ training_datasets=None, # source model in unknown # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, public_training_code=None, + adapted_from="cointegrated/LaBSE-en-ru", ) @@ -347,6 +353,21 @@ embed_dim=1024, license="mit", similarity_fn_name="cosine", - public_training_code=None, - training_datasets=None, + public_training_code=False, + adapted_from="ai-forever/ruRoberta-large", + training_datasets={ + # https://huggingface.co/ai-forever/ruRoberta-large + # https://huggingface.co/datasets/IlyaGusev/yandex_q_full + # https://huggingface.co/datasets/IlyaGusev/pikabu + # https://huggingface.co/datasets/IlyaGusev/ru_stackoverflow + # https://huggingface.co/datasets/IlyaGusev/habr + # https://huggingface.co/datasets/its5Q/habr_qna + # NewsCommentary + # MultiParaCrawl + "XNLI": [], + "XNLIV2": [], + "LanguageClassification": [], # XNLI + "MIRACLReranking": ["train"], + 
"MIRACLRetrieval": ["train"], + }, ) From 91ebda2fc1f5003a688bdd411214ae0cdfeb8448 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 19 Jan 2025 13:48:34 +0300 Subject: [PATCH 3/6] add nomic training data --- mteb/models/nomic_models.py | 90 ++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index aa6989941f..25ad8e4d31 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -90,6 +90,79 @@ def encode( # type: ignore return emb +nomic_training_data = { + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/contrastive_pretrain.yaml + # reddit_title_body + "RedditClustering": [], + "RedditClusteringP2P": [], + "RedditClustering.v2": [], + "RedditClusteringP2P.v2": [], + # amazon_reviews + # amazonqa + "AmazonPolarityClassification": [], + "AmazonReviewsClassification": [], + "AmazonCounterfactualClassification": [], + # paq + # s2orc_citation_titles + # s2orc_title_abstract + # s2orc_abstract_citation + # s2orc_abstract_body + # wikianswers + # wikipedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # gooaq + # codesearch + "CodeSearchNetCCRetrieval": [], + "COIRCodeSearchNetRetrieval": [], + # yahoo_title_answer + # yahoo_qa + # yahoo_title_question + "YahooAnswersTopicsClassification": [], + # agnews + # ccnews + # npr + # eli5 + # cnn + # stackexchange_duplicate_questions + # stackexchange_title_body + # stackexchange_body_body + "StackExchangeClustering.v2": [], + "StackExchangeClusteringP2P.v2": [], + # sentence_compression + # wikihow + # altlex + # quora + "QuoraRetrieval": [], + "NanoQuoraRetrieval": [], + # simplewiki + # squad + "FQuADRetrieval": [], + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/finetune_triplets.yaml + # msmaro + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + # nq_triples + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # nli_triplets + # reddit + # medi_wiki + # medi_stackexchange + # medi_flickr + # medi_supernli + # hotpot + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], +} + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L142-L159 model_prompts = { "Classification": "classification: ", @@ -127,6 +200,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=True, # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml + public_training_data=True, + training_datasets=nomic_training_data, ) nomic_embed_v1 = ModelMeta( @@ -153,6 +229,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", + public_training_code=True, + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml + training_datasets=nomic_training_data, ) nomic_embed_v1_ablated = ModelMeta( @@ 
-179,6 +258,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=True, + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml + training_datasets=nomic_training_data, ) @@ -206,6 +288,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=True, # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml + training_datasets=nomic_training_data, ) nomic_modern_bert_embed = ModelMeta( @@ -232,6 +316,10 @@ def encode( # type: ignore similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - adapted_from=None, + adapted_from="answerdotai/ModernBERT-base", + public_training_code=True, + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_pretrain_modernbert.yaml + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune_modernnomic.yaml superseded_by=None, + training_datasets=nomic_training_data, ) From fd1693291764ed3035b9c87ebc80a599297d832f Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 19 Jan 2025 13:53:13 +0300 Subject: [PATCH 4/6] fix metadata --- mteb/models/ru_sentence_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index d3d68b9a22..d7d395f599 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -22,7 +22,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code="https://gist.github.com/avidale/7bc6350f26196918bf339c01261f5c60", + public_training_code=True, # "https://gist.github.com/avidale/7bc6350f26196918bf339c01261f5c60", training_datasets={ # [Yandex Translate corpus](https://translate.yandex.ru/corpus), [OPUS-100](https://huggingface.co/datasets/opus100) "Tatoeba": ["train"], @@ -44,7 +44,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code="https://colab.research.google.com/drive/1mSWfIQ6PIlteLVZ9DKKpcorycgLIKZLf?usp=sharing", + public_training_code=True, # "https://colab.research.google.com/drive/1mSWfIQ6PIlteLVZ9DKKpcorycgLIKZLf?usp=sharing", training_datasets={ # https://huggingface.co/datasets/cointegrated/ru-paraphrase-NMT-Leipzig # Wikipedia https://huggingface.co/datasets/Madjogger/JamSpell_dataset From 140d8a49dedd76b60203bc82ae784a4069fa5ae9 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 22 Jan 2025 11:29:09 +0300 Subject: [PATCH 5/6] update rest of model meta --- mteb/models/bge_models.py | 2 +- mteb/models/ibm_granite_models.py | 67 +++++++++++++++++++++++++++++-- mteb/models/jina_models.py | 18 ++++++++- mteb/models/rerankers_custom.py | 3 +- mteb/models/ru_sentence_models.py | 3 +- mteb/models/stella_models.py | 2 + 6 files changed, 87 insertions(+), 8 deletions(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index b267a17414..001c711ed3 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -459,7 +459,7 @@ framework=["Sentence Transformers", 
"PyTorch"], use_instructions=False, public_training_code=None, - public_training_data=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", training_datasets=bge_m3_training_data, ) diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index 63679879c2..e7c3b8b022 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -20,6 +20,65 @@ "zho_Hans", ] +granite_training_data = { + # Multilingual MC4 + # Multilingual Webhose + # English Wikipedia + # Multilingual Wikimedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # Miracl Corpus (Title-Body) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (bodies) + "StackOverflowDupQuestions": [], + "AskUbuntuDupQuestions": [], + # Stack Exchange (Title, Answer) pairs + # Stack Exchange (Title, Body) pairs + # Stack Exchange (Title, Body) pairs + # Machine Translations of Stack Exchange Duplicate questions (titles) + # Machine Translations of Stack Exchange (Title+Body, Answer) pairs + "StackExchangeClusteringP2P": [], + "StackExchangeClusteringP2P.v2": [], + "StackExchangeClustering": [], + "StackExchangeClustering.v2": [], + # SearchQA + # S2ORC (Title, Abstract) + # WikiAnswers Duplicate question pairs + # CCNews + # XSum + # SimpleWiki + # Machine Translated Cross Lingual Parallel Corpora + # SPECTER citation triplets + # Machine Translations of SPECTER citation triplets + # Natural Questions (NQ) + "NQ": ["test"], + "NQHardNegatives": ["test"], + # SQuAD2.0 + # HotpotQA + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # Fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # PubMed + # Multilingual Miracl Triples + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + # Multilingual MrTydi Triples + "MrTidyRetrieval": ["train"], + # Sadeeem Question Asnwering + # DBPedia Title-Body Pairs + "DBPedia": ["train"], + # Synthetic: English Query-Wikipedia Passage + # Synthetic: English Fact Verification + # Synthetic: Multilingual Query-Wikipedia Passage + # Synthetic: Multilingual News Summaries + # IBM Internal Triples + # IBM Internal Title-Body Pairs +} granite_107m_multilingual = ModelMeta( loader=partial( # type: ignore @@ -44,7 +103,7 @@ public_training_code=None, public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_278m_multilingual = ModelMeta( @@ -70,7 +129,7 @@ public_training_code=None, public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_30m_english = ModelMeta( @@ -96,7 +155,7 @@ public_training_code=None, public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_125m_english = ModelMeta( @@ -122,5 +181,5 @@ public_training_code=None, public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 4f1b58a352..e855ad3c7a 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -222,9 +222,25 @@ def encode( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, 
reference="https://huggingface.co/jinaai/jina-embeddings-v3", - training_datasets=None, public_training_code=None, public_training_data=None, + training_datasets={ + # CulturaX + "STS12": [], + # "SICK": [], + # "WMT19": [], + # "MADLAD-3B": [], + # NLI + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # oasst1, oasst2 + }, + adapted_from="XLM-RoBERTa", ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index 1a0fd1f6ba..d2e90fad29 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -11,6 +11,7 @@ from mteb.encoder_interface import Encoder from mteb.evaluation.evaluators.RetrievalEvaluator import DenseRetrievalExactSearch from mteb.model_meta import ModelMeta +from mteb.models.bge_models import bge_m3_training_data logger = logging.getLogger(__name__) @@ -291,7 +292,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, - public_training_data=None, + public_training_data=bge_m3_training_data, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index bd67ad7012..a91b6e7286 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -149,6 +149,7 @@ # SummDialogNews }, public_training_code=None, + public_training_data=None, ) user_bge_m3 = ModelMeta( @@ -364,7 +365,6 @@ embed_dim=1024, license="mit", similarity_fn_name="cosine", - public_training_code=False, adapted_from="ai-forever/ruRoberta-large", training_datasets={ # https://huggingface.co/ai-forever/ruRoberta-large @@ -382,5 +382,6 @@ "MIRACLRetrieval": ["train"], }, public_training_data=None, + public_training_code=None, framework=["Sentence Transformers", "PyTorch"], ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 7210b287cb..92d5db7c8a 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -29,6 +29,7 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, + # will be at https://github.com/NLPJCL/RAG-Retrieval public_training_code=None, public_training_data=None, ) @@ -55,6 +56,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + # will be at https://github.com/NLPJCL/RAG-Retrieval training_datasets=None, public_training_code=None, public_training_data=None, From 8107a8cd25ab95ba9f55c70ebd365f8f05518d5c Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:26:08 +0300 Subject: [PATCH 6/6] fix bge reranker --- mteb/models/rerankers_custom.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index d2e90fad29..0e2c8d8f73 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -292,9 +292,9 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, - public_training_data=bge_m3_training_data, + public_training_data=None, similarity_fn_name=None, use_instructions=None, - training_datasets=None, + training_datasets=bge_m3_training_data, 
framework=["Sentence Transformers", "PyTorch"], )