diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py
index d9a8bd1041..a55230a602 100644
--- a/mteb/models/ru_sentence_models.py
+++ b/mteb/models/ru_sentence_models.py
@@ -77,7 +77,11 @@
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
+    adapted_from="google/bert_uncased_L-12_H-768_A-12",
+    training_datasets={
+        # SNLI
+        # MNLI
+    },
 )
 
 sbert_large_mt_nlu_ru = ModelMeta(
@@ -226,7 +230,15 @@
     # Wikipedia, Books, Twitter comments, Pikabu, Proza.ru, Film subtitles, News websites, and Social corpus
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
+    training_datasets={
+        # 400 GB of filtered and deduplicated texts in total.
+        # A mix of the following data: Wikipedia, Books, Twitter comments, Pikabu, Proza.ru,
+        # Film subtitles, News websites, and Social corpus.
+        # wikipedia
+        "WikipediaRetrievalMultilingual": [],
+        "WikipediaRerankingMultilingual": [],
+        "RiaNewsRetrieval": [],  # probably
+    },
 )
 
 rubert_base_cased = ModelMeta(
@@ -246,7 +258,12 @@
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
+    adapted_from="google/bert_uncased_L-12_H-768_A-12",
+    training_datasets={
+        # wikipedia
+        "WikipediaRetrievalMultilingual": [],
+        "WikipediaRerankingMultilingual": [],
+    },
 )
 
 distilrubert_small_cased_conversational = ModelMeta(
@@ -266,7 +283,10 @@
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
+    adapted_from="DeepPavlov/distilrubert-base-cased-conversational",
+    training_datasets={
+        # OpenSubtitles[1], Dirty, Pikabu, and a Social Media segment of Taiga corpus
+    },
 )
 
 rubert_base_cased_sentence = ModelMeta(
@@ -309,10 +329,15 @@
     use_instructions=False,
     public_training_code="https://colab.research.google.com/drive/1dnPRn0-ugj3vZgSpyCC9sgslM2SuSfHy?usp=sharing",
     public_training_data=None,
-    training_datasets=None,
+    training_datasets={
+        # https://translate.yandex.ru/corpus
+    },
     adapted_from="sentence-transformers/LaBSE",
 )
 
+turbo_models_datasets = {
+    # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]},
+}
 rubert_tiny_turbo = ModelMeta(
     name="sergeyzh/rubert-tiny-turbo",
     languages=["rus_Cyrl"],
@@ -330,8 +355,7 @@
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
-    training_datasets=None,
-    # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]},
+    training_datasets=turbo_models_datasets,
     adapted_from="cointegrated/rubert-tiny2",
 )
 
@@ -350,8 +374,7 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
-    training_datasets=None,
-    # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]},
+    training_datasets=turbo_models_datasets,
     public_training_code=None,
     adapted_from="cointegrated/LaBSE-en-ru",
     public_training_data=None,