# Per-task prompts for the "ai-sage/Giga-Embeddings-instruct" model.
#
# The mapping is keyed by task name. A value is either
#   * a single prompt string, or
#   * a {"query": ..., "passage": ...} dict when the two sides of a
#     retrieval/reranking task get different prompts (passages get an empty
#     prompt here).
#
# The `giga_embeddings` loader in this module passes this mapping as
# `prompts_dict` together with `instruction_template="{instruction}"`, so the
# template no longer appends a "\nquery: " suffix itself; every non-empty
# prompt below therefore carries its own trailing "<label>: " marker.
# NOTE(review): how the wrapper selects between a plain string and the
# query/passage dict is not visible from here - confirm against
# InstructSentenceTransformerWrapper.
#
# Several Russian prompts intentionally keep a space before the "\n"; these
# strings are model inputs, so do not normalise their whitespace.
GIGA_task_prompts = {
    "TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise\nquery: ",
    "STS22": "Retrieve semantically similar text\nquery: ",
    "RuSTSBenchmarkSTS": "Retrieve semantically similar text\nquery: ",
    "RUParaPhraserSTS": "Retrieve semantically similar text\nquery: ",
    "CEDRClassification": "Дан комментарий, определи выраженную в нем эмоцию (радость, грусть, удивление, страх, гнев или нейтрально) \nкомментарий: ",
    "GeoreviewClassification": "Classify the organization rating based on the reviews\nquery: ",
    "GeoreviewClusteringP2P": "Классифицируй рейтинг организации на основе отзыва \nотзыв: ",
    "HeadlineClassification": "Классифицируй тему данного новостного заголовка \nзаголовок: ",
    "InappropriatenessClassification": "Классифицируй данный комментарий как токсичный или не токсичный \nкомментарий: ",
    "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text\nquery: ",
    "MassiveIntentClassification": "Given a user utterance as query, find the user intents\nquery: ",
    "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios\nquery: ",
    "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment\nquery: ",
    "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts\nquery: ",
    "RuSciBenchGRNTIClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: ",
    "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts\nquery: ",
    # Same prompt as RuSciBenchGRNTIClusteringP2P. The literal previously had
    # a stray line-break character after "аннотация: "; it has been removed so
    # that both P2P prompts are identical and end right after the label.
    "RuSciBenchOECDClusteringP2P": "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: ",
    "SensitiveTopicsClassification": "Классифицируй чувствительную тему по запросу \nзапрос: ",
    "RuBQRetrieval": {
        "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ",
        "passage": "",
    },
    "RuBQReranking": {
        "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ",
        "passage": "",
    },
    "RiaNewsRetrieval": {
        "query": "Given a news title, retrieve relevant news article\nquery: ",
        "passage": "",
    },
    "MIRACLReranking": {
        "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ",
        "passage": "",
    },
    "MIRACLRetrieval": {
        "query": "Given a question, retrieve Wikipedia passages that answer the question\nquery: ",
        "passage": "",
    },
}