Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 46 additions & 3 deletions mteb/models/ru_sentence_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,47 @@
nomic_training_data,
)

# Prompt table for the Giga-Embeddings model, keyed by task name.
#
# A plain string value is the single prompt registered for that task. A nested
# dict holds separate "query" and "passage" entries; every "passage" entry in
# this table is the empty string. Each prompt already ends with its own input
# marker ("query: ", "отзыв: ", ...), so the strings are stored complete.
#
# NOTE: key order and the exact bytes of every prompt (including the trailing
# space before "\n" in the Russian prompts) are kept as-is on purpose.
GIGA_task_prompts = {
    # Entailment-style prompt.
    "TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise\nquery: ",
    # The three similarity tasks share one prompt (str is immutable, so
    # sharing the same value object across keys is safe).
    **dict.fromkeys(
        ("STS22", "RuSTSBenchmarkSTS", "RUParaPhraserSTS"),
        "Retrieve semantically similar text\nquery: ",
    ),
    # Classification / clustering tasks; some prompts are English, some Russian.
    "CEDRClassification": (
        "Дан комментарий, определи выраженную в нем эмоцию "
        "(радость, грусть, удивление, страх, гнев или нейтрально) \nкомментарий: "
    ),
    "GeoreviewClassification": "Classify the organization rating based on the reviews\nquery: ",
    "GeoreviewClusteringP2P": "Классифицируй рейтинг организации на основе отзыва \nотзыв: ",
    "HeadlineClassification": "Классифицируй тему данного новостного заголовка \nзаголовок: ",
    "InappropriatenessClassification": (
        "Классифицируй данный комментарий как токсичный или не токсичный \nкомментарий: "
    ),
    "KinopoiskClassification": (
        "Classify the sentiment expressed in the given movie review text\nquery: "
    ),
    "MassiveIntentClassification": "Given a user utterance as query, find the user intents\nquery: ",
    "MassiveScenarioClassification": (
        "Given a user utterance as query, find the user scenarios\nquery: "
    ),
    "RuReviewsClassification": (
        "Classify product reviews into positive, negative or neutral sentiment\nquery: "
    ),
    "RuSciBenchGRNTIClassification": (
        "Classify the category of scientific papers based on the titles and abstracts\nquery: "
    ),
    "RuSciBenchGRNTIClusteringP2P": (
        "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: "
    ),
    "RuSciBenchOECDClassification": (
        "Classify the category of scientific papers based on the titles and abstracts\nquery: "
    ),
    "RuSciBenchOECDClusteringP2P": (
        "Классифицируй категорию научной статьи основываясь на аннотации \nаннотация: "
    ),
    "SensitiveTopicsClassification": "Классифицируй чувствительную тему по запросу \nзапрос: ",
    # Retrieval / reranking tasks: only the query side carries an instruction.
    # A fresh inner dict is built per task so no two entries alias each other.
    **{
        task_name: {"query": instruction + "\nquery: ", "passage": ""}
        for task_name, instruction in (
            (
                "RuBQRetrieval",
                "Given a question, retrieve Wikipedia passages that answer the question",
            ),
            (
                "RuBQReranking",
                "Given a question, retrieve Wikipedia passages that answer the question",
            ),
            (
                "RiaNewsRetrieval",
                "Given a news title, retrieve relevant news article",
            ),
            (
                "MIRACLReranking",
                "Given a question, retrieve Wikipedia passages that answer the question",
            ),
            (
                "MIRACLRetrieval",
                "Given a question, retrieve Wikipedia passages that answer the question",
            ),
        )
    },
}

rubert_tiny = ModelMeta(
name="cointegrated/rubert-tiny",
languages=["rus-Cyrl"],
Expand Down Expand Up @@ -589,13 +630,15 @@
)

giga_embeddings = ModelMeta(
loader=partial(
loader=partial( # type: ignore
InstructSentenceTransformerWrapper,
model_name="ai-sage/Giga-Embeddings-instruct",
revision="40b27667b9ad586d7812675df76e5062ccc80b0e",
trust_remote_code=True,
instruction_template="{instruction}\nquery: ",
instruction_template="{instruction}",
max_seq_length=512,
apply_instruction_to_passages=False,
prompts_dict=GIGA_task_prompts,
trust_remote_code=True,
model_kwargs={
"torch_dtype": torch.bfloat16,
},
Expand Down