embeddings-benchmark · Samoed · Nov 25, 2025 · Nov 15, 2025 · Nov 15, 2025 · Nov 22, 2025
diff --git a/mteb/_evaluators/any_sts_evaluator.py b/mteb/_evaluators/any_sts_evaluator.py
@@ -12,6 +12,7 @@
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import compute_pairwise_similarity
+from mteb.types import PromptType
 
 from .evaluator import Evaluator
 
@@ -42,6 +43,8 @@ def __init__(
         task_metadata: TaskMetadata,
         hf_split: str,
         hf_subset: str,
+        input1_prompt_type: PromptType | None,
+        input2_prompt_type: PromptType | None,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -50,6 +53,8 @@ def __init__(
         self.task_metadata = task_metadata
         self.hf_split = hf_split
         self.hf_subset = hf_subset
+        self.input1_prompt_type = input1_prompt_type
+        self.input2_prompt_type = input2_prompt_type
 
     def __call__(
         self,
@@ -68,6 +73,7 @@ def __call__(
             task_metadata=self.task_metadata,
             hf_split=self.hf_split,
             hf_subset=self.hf_subset,
+            prompt_type=self.input1_prompt_type,
             **encode_kwargs,
         )
 
@@ -82,6 +88,7 @@ def __call__(
             task_metadata=self.task_metadata,
             hf_split=self.hf_split,
             hf_subset=self.hf_subset,
+            prompt_type=self.input2_prompt_type,
             **encode_kwargs,
         )
 

diff --git a/mteb/_evaluators/pair_classification_evaluator.py b/mteb/_evaluators/pair_classification_evaluator.py
@@ -14,6 +14,7 @@
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import compute_pairwise_similarity
+from mteb.types import PromptType
 
 logger = logging.getLogger(__name__)
 
@@ -60,6 +61,8 @@ def __init__(
         task_metadata: TaskMetadata,
         hf_split: str,
         hf_subset: str,
+        input1_prompt_type: PromptType | None,
+        input2_prompt_type: PromptType | None,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -69,6 +72,8 @@ def __init__(
         self.task_metadata = task_metadata
         self.hf_split = hf_split
         self.hf_subset = hf_subset
+        self.input1_prompt_type = input1_prompt_type
+        self.input2_prompt_type = input2_prompt_type
 
         if len(self.dataset[self.input1_column_name]) != len(
             self.dataset[self.input2_column_name]
@@ -82,49 +87,34 @@ def __call__(
         model: EncoderProtocol,
         encode_kwargs: dict[str, Any],
     ) -> PairClassificationDistances:
-        logger.info("Running pair classification - Encoding inputs...")
-        if self.task_metadata.modalities == ["text"]:
-            # datasets v4 will pass column objects, so we need to extract the text
-            all_sentences = (
-                self.dataset[self.input1_column_name][:]
-                + self.dataset[self.input2_column_name][:]
-            )
-            len_sentences1 = len(self.dataset[self.input1_column_name])
-            embeddings = self._encode_unique_texts(
-                all_sentences,
-                model,
-                task_metadata=self.task_metadata,
-                hf_split=self.hf_split,
-                hf_subset=self.hf_subset,
-                **encode_kwargs,
-            )
-            embeddings1 = embeddings[:len_sentences1]
-            embeddings2 = embeddings[len_sentences1:]
-        else:
-            embeddings1 = model.encode(
-                create_dataloader(
-                    self.dataset,
-                    task_metadata=self.task_metadata,
-                    input_column=self.input1_column_name,
-                    **encode_kwargs,
-                ),
+        logger.info("Running pair classification - Encoding samples (1/2)")
+        embeddings1 = model.encode(
+            create_dataloader(
+                self.dataset,
                 task_metadata=self.task_metadata,
-                hf_split=self.hf_split,
-                hf_subset=self.hf_subset,
+                input_column=self.input1_column_name,
                 **encode_kwargs,
-            )
-            embeddings2 = model.encode(
-                create_dataloader(
-                    self.dataset,
-                    task_metadata=self.task_metadata,
-                    input_column=self.input2_column_name,
-                    **encode_kwargs,
-                ),
+            ),
+            task_metadata=self.task_metadata,
+            hf_split=self.hf_split,
+            hf_subset=self.hf_subset,
+            prompt_type=self.input1_prompt_type,
+            **encode_kwargs,
+        )
+        logger.info("Running pair classification - Encoding samples (2/2)")
+        embeddings2 = model.encode(
+            create_dataloader(
+                self.dataset,
                 task_metadata=self.task_metadata,
-                hf_split=self.hf_split,
-                hf_subset=self.hf_subset,
+                input_column=self.input2_column_name,
                 **encode_kwargs,
-            )
+            ),
+            task_metadata=self.task_metadata,
+            hf_split=self.hf_split,
+            hf_subset=self.hf_subset,
+            prompt_type=self.input2_prompt_type,
+            **encode_kwargs,
+        )
 
         logger.info("Running pair classification - Evaluating pair similarity...")
         cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)

diff --git a/mteb/abstasks/pair_classification.py b/mteb/abstasks/pair_classification.py
@@ -19,6 +19,7 @@
 from mteb.abstasks.abstask import AbsTask
 from mteb.models.model_meta import ScoringFunction
 from mteb.models.models_protocols import EncoderProtocol
+from mteb.types import PromptType
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -35,7 +36,7 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
     Attributes:
         num_samples: number of samples in the dataset.
         number_of_characters: Total number of symbols in the dataset.
-        unique_text_pairs: Number of unique pairs
+        unique_pairs: Number of unique pairs
 
         text1_statistics: Statistics for sentence1
         text2_statistics: Statistics for sentence2
@@ -65,12 +66,16 @@ class AbsTaskPairClassification(AbsTask):
         input2_column_name: The name of the column containing the second sentence in the pair.
         label_column_name: The name of the column containing the labels for the pairs. Labels should be 0 or 1.
         abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
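+        input1_prompt_type: Prompt type passed to the encoder when embedding the first input. Defaults to None; set it for asymmetric tasks.
+        input2_prompt_type: Prompt type passed to the encoder when embedding the second input. Defaults to None; set it for asymmetric tasks.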
     """
 
     abstask_prompt = "Retrieve text that are semantically similar to the given text."
     input1_column_name: str = "sentence1"
     input2_column_name: str = "sentence2"
     label_column_name: str = "labels"
+    input1_prompt_type: PromptType | None = None
+    input2_prompt_type: PromptType | None = None
 
     def _evaluate_subset(
         self,
@@ -93,6 +98,8 @@ def _evaluate_subset(
             task_metadata=self.metadata,
             hf_split=hf_split,
             hf_subset=hf_subset,
+            input1_prompt_type=self.input1_prompt_type,
+            input2_prompt_type=self.input2_prompt_type,
             **kwargs,
         )
         similarity_scores = evaluator(model, encode_kwargs=encode_kwargs)

diff --git a/mteb/abstasks/sts.py b/mteb/abstasks/sts.py
@@ -8,6 +8,7 @@
 from mteb._evaluators import AnySTSEvaluator
 from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
 from mteb.models import EncoderProtocol
+from mteb.types import PromptType
 from mteb.types.statistics import (
     ImageStatistics,
     ScoreStatistics,
@@ -89,12 +90,16 @@ class AbsTaskSTS(AbsTask):
         min_score: Minimum possible score in the dataset.
         max_score: Maximum possible score in the dataset.
         abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
+        input1_prompt_type: Prompt type forwarded to the evaluator for the first input. Defaults to None; set it for asymmetric tasks.
+        input2_prompt_type: Prompt type forwarded to the evaluator for the second input. Defaults to None; set it for asymmetric tasks.
     """
 
     abstask_prompt = "Retrieve semantically similar text."
     column_names: tuple[str, str] = ("sentence1", "sentence2")
     min_score: int = 0
     max_score: int = 5
+    input1_prompt_type: PromptType | None = None
+    input2_prompt_type: PromptType | None = None
 
     def _evaluate_subset(
         self,
@@ -115,6 +120,8 @@ def _evaluate_subset(
             task_metadata=self.metadata,
             hf_split=hf_split,
             hf_subset=hf_subset,
+            input1_prompt_type=self.input1_prompt_type,
+            input2_prompt_type=self.input2_prompt_type,
             **kwargs,
         )
         scores = evaluator(model, encode_kwargs=encode_kwargs)

diff --git a/mteb/descriptive_stats/PairClassification/TERRa.V2.json b/mteb/descriptive_stats/PairClassification/TERRa.V2.json
@@ -0,0 +1,35 @@
+{
+    "dev": {
+        "num_samples": 307,
+        "number_of_characters": 84848,
+        "unique_pairs": 307,
+        "text1_statistics": {
+            "total_text_length": 70844,
+            "min_text_length": 39,
+            "average_text_length": 230.76221498371336,
+            "max_text_length": 717,
+            "unique_texts": 282
+        },
+        "text2_statistics": {
+            "total_text_length": 14004,
+            "min_text_length": 12,
+            "average_text_length": 45.615635179153095,
+            "max_text_length": 129,
+            "unique_texts": 307
+        },
+        "labels_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1": {
+                    "count": 153
+                },
+                "0": {
+                    "count": 154
+                }
+            }
+        }
+    }
+}
diff --git a/mteb/tasks/pair_classification/rus/__init__.py b/mteb/tasks/pair_classification/rus/__init__.py
@@ -1,3 +1,3 @@
-from .terra import TERRa
+from .terra import TERRa, TERRaV2
 
-__all__ = ["TERRa"]
+__all__ = ["TERRa", "TERRaV2"]
diff --git a/mteb/tasks/pair_classification/rus/terra.py b/mteb/tasks/pair_classification/rus/terra.py
@@ -1,31 +1,27 @@
 from mteb.abstasks.pair_classification import AbsTaskPairClassification
 from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.types import PromptType
 
-
-class TERRa(AbsTaskPairClassification):
-    metadata = TaskMetadata(
-        name="TERRa",
-        dataset={
-            "path": "ai-forever/terra-pairclassification",
-            "revision": "7b58f24536063837d644aab9a023c62199b2a612",
-        },
-        description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
-        + "whether the meaning of one text is entailed (can be inferred) from the other text.",
-        reference="https://arxiv.org/pdf/2010.15925",
-        type="PairClassification",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["rus-Cyrl"],
-        main_score="max_ap",
-        date=("2000-01-01", "2018-01-01"),
-        domains=["News", "Web", "Written"],
-        task_subtypes=[],
-        license="mit",
-        annotations_creators="human-annotated",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
+_terra_metadata = dict(
+    dataset={
+        "path": "ai-forever/terra-pairclassification",
+        "revision": "7b58f24536063837d644aab9a023c62199b2a612",
+    },
+    reference="https://arxiv.org/pdf/2010.15925",
+    type="PairClassification",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["dev"],
+    eval_langs=["rus-Cyrl"],
+    main_score="max_ap",
+    date=("2000-01-01", "2018-01-01"),
+    domains=["News", "Web", "Written"],
+    task_subtypes=[],
+    license="mit",
+    annotations_creators="human-annotated",
+    dialect=[],
+    sample_creation="found",
+    bibtex_citation=r"""
 @article{shavrina2020russiansuperglue,
   author = {Shavrina, Tatiana
 and Fenogenova, Alena
@@ -42,7 +38,37 @@ class TERRa(AbsTaskPairClassification):
   year = {2020},
 }
 """,
+)
+
+
+class TERRa(AbsTaskPairClassification):
+    metadata = TaskMetadata(
+        name="TERRa",
+        description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
+        + "whether the meaning of one text is entailed (can be inferred) from the other text.",
         prompt="Given a premise, retrieve a hypothesis that is entailed by the premise",
+        **_terra_metadata,
+    )
+
+    def dataset_transform(self):
+        self.dataset = self.dataset.rename_column("sent1", "sentence1")
+        self.dataset = self.dataset.rename_column("sent2", "sentence2")
+
+
+class TERRaV2(AbsTaskPairClassification):
+    input1_prompt_type = PromptType.document
+    input2_prompt_type = PromptType.query
+
+    metadata = TaskMetadata(
+        name="TERRa.V2",
+        description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
+        + "whether the meaning of one text is entailed (can be inferred) from the other text."
+        + " Version 2 uses different prompt types for the two inputs.",
+        adapted_from=["TERRa"],
+        prompt={
+            PromptType.query.value: "Given a premise, retrieve a hypothesis that is entailed by the premise"
+        },
+        **_terra_metadata,
     )
 
     def dataset_transform(self):

diff --git a/tests/test_evaluators/test_PairClassificationEvaluator.py b/tests/test_evaluators/test_PairClassificationEvaluator.py
@@ -21,6 +21,8 @@ def test_accuracy(self):
             MockClassificationTask.metadata,
             "test",
             "test",
+            input1_prompt_type=None,
+            input2_prompt_type=None,
         )
         distances = evaluator(
             mteb.get_model("baseline/random-encoder-baseline"),

diff --git a/tests/test_evaluators/test_STSEvaluator.py b/tests/test_evaluators/test_STSEvaluator.py
@@ -29,6 +29,8 @@ def test_output_structure(model, mock_task):
         task_metadata=mock_task.metadata,
         hf_subset="default",
         hf_split="test",
+        input1_prompt_type=None,
+        input2_prompt_type=None,
     )
     scores = evaluator(model, encode_kwargs={"batch_size": 32})