Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions mteb/_evaluators/any_sts_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from mteb.abstasks.task_metadata import TaskMetadata
from mteb.models import EncoderProtocol
from mteb.similarity_functions import compute_pairwise_similarity
from mteb.types import PromptType

from .evaluator import Evaluator

Expand Down Expand Up @@ -42,6 +43,8 @@ def __init__(
task_metadata: TaskMetadata,
hf_split: str,
hf_subset: str,
input1_prompt_type: PromptType | None,
input2_prompt_type: PromptType | None,
**kwargs,
) -> None:
super().__init__(**kwargs)
Expand All @@ -50,6 +53,8 @@ def __init__(
self.task_metadata = task_metadata
self.hf_split = hf_split
self.hf_subset = hf_subset
self.input1_prompt_type = input1_prompt_type
self.input2_prompt_type = input2_prompt_type

def __call__(
self,
Expand All @@ -68,6 +73,7 @@ def __call__(
task_metadata=self.task_metadata,
hf_split=self.hf_split,
hf_subset=self.hf_subset,
prompt_type=self.input1_prompt_type,
**encode_kwargs,
)

Expand All @@ -82,6 +88,7 @@ def __call__(
task_metadata=self.task_metadata,
hf_split=self.hf_split,
hf_subset=self.hf_subset,
prompt_type=self.input2_prompt_type,
**encode_kwargs,
)

Expand Down
68 changes: 29 additions & 39 deletions mteb/_evaluators/pair_classification_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from mteb.abstasks.task_metadata import TaskMetadata
from mteb.models import EncoderProtocol
from mteb.similarity_functions import compute_pairwise_similarity
from mteb.types import PromptType

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -60,6 +61,8 @@ def __init__(
task_metadata: TaskMetadata,
hf_split: str,
hf_subset: str,
input1_prompt_type: PromptType | None,
input2_prompt_type: PromptType | None,
**kwargs,
) -> None:
super().__init__(**kwargs)
Expand All @@ -69,6 +72,8 @@ def __init__(
self.task_metadata = task_metadata
self.hf_split = hf_split
self.hf_subset = hf_subset
self.input1_prompt_type = input1_prompt_type
self.input2_prompt_type = input2_prompt_type

if len(self.dataset[self.input1_column_name]) != len(
self.dataset[self.input2_column_name]
Expand All @@ -82,49 +87,34 @@ def __call__(
model: EncoderProtocol,
encode_kwargs: dict[str, Any],
) -> PairClassificationDistances:
logger.info("Running pair classification - Encoding inputs...")
if self.task_metadata.modalities == ["text"]:
# datasets v4 will pass column objects, so we need to extract the text
all_sentences = (
self.dataset[self.input1_column_name][:]
+ self.dataset[self.input2_column_name][:]
)
len_sentences1 = len(self.dataset[self.input1_column_name])
embeddings = self._encode_unique_texts(
all_sentences,
model,
task_metadata=self.task_metadata,
hf_split=self.hf_split,
hf_subset=self.hf_subset,
**encode_kwargs,
)
embeddings1 = embeddings[:len_sentences1]
embeddings2 = embeddings[len_sentences1:]
else:
embeddings1 = model.encode(
create_dataloader(
self.dataset,
task_metadata=self.task_metadata,
input_column=self.input1_column_name,
**encode_kwargs,
),
logger.info("Running pair classification - Encoding samples (1/2)")
embeddings1 = model.encode(
create_dataloader(
self.dataset,
task_metadata=self.task_metadata,
hf_split=self.hf_split,
hf_subset=self.hf_subset,
input_column=self.input1_column_name,
**encode_kwargs,
)
embeddings2 = model.encode(
create_dataloader(
self.dataset,
task_metadata=self.task_metadata,
input_column=self.input2_column_name,
**encode_kwargs,
),
),
task_metadata=self.task_metadata,
hf_split=self.hf_split,
hf_subset=self.hf_subset,
prompt_type=self.input1_prompt_type,
**encode_kwargs,
)
logger.info("Running pair classification - Encoding samples (2/2)")
embeddings2 = model.encode(
create_dataloader(
self.dataset,
task_metadata=self.task_metadata,
hf_split=self.hf_split,
hf_subset=self.hf_subset,
input_column=self.input2_column_name,
**encode_kwargs,
)
),
task_metadata=self.task_metadata,
hf_split=self.hf_split,
hf_subset=self.hf_subset,
prompt_type=self.input2_prompt_type,
**encode_kwargs,
)

logger.info("Running pair classification - Evaluating pair similarity...")
cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
Expand Down
9 changes: 8 additions & 1 deletion mteb/abstasks/pair_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from mteb.abstasks.abstask import AbsTask
from mteb.models.model_meta import ScoringFunction
from mteb.models.models_protocols import EncoderProtocol
from mteb.types import PromptType
from mteb.types.statistics import (
ImageStatistics,
LabelStatistics,
Expand All @@ -35,7 +36,7 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
unique_text_pairs: Number of unique pairs
unique_pairs: Number of unique pairs

text1_statistics: Statistics for sentence1
text2_statistics: Statistics for sentence2
Expand Down Expand Up @@ -65,12 +66,16 @@ class AbsTaskPairClassification(AbsTask):
input2_column_name: The name of the column containing the second sentence in the pair.
label_column_name: The name of the column containing the labels for the pairs. Labels should be 0 or 1.
abstask_prompt: Prompt to use for the task for instruction models if no prompt is provided in TaskMetadata.prompt.
input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
"""

abstask_prompt = "Retrieve text that are semantically similar to the given text."
input1_column_name: str = "sentence1"
input2_column_name: str = "sentence2"
label_column_name: str = "labels"
input1_prompt_type: PromptType | None = None
input2_prompt_type: PromptType | None = None

def _evaluate_subset(
self,
Expand All @@ -93,6 +98,8 @@ def _evaluate_subset(
task_metadata=self.metadata,
hf_split=hf_split,
hf_subset=hf_subset,
input1_prompt_type=self.input1_prompt_type,
input2_prompt_type=self.input2_prompt_type,
**kwargs,
)
similarity_scores = evaluator(model, encode_kwargs=encode_kwargs)
Expand Down
7 changes: 7 additions & 0 deletions mteb/abstasks/sts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from mteb._evaluators import AnySTSEvaluator
from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
from mteb.models import EncoderProtocol
from mteb.types import PromptType
from mteb.types.statistics import (
ImageStatistics,
ScoreStatistics,
Expand Down Expand Up @@ -89,12 +90,16 @@ class AbsTaskSTS(AbsTask):
min_score: Minimum possible score in the dataset.
max_score: Maximum possible score in the dataset.
abstask_prompt: Prompt to use for the task for instruction models if no prompt is provided in TaskMetadata.prompt.
input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
"""

abstask_prompt = "Retrieve semantically similar text."
column_names: tuple[str, str] = ("sentence1", "sentence2")
min_score: int = 0
max_score: int = 5
input1_prompt_type: PromptType | None = None
input2_prompt_type: PromptType | None = None

def _evaluate_subset(
self,
Expand All @@ -115,6 +120,8 @@ def _evaluate_subset(
task_metadata=self.metadata,
hf_split=hf_split,
hf_subset=hf_subset,
input1_prompt_type=self.input1_prompt_type,
input2_prompt_type=self.input2_prompt_type,
**kwargs,
)
scores = evaluator(model, encode_kwargs=encode_kwargs)
Expand Down
35 changes: 35 additions & 0 deletions mteb/descriptive_stats/PairClassification/TERRa.V2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"dev": {
"num_samples": 307,
"number_of_characters": 84848,
"unique_pairs": 307,
"text1_statistics": {
"total_text_length": 70844,
"min_text_length": 39,
"average_text_length": 230.76221498371336,
"max_text_length": 717,
"unique_texts": 282
},
"text2_statistics": {
"total_text_length": 14004,
"min_text_length": 12,
"average_text_length": 45.615635179153095,
"max_text_length": 129,
"unique_texts": 307
},
"labels_statistics": {
"min_labels_per_text": 1,
"average_label_per_text": 1.0,
"max_labels_per_text": 1,
"unique_labels": 2,
"labels": {
"1": {
"count": 153
},
"0": {
"count": 154
}
}
}
}
}
4 changes: 2 additions & 2 deletions mteb/tasks/pair_classification/rus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .terra import TERRa
from .terra import TERRa, TERRaV2

__all__ = ["TERRa"]
__all__ = ["TERRa", "TERRaV2"]
76 changes: 51 additions & 25 deletions mteb/tasks/pair_classification/rus/terra.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,27 @@
from mteb.abstasks.pair_classification import AbsTaskPairClassification
from mteb.abstasks.task_metadata import TaskMetadata
from mteb.types import PromptType


class TERRa(AbsTaskPairClassification):
metadata = TaskMetadata(
name="TERRa",
dataset={
"path": "ai-forever/terra-pairclassification",
"revision": "7b58f24536063837d644aab9a023c62199b2a612",
},
description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
+ "whether the meaning of one text is entailed (can be inferred) from the other text.",
reference="https://arxiv.org/pdf/2010.15925",
type="PairClassification",
category="t2t",
modalities=["text"],
eval_splits=["dev"],
eval_langs=["rus-Cyrl"],
main_score="max_ap",
date=("2000-01-01", "2018-01-01"),
domains=["News", "Web", "Written"],
task_subtypes=[],
license="mit",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
bibtex_citation=r"""
_terra_metadata = dict(
dataset={
"path": "ai-forever/terra-pairclassification",
"revision": "7b58f24536063837d644aab9a023c62199b2a612",
},
reference="https://arxiv.org/pdf/2010.15925",
type="PairClassification",
category="t2t",
modalities=["text"],
eval_splits=["dev"],
eval_langs=["rus-Cyrl"],
main_score="max_ap",
date=("2000-01-01", "2018-01-01"),
domains=["News", "Web", "Written"],
task_subtypes=[],
license="mit",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
bibtex_citation=r"""
@article{shavrina2020russiansuperglue,
author = {Shavrina, Tatiana
and Fenogenova, Alena
Expand All @@ -42,7 +38,37 @@ class TERRa(AbsTaskPairClassification):
year = {2020},
}
""",
)


class TERRa(AbsTaskPairClassification):
    """Russian textual-entailment pair-classification task (original TERRa variant)."""

    # Only the task-specific fields are spelled out here; the remaining
    # metadata fields are taken from the `_terra_metadata` mapping.
    metadata = TaskMetadata(
        name="TERRa",
        description=(
            "Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
            "whether the meaning of one text is entailed (can be inferred) from the other text."
        ),
        prompt="Given a premise, retrieve a hypothesis that is entailed by the premise",
        **_terra_metadata,
    )

    def dataset_transform(self):
        """Rename the `sent1`/`sent2` columns to `sentence1`/`sentence2`."""
        column_renames = (("sent1", "sentence1"), ("sent2", "sentence2"))
        for old_name, new_name in column_renames:
            self.dataset = self.dataset.rename_column(old_name, new_name)


class TERRaV2(AbsTaskPairClassification):
input1_prompt_type = PromptType.document
input2_prompt_type = PromptType.query

metadata = TaskMetadata(
name="TERRa.V2",
description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
+ "whether the meaning of one text is entailed (can be inferred) from the other text."
+ " Version 2 uses different prompt types for the two inputs.",
adapted_from=["TERRa"],
prompt={
PromptType.query.value: "Given a premise, retrieve a hypothesis that is entailed by the premise"
},
**_terra_metadata,
)

def dataset_transform(self):
Expand Down
2 changes: 2 additions & 0 deletions tests/test_evaluators/test_PairClassificationEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def test_accuracy(self):
MockClassificationTask.metadata,
"test",
"test",
input1_prompt_type=None,
input2_prompt_type=None,
)
distances = evaluator(
mteb.get_model("baseline/random-encoder-baseline"),
Expand Down
2 changes: 2 additions & 0 deletions tests/test_evaluators/test_STSEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def test_output_structure(model, mock_task):
task_metadata=mock_task.metadata,
hf_subset="default",
hf_split="test",
input1_prompt_type=None,
input2_prompt_type=None,
)
scores = evaluator(model, encode_kwargs={"batch_size": 32})

Expand Down
Loading