diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py
index 31169fab68..2c96973ab3 100644
--- a/mteb/abstasks/AbsTask.py
+++ b/mteb/abstasks/AbsTask.py
@@ -170,7 +170,7 @@ def evaluate(
     def _evaluate_subset(
         self,
         model: Encoder,
-        data_split: DatasetDict | Dataset,
+        data_split: Dataset,
         encode_kwargs: dict[str, Any],
         hf_split: str,
         hf_subset: str,
diff --git a/mteb/abstasks/AbsTaskAnySTS.py b/mteb/abstasks/AbsTaskAnySTS.py
new file mode 100644
index 0000000000..5ddd8d7445
--- /dev/null
+++ b/mteb/abstasks/AbsTaskAnySTS.py
@@ -0,0 +1,195 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from datasets import Dataset
+
+from mteb.abstasks.TaskMetadata import (
+    DescriptiveStatistics,
+    ImageStatistics,
+    ScoreStatistics,
+    TextStatistics,
+)
+from mteb.encoder_interface import Encoder
+
+from ..evaluation.evaluators import AnySTSEvaluator
+from ..load_results.task_results import ScoresDict
+from .AbsTask import AbsTask
+
+logger = logging.getLogger(__name__)
+
+
+class AnySTSDescriptiveStatistics(DescriptiveStatistics):
+    """Descriptive statistics for STS
+
+    Attributes:
+        num_samples: number of samples in the dataset.
+        number_of_characters: Total number of symbols in the dataset.
+        unique_pairs: Number of unique pairs
+
+        text1_statistics: Statistics for sentence1
+        text2_statistics: Statistics for sentence2
+
+        image1_statistics: Statistics for image1
+        image2_statistics: Statistics for image2
+
+        label_statistics: Statistics for labels
+    """
+
+    num_samples: int
+    number_of_characters: int | None
+    unique_pairs: int | None
+
+    text1_statistics: TextStatistics | None
+    text2_statistics: TextStatistics | None
+
+    image1_statistics: ImageStatistics | None
+    image2_statistics: ImageStatistics | None
+
+    label_statistics: ScoreStatistics
+
+
+class AbsTaskAnySTS(AbsTask):
+    """Abstract class for STS experiments.
+
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata.eval_splits, and assign it to self.dataset. It must contain the following columns::
+        sentence1: str | PIL.Image.Image
+        sentence2: str | PIL.Image.Image
+        score: float
+    """
+
+    abstask_prompt = "Retrieve semantically similar text."
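+    # Shared defaults for text and visual STS tasks: subclasses override
+    # column_names to point at their sentence/image pair columns, and
+    # min_score/max_score to the raw annotation range normalize() maps onto [0, 1].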
+    column_names: tuple[str, str] = ("sentence1", "sentence2")
+    min_score: int = 0
+    max_score: int = 5
+
+    def _evaluate_subset(
+        self,
+        model: Encoder,
+        data_split: Dataset,
+        encode_kwargs: dict[str, Any],
+        hf_split: str,
+        hf_subset: str,
+        **kwargs: Any,
+    ) -> ScoresDict:
+        normalized_scores = list(map(self.normalize, data_split["score"]))
+        evaluator = AnySTSEvaluator(
+            data_split,
+            self.column_names,
+            normalized_scores,
+            task_metadata=self.metadata,
+            hf_split=hf_split,
+            hf_subset=hf_subset,
+            **kwargs,
+        )
+        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        return scores
+
+    def _calculate_metrics_from_split(
+        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
+    ) -> AnySTSDescriptiveStatistics:
+        first_column, second_column = self.column_names
+        if hf_subset:
+            sentence1 = self.dataset[hf_subset][split][first_column]
+            sentence2 = self.dataset[hf_subset][split][second_column]
+            score = self.dataset[hf_subset][split]["score"]
+        elif compute_overall:
+            sentence1 = []
+            sentence2 = []
+            score = []
+            for hf_subset in self.metadata.eval_langs:
+                sentence1.extend(self.dataset[hf_subset][split][first_column])
+                sentence2.extend(self.dataset[hf_subset][split][second_column])
+                score.extend(self.dataset[hf_subset][split]["score"])
+        else:
+            sentence1 = self.dataset[split][first_column]
+            sentence2 = self.dataset[split][second_column]
+            score = self.dataset[split]["score"]
+
+        if "text" in self.metadata.modalities:
+            text1_statistics = TextStatistics(
+                min_text_length=min(len(s) for s in sentence1),
+                average_text_length=sum(len(s) for s in sentence1) / len(sentence1),
+                max_text_length=max(len(s) for s in sentence1),
+                unique_texts=len(set(sentence1)),
+            )
+            text2_statistics = TextStatistics(
+                min_text_length=min(len(s) for s in sentence2),
+                average_text_length=sum(len(s) for s in sentence2) / len(sentence2),
+                max_text_length=max(len(s) for s in sentence2),
+                unique_texts=len(set(sentence2)),
+            )
+            sentence1_len = [len(s) for s in sentence1]
+            sentence2_len = [len(s) for s in sentence2]
+            number_of_characters = sum(sentence1_len) + sum(sentence2_len)
+            unique_pairs = len(set(zip(sentence1, sentence2)))
+        else:
+            text1_statistics = None
+            text2_statistics = None
+            number_of_characters = None
+            unique_pairs = None
+
+        if "image" in self.metadata.modalities:
+            img_widths1, img_heights1 = [], []
+            for img in sentence1:
+                width, height = img.size
+                img_heights1.append(height)
+                img_widths1.append(width)
+
+            image1_statistics = ImageStatistics(
+                min_image_width=min(img_widths1),
+                average_image_width=sum(img_widths1) / len(img_widths1),
+                max_image_width=max(img_widths1),
+                min_image_height=min(img_heights1),
+                average_image_height=sum(img_heights1) / len(img_heights1),
+                max_image_height=max(img_heights1),
+            )
+
+            img_widths2, img_heights2 = [], []
+            for img in sentence2:
+                width, height = img.size
+                img_heights2.append(height)
+                img_widths2.append(width)
+
+            image2_statistics = ImageStatistics(
+                min_image_width=min(img_widths2),
+                average_image_width=sum(img_widths2) / len(img_widths2),
+                max_image_width=max(img_widths2),
+                min_image_height=min(img_heights2),
+                average_image_height=sum(img_heights2) / len(img_heights2),
+                max_image_height=max(img_heights2),
+            )
+        else:
+            image1_statistics = None
+            image2_statistics = None
+
+        labels_statistics = ScoreStatistics(
+            min_score=min(score),
+            avg_score=sum(score) / len(score),
+            max_score=max(score),
+        )
+
+        return AnySTSDescriptiveStatistics(
+            num_samples=len(sentence1),
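+            # The per-modality statistics are None whenever that modality is
+            # absent, so text-only and image-only tasks share one return type.
+            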
number_of_characters=number_of_characters, + unique_pairs=unique_pairs, + text1_statistics=text1_statistics, + text2_statistics=text2_statistics, + image1_statistics=image1_statistics, + image2_statistics=image2_statistics, + label_statistics=labels_statistics, + ) + + def _push_dataset_to_hub(self, repo_name: str) -> None: + self._upload_dataset_to_hub( + repo_name, [self.column_names[0], self.column_names[1], "score"] + ) + + def normalize(self, x: float) -> float: + return (x - self.min_score) / (self.max_score - self.min_score) diff --git a/mteb/abstasks/AbsTaskSTS.py b/mteb/abstasks/AbsTaskSTS.py deleted file mode 100644 index 957e51d679..0000000000 --- a/mteb/abstasks/AbsTaskSTS.py +++ /dev/null @@ -1,139 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Any - -from mteb.abstasks.TaskMetadata import DescriptiveStatistics - -from ..evaluation.evaluators import STSEvaluator -from ..load_results.task_results import ScoresDict -from .AbsTask import AbsTask - -logger = logging.getLogger(__name__) - - -class STSDescriptiveStatistics(DescriptiveStatistics): - """Descriptive statistics for STS - - Attributes: - num_samples: number of samples in the dataset. - number_of_characters: Total number of symbols in the dataset. - unique_pairs: Number of unique pairs - - min_sentence1_length: Minimum length of sentence1 - average_sentence1_len: Average length of sentence1 - max_sentence1_length: Maximum length of sentence1 - - min_sentence2_length: Minimum length of sentence2 - average_sentence2_len: Average length of sentence2 - max_sentence2_length: Maximum length of sentence2 - - min_score: Minimum score - avg_score: Average score - max_score: Maximum score - """ - - num_samples: int - number_of_characters: int - unique_pairs: int - - min_sentence1_length: int - average_sentence1_len: float - max_sentence1_length: int - unique_sentence1: int - - min_sentence2_length: int - average_sentence2_len: float - max_sentence2_length: int - unique_sentence2: int - - min_score: float - avg_score: float - max_score: float - - -class AbsTaskSTS(AbsTask): - """Abstract class for STS experiments. - - self.load_data() must generate a huggingface dataset with a split matching self.metadata.eval_splits, and assign it to self.dataset. It must contain the following columns:: - sentence1: str - sentence2: str - score: float - """ - - abstask_prompt = "Retrieve semantically similar text." 
- min_score: int - max_score: int - - def _evaluate_subset( - self, - model, - data_split, - *, - hf_split: str, - hf_subset: str, - encode_kwargs: dict[str, Any], - **kwargs, - ) -> ScoresDict: - def normalize(x): - return (x - self.min_score) / (self.max_score - self.min_score) - - normalized_scores = list(map(normalize, data_split["score"])) - evaluator = STSEvaluator( - data_split["sentence1"], - data_split["sentence2"], - normalized_scores, - task_metadata=self.metadata, - hf_split=hf_split, - hf_subset=hf_subset, - **kwargs, - ) - scores = evaluator(model, encode_kwargs=encode_kwargs) - - self._add_main_score(scores) - return scores - - def _calculate_metrics_from_split( - self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ) -> STSDescriptiveStatistics: - if hf_subset: - sentence1 = self.dataset[hf_subset][split]["sentence1"] - sentence2 = self.dataset[hf_subset][split]["sentence2"] - score = self.dataset[hf_subset][split]["score"] - elif compute_overall: - sentence1 = [] - sentence2 = [] - score = [] - for hf_subset in self.metadata.eval_langs: - sentence1.extend(self.dataset[hf_subset][split]["sentence1"]) - sentence2.extend(self.dataset[hf_subset][split]["sentence2"]) - score.extend(self.dataset[hf_subset][split]["score"]) - else: - sentence1 = self.dataset[split]["sentence1"] - sentence2 = self.dataset[split]["sentence2"] - score = self.dataset[split]["score"] - - sentence1_len = [len(s) for s in sentence1] - sentence2_len = [len(s) for s in sentence2] - total_sentence1_len = sum(sentence1_len) - total_sentence2_len = sum(sentence2_len) - avg_score = sum(score) / len(score) - return STSDescriptiveStatistics( - num_samples=len(sentence1), - number_of_characters=total_sentence1_len + total_sentence2_len, - unique_pairs=len(set(zip(sentence1, sentence2))), - min_sentence1_length=min(sentence1_len), - average_sentence1_len=total_sentence1_len / len(sentence1), - max_sentence1_length=max(sentence1_len), - unique_sentence1=len(set(sentence1)), - min_sentence2_length=min(sentence2_len), - average_sentence2_len=total_sentence2_len / len(sentence2), - max_sentence2_length=max(sentence2_len), - unique_sentence2=len(set(sentence2)), - min_score=min(score), - avg_score=avg_score, - max_score=max(score), - ) - - def _push_dataset_to_hub(self, repo_name: str) -> None: - self._upload_dataset_to_hub(repo_name, ["sentence1", "sentence2", "score"]) diff --git a/mteb/abstasks/Image/AbsTaskVisualSTS.py b/mteb/abstasks/Image/AbsTaskVisualSTS.py deleted file mode 100644 index 238c0ec02e..0000000000 --- a/mteb/abstasks/Image/AbsTaskVisualSTS.py +++ /dev/null @@ -1,158 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Any - -from ...evaluation.evaluators import VisualSTSEvaluator -from ..AbsTask import AbsTask, DescriptiveStatistics, ScoresDict - -logger = logging.getLogger(__name__) - - -class VisualSTSDescriptiveStatistics(DescriptiveStatistics): - """Descriptive statistics for STS - - Attributes: - num_samples: number of samples in the dataset - - min_image1_width: Minimum width of images1 - average_image1_width: Average width of images1 - max_image1_width: Maximum width of images1 - - min_image1_height: Minimum height of images1 - average_image1_height: Average height of images1 - max_image1_height: Maximum height of images1 - - min_image2_width: Minimum width of images2 - average_image2_width: Average width of images2 - max_image2_width: Maximum width of images2 - - min_image2_height: Minimum height of images2 - average_image2_height: 
Average height of images2 - max_image2_height: Maximum height of images2 - - min_score: Minimum score - avg_score: Average score - max_score: Maximum score - """ - - num_samples: int - - min_image1_width: float - average_image1_width: float - max_image1_width: float - - min_image1_height: float - average_image1_height: float - max_image1_height: float - - min_image2_width: float - average_image2_width: float - max_image2_width: float - - min_image2_height: float - average_image2_height: float - max_image2_height: float - - min_score: float - avg_score: float - max_score: float - - -class AbsTaskVisualSTS(AbsTask): - """Abstract class for visual STS experiments. - - self.load_data() must generate a huggingface dataset with a split matching self.metadata.eval_splits, and assign it to self.dataset. It must contain the following columns: - sentence1: PIL.Image - sentence2: PIL.Image - score: float - """ - - sentences_column_names = ["sentence1", "sentence2"] - min_score: int = 0 - max_score: int = 5 - - def _evaluate_subset( - self, - model, - data_split, - *, - hf_split: str, - hf_subset: str, - encode_kwargs: dict[str, Any], - **kwargs, - ) -> ScoresDict: - def normalize(x): - return (x - self.min_score) / (self.max_score - self.min_score) - - normalized_scores = list(map(normalize, data_split["score"])) - evaluator = VisualSTSEvaluator( - data_split, - self.sentences_column_names, - normalized_scores, - task_metadata=self.metadata, - hf_split=hf_split, - hf_subset=hf_subset, - **kwargs, - ) - scores = evaluator(model, encode_kwargs=encode_kwargs) - - self._add_main_score(scores) - return scores - - def _add_main_score(self, scores: ScoresDict) -> None: - scores["main_score"] = scores[self.metadata.main_score] - - def _calculate_metrics_from_split( - self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ) -> VisualSTSDescriptiveStatistics: - if hf_subset: - images1 = self.dataset[hf_subset][split][self.sentences_column_names[0]] - images2 = self.dataset[hf_subset][split][self.sentences_column_names[1]] - score = self.dataset[hf_subset][split]["score"] - elif compute_overall: - images1, images2 = [], [] - score = [] - for hf_subset in self.metadata.eval_langs: - images1.extend( - self.dataset[hf_subset][split][self.sentences_column_names[0]] - ) - images2.extend( - self.dataset[hf_subset][split][self.sentences_column_names[1]] - ) - score.extend(self.dataset[hf_subset][split]["score"]) - else: - images1 = self.dataset[split][self.sentences_column_names[0]] - images2 = self.dataset[split][self.sentences_column_names[1]] - score = self.dataset[split]["score"] - - img_widths1, img_heights1 = [], [] - for img in images1: - width, height = img.size - img_heights1.append(height) - img_widths1.append(width) - - img_widths2, img_heights2 = [], [] - for img in images1: - width, height = img.size - img_heights2.append(height) - img_widths2.append(width) - - return VisualSTSDescriptiveStatistics( - num_samples=len(score), - min_image1_width=min(img_widths1), - average_image1_width=sum(img_widths1) / len(img_widths1), - max_image1_width=max(img_widths1), - min_image1_height=min(img_heights1), - average_image1_height=sum(img_heights1) / len(img_heights1), - max_image1_height=max(img_widths1), - min_image2_width=min(img_widths2), - average_image2_width=sum(img_widths2) / len(img_widths2), - max_image2_width=max(img_widths2), - min_image2_height=min(img_heights2), - average_image2_height=sum(img_heights2) / len(img_heights2), - max_image2_height=max(img_widths2), - 
min_score=min(score),
-            avg_score=sum(score) / len(score),
-            max_score=max(score),
-        )
diff --git a/mteb/abstasks/Image/__init__.py b/mteb/abstasks/Image/__init__.py
index 05c17f81d2..1a1ca25f17 100644
--- a/mteb/abstasks/Image/__init__.py
+++ b/mteb/abstasks/Image/__init__.py
@@ -5,12 +5,10 @@
 from .AbsTaskImageClustering import AbsTaskImageClustering
 from .AbsTaskImageMultilabelClassification import AbsTaskImageMultilabelClassification
 from .AbsTaskImageTextPairClassification import AbsTaskImageTextPairClassification
-from .AbsTaskVisualSTS import AbsTaskVisualSTS
 from .AbsTaskZeroShotClassification import AbsTaskZeroShotClassification

 __all__ = [
     "AbsTaskZeroShotClassification",
-    "AbsTaskVisualSTS",
     "AbsTaskImageTextPairClassification",
     "AbsTaskImageMultilabelClassification",
     "AbsTaskImageClustering",
diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
index 9bd1cc4c24..01962923cb 100644
--- a/mteb/abstasks/TaskMetadata.py
+++ b/mteb/abstasks/TaskMetadata.py
@@ -239,6 +239,20 @@ class LabelStatistics(TypedDict):
     labels: dict[str, dict[str, int]]


+class ScoreStatistics(TypedDict):
+    """Class for descriptive statistics for STS gold scores.
+
+    Attributes:
+        min_score: Minimum score
+        avg_score: Average score
+        max_score: Maximum score
+    """
+
+    min_score: float
+    avg_score: float
+    max_score: float
+
+
 logger = logging.getLogger(__name__)


diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py
index 3631cab5d3..34da698bba 100644
--- a/mteb/abstasks/__init__.py
+++ b/mteb/abstasks/__init__.py
@@ -2,13 +2,13 @@

 from .AbsTask import AbsTask
 from .AbsTaskAnyClassification import AbsTaskAnyClassification
+from .AbsTaskAnySTS import AbsTaskAnySTS
 from .AbsTaskBitextMining import AbsTaskBitextMining
 from .AbsTaskClustering import AbsTaskClustering
 from .AbsTaskClusteringFast import AbsTaskClusteringFast
 from .AbsTaskMultilabelClassification import AbsTaskMultilabelClassification
 from .AbsTaskPairClassification import AbsTaskPairClassification
 from .AbsTaskRetrieval import AbsTaskRetrieval
-from .AbsTaskSTS import AbsTaskSTS
 from .AbsTaskSummarization import AbsTaskSummarization
 from .Image import (
     AbsTaskAny2AnyMultiChoice,
@@ -16,7 +16,6 @@
     AbsTaskImageClustering,
     AbsTaskImageMultilabelClassification,
     AbsTaskImageTextPairClassification,
-    AbsTaskVisualSTS,
     AbsTaskZeroShotClassification,
 )
 from .TaskMetadata import TaskMetadata
@@ -30,7 +29,7 @@
     "AbsTaskMultilabelClassification",
     "AbsTaskPairClassification",
     "AbsTaskRetrieval",
-    "AbsTaskSTS",
+    "AbsTaskAnySTS",
     "AbsTaskSummarization",
     "TaskMetadata",
     "AbsTaskAny2AnyMultiChoice",
@@ -38,6 +37,5 @@
     "AbsTaskImageClustering",
     "AbsTaskImageMultilabelClassification",
     "AbsTaskImageTextPairClassification",
-    "AbsTaskVisualSTS",
     "AbsTaskZeroShotClassification",
 ]
diff --git a/mteb/create_dataloaders.py b/mteb/create_dataloaders.py
index 4524c266dd..8b7d5ac4d3 100644
--- a/mteb/create_dataloaders.py
+++ b/mteb/create_dataloaders.py
@@ -7,6 +7,7 @@
 from datasets import Dataset
 from torch.utils.data import DataLoader, default_collate

+from mteb.abstasks.TaskMetadata import TaskMetadata
 from mteb.types import BatchedInput, Conversation

 logger = logging.getLogger(__name__)
@@ -265,3 +266,21 @@
         collate_fn=collate_fn,
         shuffle=False,
     )
+
+
+def create_dataloader(
+    dataset: Dataset,
+    task_metadata: TaskMetadata,
+    input_column: str | None = None,
+    
batch_size: int = 32, +) -> DataLoader: + if "image" in task_metadata.modalities: + return create_image_dataloader( + (dataset.select_columns(input_column).rename_column(input_column, "image")), + batch_size=batch_size, + ) + if "text" in task_metadata.modalities and input_column is not None: + return create_dataloader_from_texts( + dataset[input_column], batch_size=batch_size + ) + return DataLoader(dataset, batch_size=batch_size) diff --git a/mteb/evaluation/__init__.py b/mteb/evaluation/__init__.py index 6e18a22336..e8bad9a3f5 100644 --- a/mteb/evaluation/__init__.py +++ b/mteb/evaluation/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations from .evaluators import ( + AnySTSEvaluator, BitextMiningEvaluator, ClassificationEvaluator, ClusteringEvaluator, @@ -9,7 +10,6 @@ Evaluator, PairClassificationEvaluator, RetrievalEvaluator, - STSEvaluator, SummarizationEvaluator, ) from .LangMapping import LANG_MAPPING @@ -17,7 +17,7 @@ __all__ = [ "Evaluator", - "STSEvaluator", + "AnySTSEvaluator", "SummarizationEvaluator", "DeprecatedSummarizationEvaluator", "RetrievalEvaluator", diff --git a/mteb/evaluation/evaluators/STSEvaluator.py b/mteb/evaluation/evaluators/AnySTSEvaluator.py similarity index 83% rename from mteb/evaluation/evaluators/STSEvaluator.py rename to mteb/evaluation/evaluators/AnySTSEvaluator.py index cb0aab43fc..1336304bfa 100644 --- a/mteb/evaluation/evaluators/STSEvaluator.py +++ b/mteb/evaluation/evaluators/AnySTSEvaluator.py @@ -3,6 +3,7 @@ import logging from typing import Any +from datasets import Dataset from scipy.stats import pearsonr, spearmanr from sklearn.metrics.pairwise import ( paired_cosine_distances, @@ -13,27 +14,37 @@ from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.encoder_interface import Encoder -from ...create_dataloaders import create_dataloader_from_texts +from ...create_dataloaders import ( + create_dataloader, +) from ...similarity_functions import compute_pairwise_similarity from .Evaluator import Evaluator logger = logging.getLogger(__name__) -class STSEvaluator(Evaluator): +class AnySTSEvaluator(Evaluator): def __init__( self, - sentences1, - sentences2, - gold_scores, + dataset: Dataset, + sentences_column_names: tuple[str, str], + gold_scores: list[float], task_metadata: TaskMetadata, hf_split: str, hf_subset: str, **kwargs, ): super().__init__(**kwargs) - self.sentences1 = sentences1 - self.sentences2 = sentences2 + self.first_column = create_dataloader( + dataset, + task_metadata, + sentences_column_names[0], + ) + self.second_column = create_dataloader( + dataset, + task_metadata, + sentences_column_names[1], + ) self.gold_scores = gold_scores self.task_metadata = task_metadata self.hf_split = hf_split @@ -46,14 +57,14 @@ def __call__( encode_kwargs: dict[str, Any], ): embeddings1 = model.encode( - create_dataloader_from_texts(self.sentences1), + self.first_column, task_metadata=self.task_metadata, hf_split=self.hf_split, hf_subset=self.hf_subset, **encode_kwargs, ) embeddings2 = model.encode( - create_dataloader_from_texts(self.sentences2), + self.second_column, task_metadata=self.task_metadata, hf_split=self.hf_split, hf_subset=self.hf_subset, diff --git a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py deleted file mode 100644 index e812b713ed..0000000000 --- a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Any - -from scipy.stats import 
pearsonr, spearmanr -from sklearn.metrics.pairwise import ( - paired_cosine_distances, - paired_euclidean_distances, - paired_manhattan_distances, -) - -from mteb.abstasks import TaskMetadata -from mteb.create_dataloaders import create_image_dataloader -from mteb.similarity_functions import compute_pairwise_similarity - -from ..Evaluator import Evaluator - -logger = logging.getLogger(__name__) - - -class VisualSTSEvaluator(Evaluator): - def __init__( - self, - dataset, - sentences_column_names: list[str], - gold_scores: list[float], - task_metadata: TaskMetadata, - hf_split: str, - hf_subset: str, - task_name: str | None = None, - **kwargs, - ): - super().__init__(**kwargs) - self.sentence1_dataset = create_image_dataloader( - ( - dataset.select_columns(sentences_column_names[0]).rename_column( - sentences_column_names[0], "image" - ) - ), - ) - self.sentence2_dataset = create_image_dataloader( - ( - dataset.select_columns(sentences_column_names[1]).rename_column( - sentences_column_names[1], "image" - ) - ), - ) - self.gold_scores = gold_scores - self.task_metadata = task_metadata - self.hf_split = hf_split - self.hf_subset = hf_subset - - def __call__( - self, - model, # TODO: model type - *, - encode_kwargs: dict[str, Any], - ): - embeddings1 = model.encode( - self.sentence1_dataset, - task_metadata=self.task_metadata, - hf_subset=self.hf_subset, - hf_split=self.hf_split, - batch_size=encode_kwargs["batch_size"], - ) - embeddings2 = model.encode( - self.sentence2_dataset, - task_metadata=self.task_metadata, - hf_subset=self.hf_subset, - hf_split=self.hf_split, - batch_size=encode_kwargs["batch_size"], - ) - - logger.info("Evaluating...") - cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2) - manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2) - euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2) - - cosine_pearson, _ = pearsonr(self.gold_scores, cosine_scores) - cosine_spearman, _ = spearmanr(self.gold_scores, cosine_scores) - - manhatten_pearson, _ = pearsonr(self.gold_scores, manhattan_distances) - manhatten_spearman, _ = spearmanr(self.gold_scores, manhattan_distances) - - euclidean_pearson, _ = pearsonr(self.gold_scores, euclidean_distances) - euclidean_spearman, _ = spearmanr(self.gold_scores, euclidean_distances) - - similarity_scores = compute_pairwise_similarity(model, embeddings1, embeddings2) - - if similarity_scores is not None: - pearson = pearsonr(self.gold_scores, similarity_scores) - spearman = spearmanr(self.gold_scores, similarity_scores) - else: - # if model does not have a similarity function, we assume the cosine similarity - pearson = cosine_pearson - spearman = cosine_spearman - - return { - # using the models own similarity score - "pearson": pearson, - "spearman": spearman, - # generic similarity scores - "cosine_pearson": cosine_pearson, - "cosine_spearman": cosine_spearman, - "manhattan_pearson": manhatten_pearson, - "manhattan_spearman": manhatten_spearman, - "euclidean_pearson": euclidean_pearson, - "euclidean_spearman": euclidean_spearman, - } diff --git a/mteb/evaluation/evaluators/Image/__init__.py b/mteb/evaluation/evaluators/Image/__init__.py index 5ba68a246b..4355193607 100644 --- a/mteb/evaluation/evaluators/Image/__init__.py +++ b/mteb/evaluation/evaluators/Image/__init__.py @@ -4,7 +4,6 @@ from .Any2AnyRetrievalEvaluator import Any2AnyRetrievalEvaluator from .ClusteringEvaluator import ImageClusteringEvaluator from .ImageTextPairClassificationEvaluator import 
ImageTextPairClassificationEvaluator -from .VisualSTSEvaluator import VisualSTSEvaluator from .ZeroShotClassificationEvaluator import ZeroShotClassificationEvaluator __all__ = [ @@ -12,6 +11,5 @@ "Any2AnyRetrievalEvaluator", "ImageClusteringEvaluator", "ImageTextPairClassificationEvaluator", - "VisualSTSEvaluator", "ZeroShotClassificationEvaluator", ] diff --git a/mteb/evaluation/evaluators/__init__.py b/mteb/evaluation/evaluators/__init__.py index 72b9c2f102..638ab237b0 100644 --- a/mteb/evaluation/evaluators/__init__.py +++ b/mteb/evaluation/evaluators/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .AnySTSEvaluator import AnySTSEvaluator from .BitextMiningEvaluator import BitextMiningEvaluator from .ClassificationEvaluator import ClassificationEvaluator from .ClusteringEvaluator import ClusteringEvaluator @@ -9,13 +10,11 @@ Any2AnyRetrievalEvaluator, ImageClusteringEvaluator, ImageTextPairClassificationEvaluator, - VisualSTSEvaluator, ZeroShotClassificationEvaluator, ) from .model_classes import DenseRetrievalExactSearch from .PairClassificationEvaluator import PairClassificationEvaluator from .RetrievalEvaluator import RetrievalEvaluator -from .STSEvaluator import STSEvaluator from .SummarizationEvaluator import ( DeprecatedSummarizationEvaluator, SummarizationEvaluator, @@ -23,7 +22,7 @@ __all__ = [ "Evaluator", - "STSEvaluator", + "AnySTSEvaluator", "SummarizationEvaluator", "DeprecatedSummarizationEvaluator", "RetrievalEvaluator", @@ -35,7 +34,6 @@ "Any2AnyRetrievalEvaluator", "ImageClusteringEvaluator", "ImageTextPairClassificationEvaluator", - "VisualSTSEvaluator", "ZeroShotClassificationEvaluator", "ClassificationEvaluator", ] diff --git a/mteb/tasks/Image/VisualSTS/eng/STS12VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS12VisualSTS.py index 39c82bf718..040f8d52fe 100644 --- a/mteb/tasks/Image/VisualSTS/eng/STS12VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS12VisualSTS.py @@ -1,10 +1,10 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata -class STS12VisualSTS(AbsTaskVisualSTS): +class STS12VisualSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS12VisualSTS", dataset={ diff --git a/mteb/tasks/Image/VisualSTS/eng/STS13VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS13VisualSTS.py index 75a78f6989..ad9b288417 100644 --- a/mteb/tasks/Image/VisualSTS/eng/STS13VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS13VisualSTS.py @@ -1,10 +1,10 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata -class STS13VisualSTS(AbsTaskVisualSTS): +class STS13VisualSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS13VisualSTS", dataset={ diff --git a/mteb/tasks/Image/VisualSTS/eng/STS14VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS14VisualSTS.py index 818d580b15..3b5d1c2d81 100644 --- a/mteb/tasks/Image/VisualSTS/eng/STS14VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS14VisualSTS.py @@ -1,10 +1,10 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata -class STS14VisualSTS(AbsTaskVisualSTS): +class STS14VisualSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS14VisualSTS", dataset={ 
diff --git a/mteb/tasks/Image/VisualSTS/eng/STS15VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS15VisualSTS.py index 2a513cae6f..ed6426e465 100644 --- a/mteb/tasks/Image/VisualSTS/eng/STS15VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS15VisualSTS.py @@ -1,10 +1,10 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata -class STS15VisualSTS(AbsTaskVisualSTS): +class STS15VisualSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS15VisualSTS", dataset={ diff --git a/mteb/tasks/Image/VisualSTS/eng/STS16VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS16VisualSTS.py index 74ca217097..0d5dce5651 100644 --- a/mteb/tasks/Image/VisualSTS/eng/STS16VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS16VisualSTS.py @@ -1,10 +1,10 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata -class STS16VisualSTS(AbsTaskVisualSTS): +class STS16VisualSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS16VisualSTS", dataset={ diff --git a/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py b/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py index 9238ff51ab..94fad752f8 100644 --- a/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py @@ -1,6 +1,6 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { @@ -20,7 +20,7 @@ _SPLITS = ["test"] -class STS17MultilingualVisualSTS(AbsTaskVisualSTS): +class STS17MultilingualVisualSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS17MultilingualVisualSTS", dataset={ diff --git a/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py b/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py index 53d318e3d8..f42825d555 100644 --- a/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py @@ -1,6 +1,6 @@ from __future__ import annotations -from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { @@ -19,7 +19,7 @@ _SPLITS = ["dev", "test"] -class STSBenchmarkMultilingualVisualSTS(AbsTaskVisualSTS): +class STSBenchmarkMultilingualVisualSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="STSBenchmarkMultilingualVisualSTS", dataset={ diff --git a/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py b/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py index 3e25369752..9e9fd6655e 100644 --- a/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py +++ b/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class GermanSTSBenchmarkSTS(AbsTaskSTS): +class GermanSTSBenchmarkSTS(AbsTaskAnySTS): min_score = 0 max_score = 5 diff --git a/mteb/tasks/STS/eng/BiossesSTS.py b/mteb/tasks/STS/eng/BiossesSTS.py index 1c126c3d2c..f650d7da6c 100644 --- 
a/mteb/tasks/STS/eng/BiossesSTS.py +++ b/mteb/tasks/STS/eng/BiossesSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class BiossesSTS(AbsTaskSTS): +class BiossesSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="BIOSSES", dataset={ diff --git a/mteb/tasks/STS/eng/STS12STS.py b/mteb/tasks/STS/eng/STS12STS.py index 25e5fb83a2..78a62ca0c8 100644 --- a/mteb/tasks/STS/eng/STS12STS.py +++ b/mteb/tasks/STS/eng/STS12STS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class STS12STS(AbsTaskSTS): +class STS12STS(AbsTaskAnySTS): min_score = 0 max_score = 5 diff --git a/mteb/tasks/STS/eng/STS13STS.py b/mteb/tasks/STS/eng/STS13STS.py index 3ad4703e03..97d6f44eb1 100644 --- a/mteb/tasks/STS/eng/STS13STS.py +++ b/mteb/tasks/STS/eng/STS13STS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class STS13STS(AbsTaskSTS): +class STS13STS(AbsTaskAnySTS): min_score = 0 max_score = 5 diff --git a/mteb/tasks/STS/eng/STS14STS.py b/mteb/tasks/STS/eng/STS14STS.py index c9cee177ac..f80e3e19e3 100644 --- a/mteb/tasks/STS/eng/STS14STS.py +++ b/mteb/tasks/STS/eng/STS14STS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class STS14STS(AbsTaskSTS): +class STS14STS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS14", dataset={ diff --git a/mteb/tasks/STS/eng/STS15STS.py b/mteb/tasks/STS/eng/STS15STS.py index 06492a3d73..f7c17903f8 100644 --- a/mteb/tasks/STS/eng/STS15STS.py +++ b/mteb/tasks/STS/eng/STS15STS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class STS15STS(AbsTaskSTS): +class STS15STS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS15", dataset={ diff --git a/mteb/tasks/STS/eng/STS16STS.py b/mteb/tasks/STS/eng/STS16STS.py index d75f7ecf51..0dd6a269af 100644 --- a/mteb/tasks/STS/eng/STS16STS.py +++ b/mteb/tasks/STS/eng/STS16STS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class STS16STS(AbsTaskSTS): +class STS16STS(AbsTaskAnySTS): metadata = TaskMetadata( name="STS16", dataset={ diff --git a/mteb/tasks/STS/eng/STSBenchmarkSTS.py b/mteb/tasks/STS/eng/STSBenchmarkSTS.py index ea51135e63..b9d5b07c34 100644 --- a/mteb/tasks/STS/eng/STSBenchmarkSTS.py +++ b/mteb/tasks/STS/eng/STSBenchmarkSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class STSBenchmarkSTS(AbsTaskSTS): +class STSBenchmarkSTS(AbsTaskAnySTS): min_score = 0 max_score = 5 diff --git a/mteb/tasks/STS/eng/SickrSTS.py b/mteb/tasks/STS/eng/SickrSTS.py index 657af1c707..0c8ad94a81 100644 --- a/mteb/tasks/STS/eng/SickrSTS.py +++ b/mteb/tasks/STS/eng/SickrSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class SickrSTS(AbsTaskSTS): +class 
SickrSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="SICK-R", dataset={ diff --git a/mteb/tasks/STS/fao/FaroeseSTS.py b/mteb/tasks/STS/fao/FaroeseSTS.py index 11da623957..8a614db853 100644 --- a/mteb/tasks/STS/fao/FaroeseSTS.py +++ b/mteb/tasks/STS/fao/FaroeseSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class FaroeseSTS(AbsTaskSTS): +class FaroeseSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="FaroeseSTS", dataset={ diff --git a/mteb/tasks/STS/fas/FaMTEBSTS.py b/mteb/tasks/STS/fas/FaMTEBSTS.py index 01838b68c8..8ce8265e01 100644 --- a/mteb/tasks/STS/fas/FaMTEBSTS.py +++ b/mteb/tasks/STS/fas/FaMTEBSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class Farsick(AbsTaskSTS): +class Farsick(AbsTaskAnySTS): metadata = TaskMetadata( name="Farsick", description="A Persian Semantic Textual Similarity And Natural Language Inference Dataset", @@ -32,7 +32,7 @@ class Farsick(AbsTaskSTS): min_score = 1 -class SynPerSTS(AbsTaskSTS): +class SynPerSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="SynPerSTS", description="Synthetic Persian Semantic Textual Similarity Dataset", @@ -59,7 +59,7 @@ class SynPerSTS(AbsTaskSTS): min_score = 1 -class Query2Query(AbsTaskSTS): +class Query2Query(AbsTaskAnySTS): metadata = TaskMetadata( name="Query2Query", description="Query to Query Datasets.", diff --git a/mteb/tasks/STS/fin/FinParaSTS.py b/mteb/tasks/STS/fin/FinParaSTS.py index e029e5f536..afe9bd8054 100644 --- a/mteb/tasks/STS/fin/FinParaSTS.py +++ b/mteb/tasks/STS/fin/FinParaSTS.py @@ -2,12 +2,12 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS N_SAMPLES = 1000 -class FinParaSTS(AbsTaskSTS): +class FinParaSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="FinParaSTS", dataset={ diff --git a/mteb/tasks/STS/fra/SickFrSTS.py b/mteb/tasks/STS/fra/SickFrSTS.py index 932cb58436..36de253b10 100644 --- a/mteb/tasks/STS/fra/SickFrSTS.py +++ b/mteb/tasks/STS/fra/SickFrSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class SickFrSTS(AbsTaskSTS): +class SickFrSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="SICKFr", dataset={ diff --git a/mteb/tasks/STS/jpn/JSICK.py b/mteb/tasks/STS/jpn/JSICK.py index 093162e65f..1413b8ab34 100644 --- a/mteb/tasks/STS/jpn/JSICK.py +++ b/mteb/tasks/STS/jpn/JSICK.py @@ -1,10 +1,10 @@ from __future__ import annotations -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata -class JSICK(AbsTaskSTS): +class JSICK(AbsTaskAnySTS): metadata = TaskMetadata( name="JSICK", dataset={ diff --git a/mteb/tasks/STS/jpn/JSTS.py b/mteb/tasks/STS/jpn/JSTS.py index b9281848bd..901a1c25ce 100644 --- a/mteb/tasks/STS/jpn/JSTS.py +++ b/mteb/tasks/STS/jpn/JSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class JSTS(AbsTaskSTS): +class JSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="JSTS", dataset={ diff --git a/mteb/tasks/STS/kor/KlueSTS.py b/mteb/tasks/STS/kor/KlueSTS.py index 
c0bf29271d..b9c9998d7b 100644 --- a/mteb/tasks/STS/kor/KlueSTS.py +++ b/mteb/tasks/STS/kor/KlueSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class KlueSTS(AbsTaskSTS): +class KlueSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="KLUE-STS", dataset={ diff --git a/mteb/tasks/STS/kor/KorSTS.py b/mteb/tasks/STS/kor/KorSTS.py index 5b7ee1f244..665af1c06b 100644 --- a/mteb/tasks/STS/kor/KorSTS.py +++ b/mteb/tasks/STS/kor/KorSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class KorSTS(AbsTaskSTS): +class KorSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="KorSTS", dataset={ diff --git a/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py b/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py index abafc58c21..343be998bc 100644 --- a/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py @@ -1,6 +1,6 @@ from __future__ import annotations -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { @@ -19,7 +19,7 @@ } -class IndicCrosslingualSTS(AbsTaskSTS): +class IndicCrosslingualSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="IndicCrosslingualSTS", dataset={ diff --git a/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py b/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py index 4cc292b99d..77be831d7f 100644 --- a/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py @@ -1,6 +1,6 @@ from __future__ import annotations -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { @@ -18,7 +18,7 @@ } -class STS17Crosslingual(AbsTaskSTS): +class STS17Crosslingual(AbsTaskAnySTS): fast_loading = True metadata = TaskMetadata( name="STS17", diff --git a/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py b/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py index 6d2869ca16..2e308d0b6f 100644 --- a/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py @@ -1,6 +1,6 @@ from __future__ import annotations -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { @@ -25,7 +25,7 @@ } -class STS22CrosslingualSTSv2(AbsTaskSTS): +class STS22CrosslingualSTSv2(AbsTaskAnySTS): fast_loading = True metadata = TaskMetadata( name="STS22.v2", @@ -85,7 +85,7 @@ class STS22CrosslingualSTSv2(AbsTaskSTS): max_score = 4 -class STS22CrosslingualSTS(AbsTaskSTS): +class STS22CrosslingualSTS(AbsTaskAnySTS): superseded_by = "STS22.v2" fast_loading = True metadata = TaskMetadata( diff --git a/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py b/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py index 738404844c..d737b1d33b 100644 --- a/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py +++ b/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py @@ -1,6 +1,6 @@ from __future__ import annotations -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import 
TaskMetadata _LANGUAGES = { @@ -19,7 +19,7 @@ _SPLITS = ["dev", "test"] -class STSBenchmarkMultilingualSTS(AbsTaskSTS): +class STSBenchmarkMultilingualSTS(AbsTaskAnySTS): fast_loading = True metadata = TaskMetadata( name="STSBenchmarkMultilingualSTS", diff --git a/mteb/tasks/STS/multilingual/SemRel24STS.py b/mteb/tasks/STS/multilingual/SemRel24STS.py index 7df960fa87..b54da4e361 100644 --- a/mteb/tasks/STS/multilingual/SemRel24STS.py +++ b/mteb/tasks/STS/multilingual/SemRel24STS.py @@ -1,6 +1,6 @@ from __future__ import annotations -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { @@ -21,7 +21,7 @@ _SPLITS = ["test"] -class SemRel24STS(AbsTaskSTS): +class SemRel24STS(AbsTaskAnySTS): metadata = TaskMetadata( name="SemRel24STS", dataset={ diff --git a/mteb/tasks/STS/pol/PolishSTS.py b/mteb/tasks/STS/pol/PolishSTS.py index ffda54a822..ade92fb755 100644 --- a/mteb/tasks/STS/pol/PolishSTS.py +++ b/mteb/tasks/STS/pol/PolishSTS.py @@ -1,10 +1,10 @@ from __future__ import annotations -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata -class SickrPLSTS(AbsTaskSTS): +class SickrPLSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="SICK-R-PL", dataset={ @@ -63,7 +63,7 @@ class SickrPLSTS(AbsTaskSTS): max_score = 5 -class CdscrSTS(AbsTaskSTS): +class CdscrSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="CDSC-R", dataset={ diff --git a/mteb/tasks/STS/por/Assin2STS.py b/mteb/tasks/STS/por/Assin2STS.py index 6159d97084..0caf2a6875 100644 --- a/mteb/tasks/STS/por/Assin2STS.py +++ b/mteb/tasks/STS/por/Assin2STS.py @@ -1,10 +1,10 @@ from __future__ import annotations -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.TaskMetadata import TaskMetadata -class Assin2STS(AbsTaskSTS): +class Assin2STS(AbsTaskAnySTS): metadata = TaskMetadata( name="Assin2STS", dataset={ diff --git a/mteb/tasks/STS/por/SickBrSTS.py b/mteb/tasks/STS/por/SickBrSTS.py index d9f7fa64ed..4f61013ea9 100644 --- a/mteb/tasks/STS/por/SickBrSTS.py +++ b/mteb/tasks/STS/por/SickBrSTS.py @@ -2,12 +2,12 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS N_SAMPLES = 1000 -class SickBrSTS(AbsTaskSTS): +class SickBrSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="SICK-BR-STS", dataset={ diff --git a/mteb/tasks/STS/ron/RonSTS.py b/mteb/tasks/STS/ron/RonSTS.py index 303ad07756..98e8c22da1 100644 --- a/mteb/tasks/STS/ron/RonSTS.py +++ b/mteb/tasks/STS/ron/RonSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class RonSTS(AbsTaskSTS): +class RonSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="RonSTS", dataset={ diff --git a/mteb/tasks/STS/rus/RUParaPhraserSTS.py b/mteb/tasks/STS/rus/RUParaPhraserSTS.py index 5e1bf976b0..093eedd228 100644 --- a/mteb/tasks/STS/rus/RUParaPhraserSTS.py +++ b/mteb/tasks/STS/rus/RUParaPhraserSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class RUParaPhraserSTS(AbsTaskSTS): +class RUParaPhraserSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="RUParaPhraserSTS", 
dataset={ diff --git a/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py b/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py index b00705a7a9..ecb86796f6 100644 --- a/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py +++ b/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class RuSTSBenchmarkSTS(AbsTaskSTS): +class RuSTSBenchmarkSTS(AbsTaskAnySTS): metadata = TaskMetadata( name="RuSTSBenchmarkSTS", dataset={ diff --git a/mteb/tasks/STS/spa/STSES.py b/mteb/tasks/STS/spa/STSES.py index 7d57092819..c8af92bf85 100644 --- a/mteb/tasks/STS/spa/STSES.py +++ b/mteb/tasks/STS/spa/STSES.py @@ -2,12 +2,12 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS _EVAL_SPLIT = "test" -class STSES(AbsTaskSTS): +class STSES(AbsTaskAnySTS): metadata = TaskMetadata( name="STSES", dataset={ diff --git a/mteb/tasks/STS/zho/CMTEBSTS.py b/mteb/tasks/STS/zho/CMTEBSTS.py index 8075fba64c..4e5d04af75 100644 --- a/mteb/tasks/STS/zho/CMTEBSTS.py +++ b/mteb/tasks/STS/zho/CMTEBSTS.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskSTS import AbsTaskSTS +from ....abstasks.AbsTaskAnySTS import AbsTaskAnySTS -class ATEC(AbsTaskSTS): +class ATEC(AbsTaskAnySTS): metadata = TaskMetadata( name="ATEC", dataset={ @@ -55,7 +55,7 @@ class ATEC(AbsTaskSTS): max_score = 1 -class BQ(AbsTaskSTS): +class BQ(AbsTaskAnySTS): metadata = TaskMetadata( name="BQ", dataset={ @@ -94,7 +94,7 @@ class BQ(AbsTaskSTS): max_score = 1 -class LCQMC(AbsTaskSTS): +class LCQMC(AbsTaskAnySTS): metadata = TaskMetadata( name="LCQMC", dataset={ @@ -133,7 +133,7 @@ class LCQMC(AbsTaskSTS): max_score = 1 -class PAWSX(AbsTaskSTS): +class PAWSX(AbsTaskAnySTS): metadata = TaskMetadata( name="PAWSX", dataset={ @@ -172,7 +172,7 @@ class PAWSX(AbsTaskSTS): max_score = 1 -class STSB(AbsTaskSTS): +class STSB(AbsTaskAnySTS): metadata = TaskMetadata( name="STSB", dataset={ @@ -211,7 +211,7 @@ class STSB(AbsTaskSTS): max_score = 5 -class AFQMC(AbsTaskSTS): +class AFQMC(AbsTaskAnySTS): metadata = TaskMetadata( name="AFQMC", dataset={ @@ -261,7 +261,7 @@ class AFQMC(AbsTaskSTS): max_score = 1 -class QBQTC(AbsTaskSTS): +class QBQTC(AbsTaskAnySTS): metadata = TaskMetadata( name="QBQTC", dataset={ diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 311c2285d0..ce77969920 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -7,6 +7,7 @@ from PIL import Image from mteb.abstasks.AbsTaskAnyClassification import AbsTaskAnyClassification +from mteb.abstasks.AbsTaskAnySTS import AbsTaskAnySTS from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining from mteb.abstasks.AbsTaskClustering import AbsTaskClustering from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast @@ -15,7 +16,6 @@ ) from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval -from mteb.abstasks.AbsTaskSTS import AbsTaskSTS from mteb.abstasks.AbsTaskSummarization import AbsTaskSummarization from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval @@ -26,7 +26,6 @@ from mteb.abstasks.Image.AbsTaskImageTextPairClassification import ( 
AbsTaskImageTextPairClassification, ) -from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS from mteb.abstasks.Image.AbsTaskZeroShotClassification import ( AbsTaskZeroShotClassification, ) @@ -903,23 +902,27 @@ def load_data(self, **kwargs): self.data_loaded = True -class MockSTSTask(AbsTaskSTS): +class MockSTSTask(AbsTaskAnySTS): expected_stats = { "test": { "num_samples": 2, "number_of_characters": 113, "unique_pairs": 2, - "min_sentence1_length": 23, - "average_sentence1_len": 26.0, - "max_sentence1_length": 29, - "unique_sentence1": 2, - "min_sentence2_length": 24, - "average_sentence2_len": 30.5, - "max_sentence2_length": 37, - "unique_sentence2": 2, - "min_score": 0, - "avg_score": 0.5, - "max_score": 1, + "text1_statistics": { + "min_text_length": 23, + "average_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + }, + "text2_statistics": { + "min_text_length": 24, + "max_text_length": 37, + "average_text_length": 30.5, + "unique_texts": 2, + }, + "image1_statistics": None, + "image2_statistics": None, + "label_statistics": {"min_score": 0, "avg_score": 0.5, "max_score": 1}, } } @@ -955,55 +958,75 @@ def load_data(self, **kwargs): max_score = 1 -class MockMultilingualSTSTask(AbsTaskSTS): +class MockMultilingualSTSTask(AbsTaskAnySTS): expected_stats = { "test": { "num_samples": 4, "number_of_characters": 226, "unique_pairs": 2, - "min_sentence1_length": 23, - "average_sentence1_len": 26.0, - "max_sentence1_length": 29, - "unique_sentence1": 2, - "min_sentence2_length": 24, - "average_sentence2_len": 30.5, - "max_sentence2_length": 37, - "unique_sentence2": 2, - "min_score": 0, - "avg_score": 0.5, - "max_score": 1, + "text1_statistics": { + "min_text_length": 23, + "average_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + }, + "text2_statistics": { + "min_text_length": 24, + "max_text_length": 37, + "average_text_length": 30.5, + "unique_texts": 2, + }, + "image1_statistics": None, + "image2_statistics": None, + "label_statistics": {"min_score": 0, "avg_score": 0.5, "max_score": 1}, "hf_subset_descriptive_stats": { "eng": { "num_samples": 2, "number_of_characters": 113, "unique_pairs": 2, - "min_sentence1_length": 23, - "average_sentence1_len": 26.0, - "max_sentence1_length": 29, - "unique_sentence1": 2, - "min_sentence2_length": 24, - "average_sentence2_len": 30.5, - "max_sentence2_length": 37, - "unique_sentence2": 2, - "min_score": 0, - "avg_score": 0.5, - "max_score": 1, + "text1_statistics": { + "min_text_length": 23, + "average_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + }, + "text2_statistics": { + "min_text_length": 24, + "max_text_length": 37, + "average_text_length": 30.5, + "unique_texts": 2, + }, + "image1_statistics": None, + "image2_statistics": None, + "label_statistics": { + "min_score": 0, + "avg_score": 0.5, + "max_score": 1, + }, }, "fra": { "num_samples": 2, "number_of_characters": 113, "unique_pairs": 2, - "min_sentence1_length": 23, - "average_sentence1_len": 26.0, - "max_sentence1_length": 29, - "unique_sentence1": 2, - "min_sentence2_length": 24, - "average_sentence2_len": 30.5, - "max_sentence2_length": 37, - "unique_sentence2": 2, - "min_score": 0, - "avg_score": 0.5, - "max_score": 1, + "text1_statistics": { + "min_text_length": 23, + "average_text_length": 26.0, + "max_text_length": 29, + "unique_texts": 2, + }, + "text2_statistics": { + "min_text_length": 24, + "max_text_length": 37, + "average_text_length": 30.5, + "unique_texts": 2, + }, + "image1_statistics": None, + 
"image2_statistics": None, + "label_statistics": { + "min_score": 0, + "avg_score": 0.5, + "max_score": 1, + }, }, }, } @@ -3079,25 +3102,31 @@ def load_data(self, **kwargs): self.data_loaded = True -class MockVisualSTSTask(AbsTaskVisualSTS): +class MockVisualSTSTask(AbsTaskAnySTS): expected_stats = { "test": { "num_samples": 2, - "min_image1_width": 100, - "average_image1_width": 100.0, - "max_image1_width": 100, - "min_image1_height": 100, - "average_image1_height": 100.0, - "max_image1_height": 100, - "min_image2_width": 100, - "average_image2_width": 100.0, - "max_image2_width": 100, - "min_image2_height": 100, - "average_image2_height": 100.0, - "max_image2_height": 100, - "min_score": 0.5, - "avg_score": 0.5, - "max_score": 0.5, + "number_of_characters": None, + "unique_pairs": None, + "text1_statistics": None, + "text2_statistics": None, + "image1_statistics": { + "min_image_width": 100, + "average_image_width": 100.0, + "max_image_width": 100, + "min_image_height": 100, + "average_image_height": 100.0, + "max_image_height": 100, + }, + "image2_statistics": { + "min_image_width": 100, + "average_image_width": 100.0, + "max_image_width": 100, + "min_image_height": 100, + "average_image_height": 100.0, + "max_image_height": 100, + }, + "label_statistics": {"min_score": 0.5, "avg_score": 0.5, "max_score": 0.5}, } } @@ -3107,7 +3136,7 @@ class MockVisualSTSTask(AbsTaskVisualSTS): main_score="cosine_spearman", **general_args, # type: ignore ) - metadata.modalities = ["image", "text"] + metadata.modalities = ["image"] metadata.category = "i2i" def load_data(self, **kwargs): diff --git a/tests/test_tasks/test_metadata.py b/tests/test_tasks/test_metadata.py index 69bfb5b04d..cbf751c832 100644 --- a/tests/test_tasks/test_metadata.py +++ b/tests/test_tasks/test_metadata.py @@ -16,6 +16,8 @@ def test_descriptive_stats(task): # remove descriptive task file task.metadata.descriptive_stat_path.unlink() task_stat = task.expected_stats + print(task.metadata.name) + print(result_stat) for key, value in result_stat.items(): assert key in task_stat