[mieb] Any2TextMultipleChoice Abstask&Evaluator & four tasks in CV-bench #1287

Merged · 9 commits · Oct 11, 2024
65 changes: 65 additions & 0 deletions mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py
@@ -0,0 +1,65 @@
from __future__ import annotations

import logging
from typing import Any

from datasets import Dataset

from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode
from ...evaluation.evaluators import Any2TextMultipleChoiceEvaluator
from ...load_results.mteb_results import ScoresDict
from ..AbsTask import AbsTask

logger = logging.getLogger(__name__)


class AbsTaskAny2TextMultipleChoice(AbsTask):
"""Abstract class for Any to Text Multiple Choice tasks,
where the queries and be either text or image, or both.
This task assess interleaved encoding of queries,
the similarity computed between the queries and the candidate choices is ranked.

self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset.
"""

    query_modalities: list[str] | str = ["image", "text"]
    query_column_names: dict = {"image": "image", "text": "question"}
    label_column_name: str = "answer"
    choices_column_name: str = "choices"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _add_main_score(self, scores) -> None:
        scores["main_score"] = scores[self.metadata.main_score]

    def _calculate_metrics_from_split(
        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
    ):
        pass

    def _evaluate_subset(
        self,
        model: Encoder | EncoderWithQueryCorpusEncode,
        dataset: Dataset,
        *,
        encode_kwargs: dict[str, Any] = {},
        **kwargs,
    ) -> ScoresDict:
        for modality in self.query_modalities:
            if modality not in self.query_column_names:
                raise KeyError(
                    f"query column name of modality {modality} is not defined"
                )
        evaluator = Any2TextMultipleChoiceEvaluator(
            dataset,
            query_modalities=self.query_modalities,
            query_column_names=self.query_column_names,
            label_column_name=self.label_column_name,
            choices_column_name=self.choices_column_name,
            task_name=self.metadata.name,
            **kwargs,
        )
        scores = evaluator(model, encode_kwargs=encode_kwargs)
        self._add_main_score(scores)
        return scores
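
For reference, below is a minimal, hypothetical sketch (not part of this PR) of the dataset shape that self.load_data() is expected to produce, using the class's default column names ("image", "question", "choices", "answer") and assuming, as the evaluator below does, that answers are stored as integer indices into the choices list; all example values are placeholders.

from datasets import Dataset, Features, Image, Sequence, Value
from PIL import Image as PILImage

# Features matching the abstract task's default column names.
features = Features(
    {
        "image": Image(),                      # query image
        "question": Value("string"),           # query text
        "choices": Sequence(Value("string")),  # candidate answers
        "answer": Value("int64"),              # index of the correct choice
    }
)

# A concrete task would build something like this per eval split in load_data()
# and assign {"test": split} to self.dataset, with "test" in metadata.eval_splits.
split = Dataset.from_dict(
    {
        "image": [PILImage.new("RGB", (32, 32))],
        "question": ["How many objects are in the image?"],
        "choices": [["(A) 0", "(B) 1", "(C) 2", "(D) 3"]],
        "answer": [1],
    },
    features=features,
)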
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
@@ -100,6 +100,7 @@
"Speed",
"ZeroShotClassification",
"ImageTextPairClassification",
"Any2TextMutipleChoice",
]

TASK_CATEGORY = Literal[
1 change: 1 addition & 0 deletions mteb/abstasks/__init__.py
@@ -14,6 +14,7 @@
from .AbsTaskSTS import *
from .AbsTaskSummarization import *
from .Image.AbsTaskAny2AnyRetrieval import *
from .Image.AbsTaskAny2TextMultipleChoice import *
from .Image.AbsTaskImageClassification import *
from .Image.AbsTaskImageClustering import *
from .Image.AbsTaskImageMultilabelClassification import *
99 changes: 99 additions & 0 deletions mteb/evaluation/evaluators/Image/Any2TextMultipleChoiceEvaluator.py
@@ -0,0 +1,99 @@
from __future__ import annotations

import logging
from typing import Any

import numpy as np
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from torchvision import transforms
from tqdm import tqdm

from mteb.encoder_interface import Encoder, EncoderWithSimilarity
from mteb.evaluation.evaluators.Evaluator import Evaluator

logger = logging.getLogger(__name__)

transform = transforms.Compose([transforms.PILToTensor()])


class Any2TextMultipleChoiceEvaluator(Evaluator):
"""Evaluate a model based on the similarity of queries (can be interleaved) and candidate answers.
The goal is to find the correct text in multiple candidates that
forms the correct answer of the interleaved query.

Args:
query_modalities: the modality of queries; supports image and text or either at the moment,
query_column_names: column names of queries; should align with query modalities.
label_column_name: column name of labels;
choices_column_names: column name of candidate choices;
"""

    def __init__(
        self,
        dataset,
        query_modalities: str | list[str],
        query_column_names: dict,
        label_column_name: str,
        choices_column_name: str,
        task_name: str | None = None,
        transform=None,
        limit: int | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if limit:
            dataset = dataset.select(range(limit))
        self.dataset = dataset
        self.query_modalities = query_modalities
        self.query_column_names = query_column_names
        self.label_column_name = label_column_name
        self.choices_column_name = choices_column_name
        self.task_name = task_name
        self.transform = transform

    def __call__(
        self,
        model: Encoder | EncoderWithSimilarity,
        encode_kwargs: dict[str, Any] = {},
    ):
        if "batch_size" not in encode_kwargs:
            encode_kwargs["batch_size"] = 64

        label_list = list(
            set([x for n in self.dataset[self.choices_column_name] for x in n])
        )
        label_embeddings = model.get_text_embeddings(label_list)
        label_embedding_dict = {}
        for label, embedding in zip(label_list, label_embeddings):
            label_embedding_dict[label] = embedding

if "text" in self.query_modalities:
questions = self.dataset[self.query_column_names["text"]]
else:
questions = None
if "image" in self.query_modalities:
images = self.dataset[self.query_column_names["image"]]
query_embeddings = model.get_fused_embeddings(
texts=questions,
images=images,
batch_size=encode_kwargs["batch_size"],
)

answers = self.dataset[self.label_column_name]
choices = self.dataset[self.choices_column_name]

        # note that the answers are indices into the choices
        predictions = []
        for q_embedding, choice in tqdm(zip(query_embeddings, choices)):
            choice_embeddings = torch.vstack(
                [label_embedding_dict[c] for c in choice]
            )  # (choice_size, embedding_dim)
            q_embedding = q_embedding[np.newaxis, :]
            cos_sim = cosine_similarity(q_embedding, choice_embeddings)
            predictions.append(np.argmax(cos_sim))

        metrics = {}
        metrics["accuracy"] = accuracy_score(answers, predictions)
        return metrics
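
To make the scoring step concrete, here is a minimal standalone sketch of the logic above, with random vectors standing in for the embeddings that model.get_text_embeddings / model.get_fused_embeddings would return; the choices, answers, and embedding dimension are made-up illustration values.

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
dim = 8

choices = [["cat", "dog"], ["left", "right", "center"]]  # per-query candidate answers
answers = [1, 0]                                         # gold labels, as indices into choices

# One embedding per unique choice string (the evaluator caches these in a dict) ...
label_embedding_dict = {c: rng.normal(size=dim) for row in choices for c in row}
# ... and one (fused) embedding per query.
query_embeddings = rng.normal(size=(len(choices), dim))

predictions = []
for q_embedding, choice in zip(query_embeddings, choices):
    choice_embeddings = np.vstack([label_embedding_dict[c] for c in choice])
    cos_sim = cosine_similarity(q_embedding[np.newaxis, :], choice_embeddings)
    predictions.append(int(np.argmax(cos_sim)))  # highest-similarity choice wins

print({"accuracy": accuracy_score(answers, predictions)})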
1 change: 1 addition & 0 deletions mteb/evaluation/evaluators/__init__.py
@@ -4,6 +4,7 @@
from .ClassificationEvaluator import *
from .ClusteringEvaluator import *
from .Image.Any2AnyRetrievalEvaluator import *
from .Image.Any2TextMultipleChoiceEvaluator import *
from .Image.ClassificationEvaluator import *
from .Image.ClusteringEvaluator import *
from .Image.ImageTextPairClassificationEvaluator import *
3 changes: 3 additions & 0 deletions mteb/tasks/Image/Any2TextMultipleChoice/__init__.py
@@ -0,0 +1,3 @@
from __future__ import annotations

from .eng.CVBench import *
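
Once merged, running one of the new CV-Bench tasks should follow the usual mteb flow. The sketch below is a hedged illustration: the task name "CVBenchCount" and the model name are assumptions and should be checked against mteb's task registry and available model wrappers.

import mteb

# Assumed task name for one of the four CV-Bench tasks added here; verify against the registry.
tasks = mteb.get_tasks(tasks=["CVBenchCount"])
# Any model whose wrapper exposes get_text_embeddings / get_fused_embeddings.
model = mteb.get_model("openai/clip-vit-base-patch32")

evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, encode_kwargs={"batch_size": 64})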