diff --git a/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py b/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py
new file mode 100644
index 0000000000..a8d0dde0ea
--- /dev/null
+++ b/mteb/abstasks/Image/AbsTaskAny2AnyMultiChoice.py
@@ -0,0 +1,450 @@
+from __future__ import annotations
+
+import json
+import logging
+import os
+from collections import defaultdict
+from pathlib import Path
+from time import time
+from typing import Any
+
+import tqdm
+from datasets import Features, Value, load_dataset
+from PIL import Image
+
+from ...evaluation.evaluators import Any2AnyMultiChoiceEvaluator
+from ...load_results.mteb_results import ScoresDict
+from ..AbsTask import AbsTask
+
+logger = logging.getLogger(__name__)
+
+
+class HFDataLoader:
+    def __init__(
+        self,
+        hf_repo: str | None = None,
+        hf_repo_qrels: str | None = None,
+        data_folder: str | None = None,
+        prefix: str | None = None,
+        corpus_file: str = "corpus.jsonl",
+        query_file: str = "queries.jsonl",
+        qrels_folder: str = "qrels",
+        qrels_file: str = "",
+        streaming: bool = False,
+        keep_in_memory: bool = False,
+    ):
+        self.corpus = {}
+        self.queries = {}
+        self.qrels = {}
+        self.hf_repo = hf_repo
+        if hf_repo:
+            # By default, fetch the qrels from the same repo rather than from a second "-qrels" repo as in the original BEIR loader
+            self.hf_repo_qrels = hf_repo_qrels if hf_repo_qrels else hf_repo
+        else:
+            # The data folder is expected to contain these files:
+            # (1) fiqa/corpus.jsonl (format: jsonlines)
+            # (2) fiqa/queries.jsonl (format: jsonlines)
+            # (3) fiqa/qrels/test.tsv (format: tsv ("\t"))
+            if prefix:
+                query_file = prefix + "-" + query_file
+                qrels_folder = prefix + "-" + qrels_folder
+
+            self.corpus_file = (
+                os.path.join(data_folder, corpus_file) if data_folder else corpus_file
+            )
+            self.query_file = (
+                os.path.join(data_folder, query_file) if data_folder else query_file
+            )
+            self.qrels_folder = (
+                os.path.join(data_folder, qrels_folder) if data_folder else None
+            )
+            self.qrels_file = qrels_file
+        self.streaming = streaming
+        self.keep_in_memory = keep_in_memory
+
+    @staticmethod
+    def check(fIn: str, ext: str):
+        if not os.path.exists(fIn):
+            raise ValueError(f"File {fIn} not present! Please provide a correct file path.")
+
+        if not fIn.endswith(ext):
+            raise ValueError(f"File {fIn} must have the extension {ext}")
+
+    def load(
+        self, split="test"
+    ) -> tuple[
+        dict[str, dict[str, str | Image.Image]],
+        dict[str, dict[str, str | Image.Image]],
+        dict[str, dict[str, int]],
+    ]:
+        if not self.hf_repo:
+            self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv")
+            self.check(fIn=self.corpus_file, ext="jsonl")
+            self.check(fIn=self.query_file, ext="jsonl")
+            self.check(fIn=self.qrels_file, ext="tsv")
+
+        if not len(self.corpus):
+            logger.info("Loading Corpus...")
+            self._load_corpus()
+            logger.info("Loaded %d %s Documents.", len(self.corpus), split.upper())
+            logger.info("Doc Example: %s", self.corpus[0])
+
+        if not len(self.queries):
+            logger.info("Loading Queries...")
+            self._load_queries(split)
+
+        self._load_qrels(split)
+        # filter out queries that have no qrels
+        qrels_dict = defaultdict(dict)
+
+        def qrels_dict_init(row):
+            qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"])
+
+        self.qrels.map(qrels_dict_init)
+        self.qrels = qrels_dict
+        self.queries = self.queries.filter(lambda x: x["id"] in self.qrels)
+        logger.info("Loaded %d %s Queries.", len(self.queries), split.upper())
+        logger.info("Query Example: %s", self.queries[0])
+
+        return self.corpus, self.queries, self.qrels
+
+    def load_corpus(self) -> dict[str, dict[str, str]]:
+        if not self.hf_repo:
+            self.check(fIn=self.corpus_file, ext="jsonl")
+
+        if not len(self.corpus):
+            logger.info("Loading Corpus...")
+            self._load_corpus()
+            logger.info("Loaded %d Documents.", len(self.corpus))
+            logger.info("Doc Example: %s", self.corpus[0])
+
+        return self.corpus
+
+    def _load_corpus(self):
+        if self.hf_repo:
+            corpus_ds = load_dataset(
+                self.hf_repo,
+                "corpus",
+                keep_in_memory=self.keep_in_memory,
+                streaming=self.streaming,
+            )["corpus"]
+        else:
+            corpus_ds = load_dataset(
+                "json",
+                data_files=self.corpus_file,
+                streaming=self.streaming,
+                keep_in_memory=self.keep_in_memory,
+            )
+        self.corpus = corpus_ds
+
+    def _load_queries(self, split):
+        if self.hf_repo:
+            queries_ds = load_dataset(
+                self.hf_repo,
+                "query",
+                keep_in_memory=self.keep_in_memory,
+                streaming=self.streaming,
+            )[split]
+        else:
+            queries_ds = load_dataset(
+                "json",
+                data_files=self.query_file,
+                streaming=self.streaming,
+                keep_in_memory=self.keep_in_memory,
+            )
+        self.queries = queries_ds
+
+    def _load_qrels(self, split):
+        if self.hf_repo:
+            qrels_ds = load_dataset(
+                self.hf_repo_qrels,
+                "qrels",
+                keep_in_memory=self.keep_in_memory,
+                streaming=self.streaming,
+            )[split]
+        else:
+            qrels_ds = load_dataset(
+                "csv",
+                data_files=self.qrels_file,
+                delimiter="\t",
+                keep_in_memory=self.keep_in_memory,
+            )
+
+        if "Q0" in qrels_ds.column_names:
+            qrels_ds = qrels_ds.remove_columns("Q0")
+        features = Features(
+            {
+                "query-id": Value("string"),
+                "corpus-id": Value("string"),
+                "score": Value("float"),
+            }
+        )
+        # Some datasets may have extra columns, e.g. `difficulty` in the qrels for FORB.
+        qrels_ds = qrels_ds.select_columns(["query-id", "corpus-id", "score"]).cast(
+            features
+        )
+        self.qrels = qrels_ds
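+
+
+# A minimal usage sketch for HFDataLoader (not executed anywhere in this module).
+# It assumes a HF repo exposing "corpus", "query" and "qrels" configs;
+# "my-org/my-multichoice-dataset" is a hypothetical repo id:
+#
+#     corpus, queries, qrels = HFDataLoader(
+#         hf_repo="my-org/my-multichoice-dataset"
+#     ).load(split="test")
+#     # corpus and queries are `datasets.Dataset`s with an "id" column;
+#     # qrels is {query_id: {corpus_id: score}}, where the correct choice has
+#     # score 1 and the distractor choices have score 0.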
+
+
+class AbsTaskAny2AnyMultiChoice(AbsTask):
+    """Abstract class for Any2Any multiple-choice experiments.
+
+    This is NOT a retrieval task: there is exactly one correct answer among a set
+    of candidates. The candidates are a subset of the corpus and are indicated in
+    the qrels; incorrect choices carry a relevance of 0 and the correct choice a
+    relevance of 1.
+
+    Child classes must implement the following properties:
+
+    self.corpus: dict[str, dict[str, str]]
+        Semantically, it should contain dict[split_name, dict[sample_id, dict[str, str]]]
+        E.g. {"test": {"document_one": {"_id": "d1", "title": "title", "text": "text"}}}
+
+    self.queries: dict[str, dict[str, Union[str, List[str]]]]
+        Semantically, it should contain dict[split_name, dict[sample_id, str]] or
+        dict[split_name, dict[sample_id, List[str]]] for conversations
+        E.g. {"test": {"q1": "query"}}
+        or {"test": {"q1": ["turn1", "turn2", "turn3"]}}
+
+    self.relevant_docs: dict[str, dict[str, dict[str, int]]]
+        Semantically, it should contain dict[split_name, dict[sample_id, dict[doc_id, score]]]
+        E.g. {"test": {"q1": {"document_one": 1}}} for the correct choice (hard positive)
+        E.g. {"test": {"q1": {"document_two": 0}}} for incorrect choices from the same query (hard negatives)
+    """
+
+    ignore_identical_ids: bool = False
+    skip_first_result: bool = False
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+        self.corpus, self.queries, self.relevant_docs = {}, {}, {}
+        dataset_path = self.metadata_dict["dataset"]["path"]
+
+        for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]):
+            corpus, queries, qrels = HFDataLoader(
+                hf_repo=dataset_path,
+                streaming=False,
+                keep_in_memory=False,
+            ).load(split=split)
+            # pass the corpus and queries datasets through directly to avoid loading them into memory
+            # queries = {query["id"]: query for query in queries}
+            # corpus = {doc["id"]: doc for doc in corpus}
+            self.corpus[split], self.queries[split], self.relevant_docs[split] = (
+                corpus,
+                queries,
+                qrels,
+            )
+
+        self.data_loaded = True
+
+    def evaluate(
+        self,
+        model,
+        split: str = "test",
+        *,
+        encode_kwargs: dict[str, Any] = {},
+        **kwargs,
+    ):
+        retriever = Any2AnyMultiChoiceEvaluator(
+            retriever=model,
+            task_name=self.metadata.name,
+            encode_kwargs=encode_kwargs,
+            **kwargs,
+        )
+
+        scores = {}
+        hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"]
+
+        for hf_subset in hf_subsets:
+            logger.info(f"Subset: {hf_subset}")
+
+            if hf_subset == "default":
+                corpus, queries, relevant_docs = (
+                    self.corpus[split],
+                    self.queries[split],
+                    self.relevant_docs[split],
+                )
+            else:
+                corpus, queries, relevant_docs = (
+                    self.corpus[hf_subset][split],
+                    self.queries[hf_subset][split],
+                    self.relevant_docs[hf_subset][split],
+                )
+            scores[hf_subset] = self._evaluate_subset(
+                retriever, corpus, queries, relevant_docs, hf_subset, **kwargs
+            )
+        return scores
+
+    def _evaluate_subset(
+        self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs
+    ):
+        start_time = time()
+        results = retriever(corpus, queries, relevant_docs)
+        end_time = time()
+        logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds")
+
+        save_predictions = kwargs.get("save_predictions", False)
+        export_errors = kwargs.get("export_errors", False)
+        if save_predictions or export_errors:
+            output_folder = Path(kwargs.get("output_folder", "results"))
+            if not os.path.isdir(output_folder):
+                os.makedirs(output_folder)
+
+        if save_predictions:
+            top_k = kwargs.get("top_k", None)
+            if top_k is not None:
+                for qid in list(results.keys()):
+                    doc_ids = set(
+                        sorted(
+                            results[qid], key=lambda x: results[qid][x], reverse=True
+                        )[:top_k]
+                    )
+                    results[qid] = {
+                        k: v for k, v in results[qid].items() if k in doc_ids
+                    }
+            qrels_save_path = (
+                output_folder / f"{self.metadata.name}_{hf_subset}_predictions.json"
+            )
+
+            with open(qrels_save_path, "w") as f:
+                json.dump(results, f)
+
+        ndcg, _map, recall, precision, cv_recall, naucs = retriever.evaluate(
+            relevant_docs,
+            results,
+            retriever.k_values,
+            ignore_identical_ids=self.ignore_identical_ids,
+            skip_first_result=self.skip_first_result,
+        )
+        mrr, naucs_mrr = retriever.evaluate_custom(
+            relevant_docs, results, retriever.k_values, "mrr"
+        )
+        scores = {
+            **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
+            **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()},
+            "accuracy": recall["Recall@1"],
+        }
+        self._add_main_score(scores)
+
+        if export_errors:
+            errors = {}
+
+            top_k = kwargs.get("top_k", 1)
+            if not save_predictions and top_k == 1:
+                for qid in results.keys():
+                    doc_scores = results[qid]
+                    sorted_docs = sorted(
+                        doc_scores.items(), key=lambda x: x[1], reverse=True
+                    )[:top_k]
+                    results[qid] = dict(sorted_docs)
+            for qid, retrieved_docs in results.items():
+                expected_docs = relevant_docs[qid]
+                false_positives = [
+                    doc for doc in retrieved_docs if doc not in expected_docs
+                ]
+                false_negatives = [
+                    doc for doc in expected_docs if doc not in retrieved_docs
+                ]
+                if false_positives or false_negatives:
+                    errors[qid] = {
+                        "false_positives": false_positives,
+                        "false_negatives": false_negatives,
+                    }
+
+            errors_save_path = (
+                output_folder / f"{self.metadata.name}_{hf_subset}_errors.json"
+            )
+            with open(errors_save_path, "w") as f:
+                json.dump(errors, f)
+
+        return scores
+
+    def _add_main_score(self, scores: ScoresDict) -> None:
+        scores["main_score"] = scores[self.metadata.main_score]
+
+    def _calculate_metrics_from_split(
+        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
+    ):
+        pass
+
+    def calculate_metadata_metrics(self) -> None:
+        self.load_data()
+
+        all_details = {}
+        pbar_split = tqdm.tqdm(
+            self.metadata_dict["eval_splits"], desc="Processing Splits..."
+        )
+        for split in pbar_split:
+            pbar_split.set_postfix_str(f"Split: {split}")
+            print(f"Processing metadata for split {split}")
+            all_details[split] = {}
+            if self.is_multilingual:
+                pbar_lang = tqdm.tqdm(
+                    self.relevant_docs.keys(), desc="Processing Languages..."
+                )
+                for lang in pbar_lang:
+                    pbar_lang.set_postfix_str(f"Language: {lang}")
+                    print(f"Processing metadata for language {lang}")
+                    split_details = process_language(
+                        self.relevant_docs[lang][split],
+                        self.queries[lang][split],
+                        self.corpus[lang][split],
+                        lang,
+                    )
+                    all_details[split][lang] = split_details
+            else:
+                split_details = process_language(
+                    self.relevant_docs[split], self.queries[split], self.corpus[split]
+                )
+                all_details[split] = split_details
+
+        return all_details
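+
+
+# A hypothetical subclass sketch (task name, repo id and revision are
+# placeholders, not a real task); the concrete ImageCoDeT2IMultiChoice task
+# later in this PR follows the same pattern:
+#
+#     class MyT2IMultiChoice(AbsTaskAny2AnyMultiChoice):
+#         metadata = TaskMetadata(
+#             name="MyT2IMultiChoice",
+#             dataset={"path": "my-org/my-multichoice-dataset", "revision": "main"},
+#             type="Retrieval",
+#             eval_splits=["test"],
+#             main_score="ndcg_at_1",
+#             ...  # remaining TaskMetadata fields
+#         )
+#
+# After load_data(), self.relevant_docs["test"] might look like
+# {"q1": {"img_3": 1, "img_1": 0, "img_2": 0, "img_4": 0}}: four candidate
+# images for query "q1", of which "img_3" is the correct choice.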
+
+
+def process_language(relevant_docs, queries, corpus, lang=None):
+    """We want to get three pieces of information:
+    - the number of documents (and their char length) in the corpus
+    - the number of queries (and their char length)
+    - the average number of relevant documents per query
+    """
+    query_len, doc_len = calculate_length(queries, corpus)
+    num_documents = len(corpus)
+    num_queries = len(queries)
+
+    # number of qrels that are not 0
+    num_qrels_non_zero = sum(
+        sum(1 for doc_id in docs if docs[doc_id] != 0)
+        for docs in relevant_docs.values()
+    )
+    qrels_per_doc = num_qrels_non_zero / num_queries if num_queries else 0
+
+    language_description = f" for language {lang}" if lang else ""
+    print(f"Average document character length{language_description} is {doc_len}")
+    print(f"Average query character length{language_description} is {query_len}")
+    print(f"Number of documents{language_description} is {num_documents}")
+    print(f"Number of queries{language_description} is {num_queries}")
+    print(
+        f"Average number of relevant documents per query{language_description} is {qrels_per_doc}"
+    )
+    return {
+        "average_document_length": doc_len,
+        "average_query_length": query_len,
+        "num_documents": num_documents,
+        "num_queries": num_queries,
+        "average_relevant_docs_per_query": qrels_per_doc,
+    }
+
+
+def calculate_length(queries, corpus):
+    queries_lens = []
+    doc_lens = []
+    for query in queries.values():
+        queries_lens.append(len(query))
+
+    for doc in corpus.values():
+        if isinstance(doc, Image.Image):
+            doc_lens.append(1.0)  # images count as length 1; this heuristic can likely be removed
+
+    doc_len = sum(doc_lens) / len(doc_lens) if doc_lens else 0
+    query_len = sum(queries_lens) / len(queries_lens) if queries_lens else 0
+    return query_len, doc_len
diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py
index f70cbd5324..c874bd2214 100644
--- a/mteb/abstasks/__init__.py
+++ b/mteb/abstasks/__init__.py
@@ -13,6 +13,7 @@
 from .AbsTaskSpeedTask import *
 from .AbsTaskSTS import *
 from .AbsTaskSummarization import *
+from .Image.AbsTaskAny2AnyMultiChoice import *
 from .Image.AbsTaskAny2AnyRetrieval import *
 from .Image.AbsTaskAny2TextMultipleChoice import *
 from .Image.AbsTaskImageClassification import *
diff --git a/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py b/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py
new file mode 100644
index 0000000000..20e8547536
--- /dev/null
+++ b/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py
@@ -0,0 +1,486 @@
+from __future__ import annotations
+
+import heapq
+import io
+import json
+import logging
+import os
+from collections import defaultdict
+from typing import Any
+
+import numpy as np
+import pytrec_eval
+import torch
+from datasets import Dataset
+from PIL import Image
+from torch.utils.data import DataLoader
+from torchvision import transforms
+
+from mteb.encoder_interface import EncoderWithQueryCorpusEncode
+
+from ..Evaluator import Evaluator
+from ..utils import (
+    confidence_scores,
+    cos_sim,
+    dot_score,
+    download,
+    hole,
+    mrr,
+    nAUC,
+    recall_cap,
+    top_k_accuracy,
+)
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+logger = logging.getLogger(__name__)
+
+transform = transforms.Compose([transforms.PILToTensor()])
+
+
+class ImageDataset(torch.utils.data.Dataset):
+    def __init__(self, hf_dataset, image_column_name: str = "image", transform=None):
+        self.dataset = hf_dataset
+        self.transform = transform
+        self.image_column_name = image_column_name
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, idx):
+        image = self.dataset[idx][self.image_column_name]
+        if isinstance(image, bytes):
+            image = Image.open(io.BytesIO(image))
+        # otherwise the image is assumed to already be in a usable format (e.g. a PIL Image)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        if self.transform is not None:
+            image = self.transform(image)
+        return image
+
+
+def custom_collate_fn(batch):
+    # return the batch as a plain list instead of stacking, since image tensors may differ in size
+    return batch
+
+
+# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/search/dense/exact_search.py#L12
+class Any2AnyMultiChoiceSearch:
+    def __init__(
+        self,
+        model: EncoderWithQueryCorpusEncode,
+        encode_kwargs: dict[str, Any] = {},
+        corpus_chunk_size: int = 20000,
+        previous_results: str | None = None,
+        **kwargs: Any,
+    ):
+        # The model is a class that provides get_text_embeddings() and get_image_embeddings()
+        self.model = model
+        self.encode_kwargs = encode_kwargs
+
+        if "batch_size" not in encode_kwargs:
+            encode_kwargs["batch_size"] = 128
+
+        self.score_functions = {"cos_sim": cos_sim, "dot": dot_score}
+        self.score_function_desc = {
+            "cos_sim": "Cosine Similarity",
+            "dot": "Dot Product",
+        }
+        self.corpus_chunk_size = corpus_chunk_size
+        self.previous_results = previous_results
+        self.batch_size = encode_kwargs.get("batch_size")
+        self.show_progress_bar = encode_kwargs.get("show_progress_bar")
+        self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False)
+        self.corpus_embeddings = defaultdict(list)
+        self.results = {}
+
+        if self.previous_results is not None:
+            self.previous_results = self.load_results_file()
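+
+    # What makes this "multiple choice" rather than retrieval: in search() below,
+    # scores of corpus items that have no qrels entry for a query are overwritten
+    # with -1, so only that query's candidate choices compete. A toy illustration
+    # (hypothetical ids and scores, not real data):
+    #
+    #     qrels["q1"] = {"img_1": 0, "img_2": 1}           # candidate set of q1
+    #     raw scores for q1:    {"img_1": 0.52, "img_2": 0.61, "img_9": 0.93}
+    #     masked scores for q1: {"img_1": 0.52, "img_2": 0.61, "img_9": -1}
+    #
+    # so "img_9", despite having the highest similarity, can never be returned.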
+
+    def search(
+        self,
+        corpus: Dataset,  # kept as a Dataset to avoid loading everything into memory
+        queries: Dataset,  # kept as a Dataset to avoid loading everything into memory
+        qrels: dict[str, dict[str, int]],
+        top_k: int,
+        score_function: str,
+        return_sorted: bool = False,
+        **kwargs,
+    ) -> dict[str, dict[str, float]]:
+        if score_function not in self.score_functions:
+            raise ValueError(
+                f"score function: {score_function} must be either 'cos_sim' for cosine similarity or 'dot' for dot product"
+            )
+
+        logger.info("Encoding Queries.")
+        query_ids = list(queries["id"])
+        self.results = {qid: {} for qid in query_ids}
+
+        q_modality = queries[0]["modality"]
+
+        if q_modality == "text":
+            query_texts = queries["text"]
+            query_embeddings = self.model.get_text_embeddings(
+                texts=query_texts, batch_size=self.encode_kwargs["batch_size"]
+            )
+        else:
+            queries_dataset = ImageDataset(
+                queries, image_column_name="image", transform=transform
+            )
+            query_image_dataloader = DataLoader(
+                queries_dataset,
+                batch_size=self.encode_kwargs["batch_size"],
+                shuffle=False,
+                collate_fn=custom_collate_fn,
+                num_workers=max(1, os.cpu_count() // 2),
+            )
+            if q_modality == "image":
+                query_embeddings = self.model.get_image_embeddings(
+                    images=query_image_dataloader,
+                    batch_size=self.encode_kwargs["batch_size"],
+                )
+            elif q_modality == "image,text":
+                query_texts = queries["text"]
+                query_embeddings = self.model.get_fused_embeddings(
+                    texts=query_texts,
+                    images=query_image_dataloader,
+                    batch_size=self.encode_kwargs["batch_size"],
+                )
+            else:
+                raise ValueError(f"Unsupported modality: {q_modality}")
+
+        logger.info("Preparing Corpus...")
+        corpus_ids = list(corpus["id"])
+
+        corpus_modality = corpus[0]["modality"]
+
+        logger.info("Encoding Corpus in batches... Warning: This might take a while!")
+        logger.info(
+            f"Scoring Function: {self.score_function_desc[score_function]} ({score_function})"
+        )
+
+        result_heaps = {qid: [] for qid in query_ids}
+        for chunk_start in range(0, len(corpus), self.corpus_chunk_size):
+            chunk = corpus.select(
+                range(
+                    chunk_start, min(chunk_start + self.corpus_chunk_size, len(corpus))
+                )
+            )
+            chunk_ids = corpus_ids[chunk_start : chunk_start + self.corpus_chunk_size]
+
+            if corpus_modality == "text":
+                corpus_texts = chunk["text"]
+                sub_corpus_embeddings = self.model.get_text_embeddings(
+                    texts=corpus_texts, batch_size=self.encode_kwargs["batch_size"]
+                )
+            else:
+                corpus_dataset = ImageDataset(
+                    chunk, image_column_name="image", transform=transform
+                )
+                corpus_image_dataloader = DataLoader(
+                    corpus_dataset,
+                    batch_size=self.encode_kwargs["batch_size"],
+                    shuffle=False,
+                    collate_fn=custom_collate_fn,
+                    num_workers=max(1, os.cpu_count() // 2),
+                )
+                if corpus_modality == "image":
+                    sub_corpus_embeddings = self.model.get_image_embeddings(
+                        images=corpus_image_dataloader,
+                        batch_size=self.encode_kwargs["batch_size"],
+                    )
+                elif corpus_modality == "image,text":
+                    corpus_texts = chunk["text"]
+                    sub_corpus_embeddings = self.model.get_fused_embeddings(
+                        texts=corpus_texts,
+                        images=corpus_image_dataloader,
+                        batch_size=self.encode_kwargs["batch_size"],
+                    )
+                else:
+                    raise ValueError(f"Unsupported modality: {corpus_modality}")
+
+            cos_scores = self.score_functions[score_function](
+                query_embeddings, sub_corpus_embeddings
+            )
+            cos_scores[torch.isnan(cos_scores)] = -1
+
+            for query_idx in range(len(query_embeddings)):
+                query_id = query_ids[query_idx]
+                # mask out corpus items that are not among this query's candidate
+                # choices, i.e. items without a qrels entry for this (query, corpus) pair
+                for c_idx, c_id in enumerate(chunk_ids):
+                    if c_id not in qrels[query_id]:
+                        cos_scores[query_idx, c_idx] = -1
+
+            cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
+                cos_scores,
+                min(top_k, cos_scores.size(1)),
+                dim=1,
+                largest=True,
+                sorted=return_sorted,
+            )
+            cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
+            cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
+
+            for query_itr in range(len(query_embeddings)):
+                query_id = query_ids[query_itr]
+                for sub_corpus_id, score in zip(
+                    cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]
+                ):
+                    corpus_id = chunk_ids[sub_corpus_id]
+                    if len(result_heaps[query_id]) < top_k:
+                        heapq.heappush(result_heaps[query_id], (score, corpus_id))
+                    else:
+                        heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
+
+        for qid in result_heaps:
+            for score, corpus_id in result_heaps[qid]:
+                self.results[qid][corpus_id] = score
+
+        return self.results
+
+    def load_results_file(self):
+        # load the first-stage results from a file in the format {qid: {doc_id: score}}
+        if "https://" in self.previous_results:
+            # download the file
+            if not os.path.exists(self.previous_results):
+                url_descriptor = self.previous_results.split("https://")[-1].replace(
+                    "/", "--"
+                )
+                dest_file = os.path.join(
+                    "results", f"cached_predictions--{url_descriptor}"
+                )
+                os.makedirs(os.path.dirname(os.path.abspath(dest_file)), exist_ok=True)
+                download(self.previous_results, dest_file)
+                logger.info(
+                    f"Downloaded the previous results at {self.previous_results} to {dest_file}"
+                )
+            self.previous_results = dest_file
+
+        with open(self.previous_results) as f:
+            previous_results = json.load(f)
+        assert isinstance(previous_results, dict)
+        assert isinstance(previous_results[list(previous_results.keys())[0]], dict)
+        return previous_results
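+
+
+# How the pieces fit together: AbsTaskAny2AnyMultiChoice.evaluate() builds an
+# Any2AnyMultiChoiceEvaluator, which wraps the model in Any2AnyMultiChoiceSearch;
+# calling the evaluator runs search(), and metrics are computed afterwards.
+# A hypothetical direct invocation (assuming `model` implements
+# EncoderWithQueryCorpusEncode and corpus/queries/qrels come from HFDataLoader):
+#
+#     evaluator = Any2AnyMultiChoiceEvaluator(retriever=model, task_name="MyTask")
+#     results = evaluator(corpus, queries, qrels)  # {qid: {doc_id: score}}
+#     ndcg, _map, recall, precision, cv_recall, naucs = evaluator.evaluate(
+#         qrels, results, evaluator.k_values
+#     )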
+
+
+class Any2AnyMultiChoiceEvaluator(Evaluator):
+    def __init__(
+        self,
+        retriever=None,
+        task_name: str | None = None,
+        k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000],
+        score_function: str = "cos_sim",
+        encode_kwargs: dict[str, Any] = {},
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.retriever = Any2AnyMultiChoiceSearch(
+            retriever, encode_kwargs=encode_kwargs, **kwargs
+        )
+        self.k_values = k_values
+        self.top_k = (
+            max(k_values) if "top_k" not in kwargs else kwargs["top_k"]
+        )  # can be lowered when reranking
+        self.score_function = score_function
+        self.task_name = task_name
+
+    def __call__(
+        self,
+        corpus: dict[str, dict[str, str | Image.Image]],
+        queries: dict[str, dict[str, str | Image.Image]],
+        qrels: dict[str, dict[str, int]],
+    ) -> dict[str, dict[str, float]]:
+        if not self.retriever:
+            raise ValueError("Model/Technique has not been provided!")
+
+        return self.retriever.search(
+            corpus,
+            queries,
+            qrels,
+            self.top_k,
+            self.score_function,
+            prompt_name=self.task_name,  # type: ignore
+        )
+
+    @staticmethod
+    def evaluate(
+        qrels: dict[str, dict[str, int]],
+        results: dict[str, dict[str, float]],
+        k_values: list[int],
+        ignore_identical_ids: bool = False,
+        skip_first_result: bool = False,
+    ) -> tuple[
+        dict[str, float],
+        dict[str, float],
+        dict[str, float],
+        dict[str, float],
+        dict[str, float],
+    ]:
+        if ignore_identical_ids:
+            logger.debug(
+                "For evaluation, ``ignore_identical_ids`` is set to True; the evaluator will ignore identical query and document ids."
+            )
+            # Remove identical ids from the results dict
+            for qid, rels in results.items():
+                for pid in list(rels):
+                    if qid == pid:
+                        results[qid].pop(pid)
+        else:
+            logger.debug(
+                "For evaluation, we DO NOT ignore identical query and document ids (default); please explicitly set ``ignore_identical_ids=True`` to ignore them."
+            )
+
+        all_ndcgs, all_aps, all_recalls, all_precisions, all_cv_recalls = (
+            {},
+            {},
+            {},
+            {},
+            {},
+        )
+
+        for k in k_values:
+            all_ndcgs[f"NDCG@{k}"] = []
+            all_aps[f"MAP@{k}"] = []
+            all_recalls[f"Recall@{k}"] = []
+            all_precisions[f"P@{k}"] = []
+            all_cv_recalls[f"CV_Recall@{k}"] = []  # (new) CV-style Recall
+
+        map_string = "map_cut." + ",".join([str(k) for k in k_values])
+        ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values])
+        recall_string = "recall." + ",".join([str(k) for k in k_values])
+        precision_string = "P." + ",".join([str(k) for k in k_values])
+        evaluator = pytrec_eval.RelevanceEvaluator(
+            qrels, {map_string, ndcg_string, recall_string, precision_string}
+        )
+        scores = evaluator.evaluate(results)
+
+        sorted_results = {
+            qid: sorted(rels.items(), key=lambda item: item[1], reverse=True)
+            for qid, rels in results.items()
+        }
+
+        if skip_first_result:
+            for qid, rels in sorted_results.items():
+                sorted_results[qid].pop(0)
+
+        for query_id in scores.keys():
+            top_docs = [
+                doc_id for doc_id, _ in sorted_results.get(query_id, [])
+            ]  # sorted list of doc IDs
+            # discount qrels that have a ground-truth score of 0 (incorrect choices)
+            relevant_docs = {
+                key
+                for key in qrels.get(query_id, {}).keys()
+                if qrels[query_id][key] != 0
+            }
+
+            for k in k_values:
+                top_k_docs = top_docs[:k]
+                all_ndcgs[f"NDCG@{k}"].append(scores[query_id]["ndcg_cut_" + str(k)])
+                all_aps[f"MAP@{k}"].append(scores[query_id]["map_cut_" + str(k)])
+                all_recalls[f"Recall@{k}"].append(scores[query_id]["recall_" + str(k)])
+                all_precisions[f"P@{k}"].append(scores[query_id]["P_" + str(k)])
+
+                if relevant_docs.intersection(top_k_docs):
+                    all_cv_recalls[f"CV_Recall@{k}"].append(1.0)
+                else:
+                    all_cv_recalls[f"CV_Recall@{k}"].append(0.0)
+
+        ndcg, _map, recall, precision, cv_recall = (
+            all_ndcgs.copy(),
+            all_aps.copy(),
+            all_recalls.copy(),
+            all_precisions.copy(),
+            all_cv_recalls.copy(),
+        )
+
+        for k in k_values:
+            ndcg[f"NDCG@{k}"] = round(sum(ndcg[f"NDCG@{k}"]) / len(scores), 5)
+            _map[f"MAP@{k}"] = round(sum(_map[f"MAP@{k}"]) / len(scores), 5)
+            recall[f"Recall@{k}"] = round(sum(recall[f"Recall@{k}"]) / len(scores), 5)
+            precision[f"P@{k}"] = round(sum(precision[f"P@{k}"]) / len(scores), 5)
+            cv_recall[f"CV_Recall@{k}"] = round(
+                sum(cv_recall[f"CV_Recall@{k}"]) / len(scores), 5
+            )
+
+        naucs = Any2AnyMultiChoiceEvaluator.evaluate_abstention(
+            results,
+            {**all_ndcgs, **all_aps, **all_recalls, **all_precisions, **all_cv_recalls},
+        )
+
+        return ndcg, _map, recall, precision, cv_recall, naucs
+
+    @staticmethod
+    def evaluate_custom(
+        qrels: dict[str, dict[str, int]],
+        results: dict[str, dict[str, float]],
+        k_values: list[int],
+        metric: str,
+        output_type: str = "all",
+    ) -> tuple[dict[str, float]]:
+        if metric.lower() in ["mrr", "mrr@k", "mrr_cut"]:
+            metric_scores = mrr(qrels, results, k_values, output_type)
+
+        elif metric.lower() in ["recall_cap", "r_cap", "r_cap@k"]:
+            metric_scores = recall_cap(qrels, results, k_values, output_type)
+
+        elif metric.lower() in ["hole", "hole@k"]:
+            metric_scores = hole(qrels, results, k_values, output_type)
+
+        elif metric.lower() in [
+            "acc",
+            "top_k_acc",
+            "accuracy",
+            "accuracy@k",
+            "top_k_accuracy",
+        ]:
+            metric_scores = top_k_accuracy(qrels, results, k_values, output_type)
+
+        else:
+            raise ValueError(f"Unsupported custom metric: {metric}")
+
+        naucs = Any2AnyMultiChoiceEvaluator.evaluate_abstention(results, metric_scores)
+        metric_scores_avg = {k: sum(v) / len(v) for k, v in metric_scores.items()}
+
+        return metric_scores_avg, naucs
+
+    @staticmethod
+    def evaluate_abstention(
+        results: dict[str, dict[str, float]],
+        metric_scores: dict[str, list[float]],
+    ) -> dict[str, float]:
+        """Computes the normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997"""
+        all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
+        all_conf_scores = [
+            confidence_scores(sim_scores) for sim_scores in all_sim_scores
+        ]
+        conf_fcts = list(all_conf_scores[0].keys())
+        all_conf_scores = {
+            fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
+        }
+        metric_scores = {k: np.array(v) for k, v in metric_scores.items()}
+        naucs = {}
+
+        for metric_name, scores in metric_scores.items():
+            for fct, conf_scores in all_conf_scores.items():
+                naucs[f"nAUC_{metric_name}_{fct}"] = nAUC(conf_scores, scores)
+
+        return naucs
+
+    @staticmethod
+    def calculate_cv_style_recall(
+        qrels: dict[str, dict[str, int]], results: dict[str, dict[str, float]], k: int
+    ) -> dict[str, float]:
+        """Calculate CV-style recall: recall is 1 if any relevant document is
+        retrieved in the top k, otherwise 0.
+        """
+        cv_recalls = {}
+        for query_id, relevant_docs in qrels.items():
+            # take the top-k retrieved documents for this query
+            retrieved_docs = list(results.get(query_id, {}).keys())[:k]
+            if any(doc_id in relevant_docs for doc_id in retrieved_docs):
+                cv_recalls[query_id] = 1.0  # at least one relevant doc in the top k
+            else:
+                cv_recalls[query_id] = 0.0
+        return cv_recalls
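+
+
+# A toy worked example for calculate_cv_style_recall (hypothetical ids, not real
+# data). The helper takes the first k keys of each result dict, so it assumes
+# they are already ordered by decreasing score:
+#
+#     qrels   = {"q1": {"img_2": 1}}
+#     results = {"q1": {"img_3": 0.9, "img_2": 0.7, "img_1": 0.4}}
+#     calculate_cv_style_recall(qrels, results, k=1)  # -> {"q1": 0.0}
+#     calculate_cv_style_recall(qrels, results, k=2)  # -> {"q1": 1.0}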
"d28adfd8b34fefa546fdf94bdc352622b2575f6c", + }, + type="Retrieval", + category="t2i", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_1", + date=("2022-05-22", "2022-05-27"), # conference dates + form=["written"], + domains=["Web"], + task_subtypes=["Image Text Retrieval"], + license="CC BY-SA 4.0", + socioeconomic_status="medium", + annotations_creators="derived", + dialect=[], + modalities=["text", "image"], + sample_creation="found", + bibtex_citation="""@article{krojer2022image, + title={Image retrieval from contextual descriptions}, + author={Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva}, + journal={arXiv preprint arXiv:2203.15867}, + year={2022} +} +""", + descriptive_stats={ + "n_samples": {"test": 2302}, + "avg_character_length": { + "test": { + "average_document_length": 0.0, + "average_query_length": 0.0, + "num_documents": 23020, + "num_queries": 2302, + "average_relevant_docs_per_query": 1.0, + } + }, + }, + ) diff --git a/mteb/tasks/Image/__init__.py b/mteb/tasks/Image/__init__.py index 845cc136f3..8f1c2d27f7 100644 --- a/mteb/tasks/Image/__init__.py +++ b/mteb/tasks/Image/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .Any2AnyMultiChoice import * from .Any2AnyRetrieval import * from .Any2TextMultipleChoice import * from .Clustering import * diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 0d7d1d5fc0..8d49517136 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -3,6 +3,7 @@ from .BitextMining import * from .Classification import * from .Clustering import * +from .Image.Any2AnyMultiChoice import * from .Image.Any2AnyRetrieval import * from .Image.Clustering import * from .Image.ImageClassification import * diff --git a/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageCoDeT2IMultiChoice.json b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageCoDeT2IMultiChoice.json new file mode 100644 index 0000000000..1f3e5fbd9e --- /dev/null +++ b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageCoDeT2IMultiChoice.json @@ -0,0 +1,33 @@ +{ + "dataset_revision": "d28adfd8b34fefa546fdf94bdc352622b2575f6c", + "evaluation_time": 459.3987202644348, + "kg_co2_emissions": null, + "mteb_version": "1.12.90", + "scores": { + "test": [ + { + "accuracy": 0.13206, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.13206, + "mrr_at_1": 0.13205907906168549, + "mrr_at_10": 0.32183470550108295, + "mrr_at_100": 0.32183470550108295, + "mrr_at_1000": 0.32183470550108295, + "mrr_at_20": 0.32183470550108295, + "mrr_at_3": 0.2158268172603526, + "mrr_at_5": 0.2607225600926729, + "ndcg_at_1": 0.13206, + "ndcg_at_10": 0.47717, + "ndcg_at_100": 0.47717, + "ndcg_at_1000": 0.47717, + "ndcg_at_20": 0.47717, + "ndcg_at_3": 0.24566, + "ndcg_at_5": 0.32738 + } + ] + }, + "task_name": "ImageCoDeT2IMultiChoice" +} \ No newline at end of file diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index c9f1f59ac6..d4c8e44a88 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -13,6 +13,7 @@ from mteb.abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask +from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import 
AbsTaskAny2AnyMultiChoice from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.MultiSubsetLoader import MultiSubsetLoader from mteb.overview import TASKS_REGISTRY @@ -39,6 +40,7 @@ def test_load_data( or isinstance(task, AbsTaskInstructionRetrieval) or isinstance(task, MultiSubsetLoader) or isinstance(task, AbsTaskSpeedTask) + or isinstance(task, AbsTaskAny2AnyMultiChoice) ): pytest.skip() with patch.object(task, "dataset_transform") as mock_dataset_transform: