class MultiChoiceEvaluationMixin:
    """Mixin that lets retrieval tasks use the multiple-choice evaluator.

    Designed for tasks like r-Oxford and r-Paris that require masking out
    different documents in the corpus for each query.

    Example usage::

        class ROxfordHardI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval):
            ...

    It overrides ``evaluate``, ``_evaluate_subset`` and
    ``_calculate_metrics_from_split`` of ``AbsTaskAny2AnyRetrieval``.
    """

    def evaluate(
        self,
        model,
        split: str = "test",
        *,
        # FIX: annotation previously claimed dict[str, Any] while defaulting to None.
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ):
        """Evaluate ``model`` on every hf_subset of ``split``.

        Uses Any2AnyMultiChoiceEvaluator instead of Any2AnyRetrievalEvaluator.

        Args:
            model: The model to evaluate (passed to the evaluator as retriever).
            split: Dataset split to evaluate on.
            encode_kwargs: Optional kwargs forwarded to the evaluator's encoding.
            **kwargs: Forwarded to the evaluator and to ``_evaluate_subset``.

        Returns:
            Mapping of hf_subset name to its computed score dict.
        """
        evaluator = Any2AnyMultiChoiceEvaluator(
            retriever=model,
            task_name=self.metadata.name,
            encode_kwargs=encode_kwargs if encode_kwargs is not None else {},
            **kwargs,
        )

        scores = {}
        hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"]

        for hf_subset in hf_subsets:
            logger.info(f"Subset: {hf_subset}")

            if hf_subset == "default":
                corpus, queries, relevant_docs = (
                    self.corpus[split],
                    self.queries[split],
                    self.relevant_docs[split],
                )
            else:
                corpus, queries, relevant_docs = (
                    self.corpus[hf_subset][split],
                    self.queries[hf_subset][split],
                    self.relevant_docs[hf_subset][split],
                )
            scores[hf_subset] = self._evaluate_subset(
                evaluator, corpus, queries, relevant_docs, hf_subset, **kwargs
            )
        return scores

    def _evaluate_subset(
        self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs
    ):
        """Run retrieval for one subset and compute ranking metrics.

        Optionally writes predictions (``save_predictions``) and per-query
        error reports (``export_errors``) as JSON files under
        ``output_folder`` (default ``results``), both read from ``kwargs``.

        Returns:
            Dict of metric name -> value (ndcg/map/recall/cv_recall/precision/
            mrr at each k, nAUC variants, and ``accuracy`` = Recall@1).
        """
        start_time = time()
        results = retriever(corpus, queries, relevant_docs)
        end_time = time()
        logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds")

        save_predictions = kwargs.get("save_predictions", False)
        export_errors = kwargs.get("export_errors", False)
        if save_predictions or export_errors:
            output_folder = Path(kwargs.get("output_folder", "results"))
            if not os.path.isdir(output_folder):
                os.makedirs(output_folder)

        if save_predictions:
            top_k = kwargs.get("top_k", None)
            if top_k is not None:
                # Keep only the top_k highest-scoring documents per query.
                for qid in list(results.keys()):
                    doc_ids = set(
                        sorted(
                            results[qid], key=lambda x: results[qid][x], reverse=True
                        )[:top_k]
                    )
                    results[qid] = {
                        k: v for k, v in results[qid].items() if k in doc_ids
                    }
            qrels_save_path = (
                output_folder / f"{self.metadata.name}_{hf_subset}_predictions.json"
            )

            with open(qrels_save_path, "w") as f:
                json.dump(results, f)

        ndcg, _map, recall, precision, cv_recall, naucs = retriever.evaluate(
            relevant_docs,
            results,
            retriever.k_values,
            ignore_identical_ids=self.ignore_identical_ids,
            skip_first_result=self.skip_first_result,
        )
        mrr, naucs_mrr = retriever.evaluate_custom(
            relevant_docs, results, retriever.k_values, "mrr"
        )
        # Metric keys come back as e.g. "NDCG@10"; normalize to "ndcg_at_10".
        scores = {
            **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
            **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
            **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()},
            **{f"cv_recall_at_{k.split('@')[1]}": v for (k, v) in cv_recall.items()},
            **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()},
            **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()},
            **{
                k.replace("@", "_at_").replace("_P", "_precision").lower(): v
                for k, v in naucs.items()
            },
            **{
                k.replace("@", "_at_").replace("_P", "_precision").lower(): v
                for k, v in naucs_mrr.items()
            },
            "accuracy": recall["Recall@1"],
        }
        self._add_main_score(scores)

        if export_errors:
            errors = {}

            top_k = kwargs.get("top_k", 1)
            if not save_predictions and top_k == 1:
                # Results were not truncated above; keep only the single best hit.
                for qid in results.keys():
                    doc_scores = results[qid]
                    sorted_docs = sorted(
                        doc_scores.items(), key=lambda x: x[1], reverse=True
                    )[:top_k]
                    results[qid] = dict(sorted_docs)
            for qid, retrieved_docs in results.items():
                expected_docs = relevant_docs[qid]
                false_positives = [
                    doc for doc in retrieved_docs if doc not in expected_docs
                ]
                false_negatives = [
                    doc for doc in expected_docs if doc not in retrieved_docs
                ]
                if false_positives or false_negatives:
                    errors[qid] = {
                        "false_positives": false_positives,
                        "false_negatives": false_negatives,
                    }

            errors_save_path = (
                output_folder / f"{self.metadata.name}_{hf_subset}_errors.json"
            )
            with open(errors_save_path, "w") as f:
                json.dump(errors, f)

        return scores

    def _calculate_metrics_from_split(
        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
    ) -> Any2AnyMutipleChoiceDescriptiveStatistics:
        """Compute descriptive statistics for one split.

        Args:
            split: Dataset split to describe.
            hf_subset: When given, describe only this subset.
            compute_overall: When True (and no ``hf_subset``), aggregate stats
                across all languages in ``self.metadata.eval_langs``.

        Returns:
            An ``Any2AnyMutipleChoiceDescriptiveStatistics`` with text-length,
            image-size and qrels statistics for queries and documents.
        """
        if hf_subset:
            queries = self.queries[hf_subset][split]
            corpus = self.corpus[hf_subset][split]
            relevant_docs = self.relevant_docs[hf_subset][split]
        elif compute_overall:
            queries = {}
            corpus = {}
            relevant_docs = {}
            for hf_subset in self.metadata.eval_langs:
                queries.update(process_docs(self.queries, hf_subset, split))
                corpus.update(process_docs(self.corpus, hf_subset, split))
                relevant_docs.update(
                    process_relevant_docs(self.relevant_docs, hf_subset, split)
                )
        else:
            queries = self.queries[split]
            corpus = self.corpus[split]
            relevant_docs = self.relevant_docs[split]

        queries_lens, doc_lens = [], []
        num_query_images = 0
        num_document_images = 0

        # Modality of the first example is assumed representative of the
        # whole collection (e.g. "text", "image" or "text,image").
        q_modality = queries[0]["modality"]
        unique_queries = len(set(queries["text"])) if "text" in q_modality else 0

        for query in tqdm.tqdm(queries, desc="queries:"):
            if "text" in q_modality:
                queries_lens.append(len(query["text"]))
            if "image" in q_modality:
                num_query_images += 1

        d_modality = corpus[0]["modality"]
        unique_documents = len(set(corpus["text"])) if "text" in d_modality else 0

        for doc in tqdm.tqdm(corpus, desc="docs:"):
            if "text" in d_modality:
                doc_lens.append(len(doc["text"]))
            if "image" in d_modality:
                num_document_images += 1

        total_doc_len = sum(doc_lens)
        total_query_len = sum(queries_lens)
        num_documents = len(corpus)
        num_queries = len(queries)

        # FIX: widths and heights were swapped when collected — heights were
        # appended to the width lists and vice versa (PIL Image.size is
        # (width, height)). Harmless for square images, wrong otherwise.
        d_img_widths, d_img_heights = [], []
        for img in (doc["image"] for doc in corpus if "image" in d_modality):
            width, height = img.size
            d_img_widths.append(width)
            d_img_heights.append(height)

        q_img_widths, q_img_heights = [], []
        for img in (query["image"] for query in queries if "image" in q_modality):
            width, height = img.size
            q_img_widths.append(width)
            q_img_heights.append(height)

        # Number of non-zero relevant docs per query, restricted to queries
        # that are actually present in this split.
        queries_set = set(queries["id"])
        qrels_lengths = [
            len([v for k, v in relevant_docs[qid].items() if v != 0])
            for qid in tqdm.tqdm(relevant_docs.keys(), desc="qrels:")
            if qid in queries_set
        ]
        num_qrels = sum(qrels_lengths)
        # FIX: the guard previously tested num_queries while dividing by
        # len(relevant_docs), which could still divide by zero.
        qrels_per_doc = num_qrels / len(relevant_docs) if relevant_docs else 0
        unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})

        return Any2AnyMutipleChoiceDescriptiveStatistics(
            number_of_characters=total_query_len + total_doc_len,
            num_samples=num_documents + num_queries,
            num_queries=num_queries,
            num_documents=num_documents,
            min_document_length=min(doc_lens) if doc_lens else 0,
            average_document_length=total_doc_len / len(doc_lens) if doc_lens else 0,
            max_document_length=max(doc_lens) if doc_lens else 0,
            unique_documents=unique_documents,
            min_document_image_width=min(d_img_widths) if d_img_widths else 0,
            average_document_image_width=sum(d_img_widths) / len(d_img_widths)
            if d_img_widths
            else 0,
            max_document_image_width=max(d_img_widths) if d_img_widths else 0,
            min_document_image_height=min(d_img_heights) if d_img_heights else 0,
            average_document_image_height=sum(d_img_heights) / len(d_img_heights)
            if d_img_heights
            else 0,
            max_document_image_height=max(d_img_heights) if d_img_heights else 0,
            num_document_images=num_document_images,
            min_query_length=min(queries_lens) if queries_lens else 0,
            average_query_length=total_query_len / len(queries_lens)
            if queries_lens
            else 0,
            max_query_length=max(queries_lens) if queries_lens else 0,
            unique_queries=unique_queries,
            num_query_images=num_query_images,
            min_query_image_width=min(q_img_widths) if q_img_widths else 0,
            average_query_image_width=sum(q_img_widths) / len(q_img_widths)
            if q_img_widths
            else 0,
            max_query_image_width=max(q_img_widths) if q_img_widths else 0,
            min_query_image_height=min(q_img_heights) if q_img_heights else 0,
            average_query_image_height=sum(q_img_heights) / len(q_img_heights)
            if q_img_heights
            else 0,
            max_query_image_height=max(q_img_heights) if q_img_heights else 0,
            # FIX: min/max previously raised ValueError on an empty sequence;
            # guard like every other statistic above.
            min_relevant_docs_per_query=min(qrels_lengths) if qrels_lengths else 0,
            average_relevant_docs_per_query=qrels_per_doc,
            max_relevant_docs_per_query=max(qrels_lengths) if qrels_lengths else 0,
            unique_relevant_docs=unique_qrels,
        )
"ROxfordMediumI2IMultiChoice", - "ROxfordHardI2IMultiChoice", + "ROxfordEasyI2IRetrieval", + "ROxfordMediumI2IRetrieval", + "ROxfordHardI2IRetrieval", "RP2kI2IRetrieval", - "RParisEasyI2IMultiChoice", - "RParisMediumI2IMultiChoice", - "RParisHardI2IMultiChoice", + "RParisEasyI2IRetrieval", + "RParisMediumI2IRetrieval", + "RParisHardI2IRetrieval", "SciMMIRI2TRetrieval", "SciMMIRT2IRetrieval", "SketchyI2IRetrieval", @@ -1609,14 +1608,13 @@ "Food101ZeroShot", "OxfordPetsZeroShot", "StanfordCarsZeroShot", - # Any2TextMutipleChoice + # Any2AnyMultipleChoice + "BLINKIT2IMultiChoice", + "ImageCoDeT2IMultiChoice", "CVBenchCount", "CVBenchRelation", "CVBenchDepth", "CVBenchDistance", - # Any2AnyMultipleChoice - "BLINKIT2IMultiChoice", - "ImageCoDeT2IMultiChoice", # ImageTextPairClassification "AROCocoOrder", "AROFlickrOrder", diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IMultiChoice.json deleted file mode 100644 index e26273628e..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 5063, - "num_queries": 70, - "num_documents": 4993, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 4993, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - 
"min_relevant_docs_per_query": 0, - "average_relevant_docs_per_query": 43.27142857142857, - "max_relevant_docs_per_query": 248, - "unique_relevant_docs": 4993 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IRetrieval.json index b3469b2b0b..e26273628e 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordEasyI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 584, - "num_queries": 68, - "num_documents": 516, + "num_samples": 5063, + "num_queries": 70, + "num_documents": 4993, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 516, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 4993, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, - "num_query_images": 68, - "min_relevant_docs_per_query": 1, - "average_relevant_docs_per_query": 44.544117647058826, + "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, + "min_relevant_docs_per_query": 0, + "average_relevant_docs_per_query": 43.27142857142857, "max_relevant_docs_per_query": 248, - "unique_relevant_docs": 516 + "unique_relevant_docs": 4993 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IMultiChoice.json deleted file mode 
100644 index a6b9a21ac5..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 5063, - "num_queries": 70, - "num_documents": 4993, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 4993, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 1, - "average_relevant_docs_per_query": 35.67142857142857, - "max_relevant_docs_per_query": 284, - "unique_relevant_docs": 4993 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IRetrieval.json index 515fa3a5da..a6b9a21ac5 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordHardI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 755, + "num_samples": 5063, "num_queries": 70, - "num_documents": 685, + "num_documents": 4993, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 685, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + 
"average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 4993, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 35.67142857142857, "max_relevant_docs_per_query": 284, - "unique_relevant_docs": 685 + "unique_relevant_docs": 4993 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IMultiChoice.json deleted file mode 100644 index 333bbe786d..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 5063, - "num_queries": 70, - "num_documents": 4993, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 4993, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 2, - "average_relevant_docs_per_query": 78.94285714285714, - "max_relevant_docs_per_query": 347, - "unique_relevant_docs": 4993 - } -} \ No newline at 
end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IRetrieval.json index 8ca55933c4..333bbe786d 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/ROxfordMediumI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 858, + "num_samples": 5063, "num_queries": 70, - "num_documents": 788, + "num_documents": 4993, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 788, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 4993, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 2, "average_relevant_docs_per_query": 78.94285714285714, "max_relevant_docs_per_query": 347, - "unique_relevant_docs": 788 + "unique_relevant_docs": 4993 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IMultiChoice.json deleted file mode 100644 index 5cf0e5ee74..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 6392, - "num_queries": 70, - "num_documents": 6322, - "min_document_length": 0, - "average_document_length": 0, - 
"max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 6322, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 2, - "average_relevant_docs_per_query": 98.2, - "max_relevant_docs_per_query": 199, - "unique_relevant_docs": 6322 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IRetrieval.json index b21a7cfdd2..5cf0e5ee74 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisEasyI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 1540, + "num_samples": 6392, "num_queries": 70, - "num_documents": 1470, + "num_documents": 6322, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 1470, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 6322, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 
256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 2, "average_relevant_docs_per_query": 98.2, "max_relevant_docs_per_query": 199, - "unique_relevant_docs": 1470 + "unique_relevant_docs": 6322 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IMultiChoice.json deleted file mode 100644 index 87f882d612..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 6392, - "num_queries": 70, - "num_documents": 6322, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 6322, - "min_query_length": 0, - "average_query_length": 0, - "max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 34, - "average_relevant_docs_per_query": 147.85714285714286, - "max_relevant_docs_per_query": 556, - "unique_relevant_docs": 6322 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IRetrieval.json index a704a31bb2..87f882d612 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisHardI2IRetrieval.json @@ 
-1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 2048, + "num_samples": 6392, "num_queries": 70, - "num_documents": 1978, + "num_documents": 6322, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 1978, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 6322, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 34, "average_relevant_docs_per_query": 147.85714285714286, "max_relevant_docs_per_query": 556, - "unique_relevant_docs": 1978 + "unique_relevant_docs": 6322 } -} +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IMultiChoice.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IMultiChoice.json deleted file mode 100644 index 95f4f9b84a..0000000000 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IMultiChoice.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "test": { - "number_of_characters": 0, - "num_samples": 6392, - "num_queries": 70, - "num_documents": 6322, - "min_document_length": 0, - "average_document_length": 0, - "max_document_length": 0, - "unique_documents": 0, - "min_document_image_width": 256, - "average_document_image_width": 256.0, - "max_document_image_width": 256, - "min_document_image_height": 256, - "average_document_image_height": 256.0, - "max_document_image_height": 256, - "num_document_images": 6322, - "min_query_length": 0, - "average_query_length": 0, - 
"max_query_length": 0, - "unique_queries": 0, - "num_query_images": 70, - "min_query_image_width": 256, - "average_query_image_width": 256.0, - "max_query_image_width": 256, - "min_query_image_height": 256, - "average_query_image_height": 256.0, - "max_query_image_height": 256, - "min_relevant_docs_per_query": 76, - "average_relevant_docs_per_query": 246.05714285714285, - "max_relevant_docs_per_query": 636, - "unique_relevant_docs": 6322 - } -} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IRetrieval.json b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IRetrieval.json index 65473fb4ed..95f4f9b84a 100644 --- a/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IRetrieval.json +++ b/mteb/descriptive_stats/Image/Any2AnyRetrieval/RParisMediumI2IRetrieval.json @@ -1,22 +1,34 @@ { "test": { "number_of_characters": 0, - "num_samples": 2721, + "num_samples": 6392, "num_queries": 70, - "num_documents": 2651, + "num_documents": 6322, "min_document_length": 0, "average_document_length": 0, "max_document_length": 0, "unique_documents": 0, - "num_document_images": 2651, + "min_document_image_width": 256, + "average_document_image_width": 256.0, + "max_document_image_width": 256, + "min_document_image_height": 256, + "average_document_image_height": 256.0, + "max_document_image_height": 256, + "num_document_images": 6322, "min_query_length": 0, "average_query_length": 0, "max_query_length": 0, "unique_queries": 0, "num_query_images": 70, + "min_query_image_width": 256, + "average_query_image_width": 256.0, + "max_query_image_width": 256, + "min_query_image_height": 256, + "average_query_image_height": 256.0, + "max_query_image_height": 256, "min_relevant_docs_per_query": 76, "average_relevant_docs_per_query": 246.05714285714285, "max_relevant_docs_per_query": 636, - "unique_relevant_docs": 2651 + "unique_relevant_docs": 6322 } -} +} \ No newline at end of file diff --git 
a/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py b/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py index 2d46fbcc73..5b64df1b6c 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/__init__.py @@ -4,5 +4,3 @@ from .eng.BLINKIT2TMultiChoice import * from .eng.CVBench import * from .eng.ImageCoDeT2IMultiChoice import * -from .eng.ROxfordI2IMultiChoice import * -from .eng.RParisI2IMultiChoice import * diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py deleted file mode 100644 index 5afaf6bd3a..0000000000 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py +++ /dev/null @@ -1,144 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice -from mteb.abstasks.TaskMetadata import TaskMetadata - - -# NOTE: These tasks are marked as Any2AnyRetrieval types they are the correct implementations of ROxford retrieval and RParis retrieval -# (as it requires masking out the different docs in corpus for every query). This aligns with the MIEB papeer. 
-class ROxfordEasyI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="ROxfordEasyI2IMultiChoice", - description="Retrieve photos of landmarks in Oxford, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-oxford-easy-multi", - "revision": "4c167c3ce529f19457c9b8e694258cc6cf8e7cc7", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - descriptive_stats={ - "n_samples": {"test": 5063}, - "avg_character_length": { - "test": { - "average_document_length": 0.0, - "average_query_length": 0.0, - "num_documents": 4993, - "num_queries": 70, - "average_relevant_docs_per_query": 44.5, - } - }, - }, - ) - skip_first_result = False - - -class ROxfordMediumI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="ROxfordMediumI2IMultiChoice", - description="Retrieve photos of landmarks in Oxford, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-oxford-medium-multi", - "revision": "83bd440268e200a4f60313070618e3f45000fa94", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - 
main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - descriptive_stats={ - "n_samples": {"test": 5063}, - "avg_character_length": { - "test": { - "average_document_length": 0.0, - "average_query_length": 0.0, - "num_documents": 4993, - "num_queries": 70, - "average_relevant_docs_per_query": 78.9, - } - }, - }, - ) - skip_first_result = False - - -class ROxfordHardI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="ROxfordHardI2IMultiChoice", - description="Retrieve photos of landmarks in Oxford, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-oxford-hard-multi", - "revision": "fc7c4ae6655b1e6b132f3b262a359acef42dfce8", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the 
IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - descriptive_stats={ - "n_samples": {"test": 5063}, - "avg_character_length": { - "test": { - "average_document_length": 0.0, - "average_query_length": 0.0, - "num_documents": 4993, - "num_queries": 70, - "average_relevant_docs_per_query": 35.7, - } - }, - }, - ) - skip_first_result = False diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py deleted file mode 100644 index 419afde02f..0000000000 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py +++ /dev/null @@ -1,144 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import AbsTaskAny2AnyMultiChoice -from mteb.abstasks.TaskMetadata import TaskMetadata - - -# NOTE: These tasks are marked as Any2AnyRetrieval types they are the correct implementations of ROxford retrieval and RParis retrieval -# (as it requires masking out the different docs in corpus for every query). This aligns with the MIEB papeer. 
-class RParisEasyI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="RParisEasyI2IMultiChoice", - description="Retrieve photos of landmarks in Paris, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-paris-easy-multi", - "revision": "db94b5afd0014ab8c978f20a0fbcc52da1612a08", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - descriptive_stats={ - "n_samples": {"test": 6392}, - "avg_character_length": { - "test": { - "average_document_length": 0.0, - "average_query_length": 0.0, - "num_documents": 6322, - "num_queries": 70, - "average_relevant_docs_per_query": 98.2, - } - }, - }, - ) - skip_first_result = False - - -class RParisMediumI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="RParisMediumI2IMultiChoice", - description="Retrieve photos of landmarks in Paris, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-paris-medium-multi", - "revision": "372c79fc823e1cebc1d55f8e0039aa239285e177", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - 
main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - descriptive_stats={ - "n_samples": {"test": 6392}, - "avg_character_length": { - "test": { - "average_document_length": 0.0, - "average_query_length": 0.0, - "num_documents": 6322, - "num_queries": 70, - "average_relevant_docs_per_query": 246.1, - } - }, - }, - ) - skip_first_result = False - - -class RParisHardI2IMultiChoice(AbsTaskAny2AnyMultiChoice): - metadata = TaskMetadata( - name="RParisHardI2IMultiChoice", - description="Retrieve photos of landmarks in Paris, UK.", - reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", - dataset={ - "path": "JamieSJS/r-paris-hard-multi", - "revision": "4e5997e48fb2f2f8bf1c8973851dedeb17e09a83", - }, - type="Any2AnyRetrieval", - category="i2i", - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="accuracy", - date=("2009-01-01", "2010-04-01"), - domains=["Web"], - task_subtypes=["Object recognition"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["image"], - sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE 
conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} -} - """, - descriptive_stats={ - "n_samples": {"test": 6392}, - "avg_character_length": { - "test": { - "average_document_length": 0.0, - "average_query_length": 0.0, - "num_documents": 6322, - "num_queries": 70, - "average_relevant_docs_per_query": 147.86, - } - }, - }, - ) - skip_first_result = False diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py index dbec8e6ae7..c7583a45df 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py @@ -1,23 +1,28 @@ from __future__ import annotations +import logging + +from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import MultiChoiceEvaluationMixin from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata +logger = logging.getLogger(__name__) + -class ROxfordEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): +class ROxfordEasyI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="ROxfordEasyI2IRetrieval", description="Retrieve photos of landmarks in Oxford, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-oxford-easy", - "revision": "b71b5f67a93aa63761b79a67bcf28bd2ae590902", + "path": "JamieSJS/r-oxford-easy-multi", + "revision": "4c167c3ce529f19457c9b8e694258cc6cf8e7cc7", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -35,14 +40,14 @@ class ROxfordEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): } """, descriptive_stats={ - "n_samples": {"test": 70}, + "n_samples": 
{"test": 5063}, "avg_character_length": { "test": { "average_document_length": 0.0, "average_query_length": 0.0, - "num_documents": 516, + "num_documents": 4993, "num_queries": 70, - "average_relevant_docs_per_query": 43.3, + "average_relevant_docs_per_query": 44.5, } }, }, @@ -50,20 +55,20 @@ class ROxfordEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): skip_first_result = False -class ROxfordMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): +class ROxfordMediumI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="ROxfordMediumI2IRetrieval", description="Retrieve photos of landmarks in Oxford, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-oxford-medium", - "revision": "1dfb86730ee4b3f49b441f4896d473c83eb5ff0d", + "path": "JamieSJS/r-oxford-medium-multi", + "revision": "83bd440268e200a4f60313070618e3f45000fa94", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -81,12 +86,12 @@ class ROxfordMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): } """, descriptive_stats={ - "n_samples": {"test": 70}, + "n_samples": {"test": 5063}, "avg_character_length": { "test": { "average_document_length": 0.0, "average_query_length": 0.0, - "num_documents": 788, + "num_documents": 4993, "num_queries": 70, "average_relevant_docs_per_query": 78.9, } @@ -96,20 +101,20 @@ class ROxfordMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): skip_first_result = False -class ROxfordHardI2IRetrieval(AbsTaskAny2AnyRetrieval): +class ROxfordHardI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="ROxfordHardI2IRetrieval", description="Retrieve photos of landmarks in Oxford, UK.", 
reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-oxford-hard", - "revision": "f71ab9d4aabcda93d55a7e65edfb3a34767d89e6", + "path": "JamieSJS/r-oxford-hard-multi", + "revision": "fc7c4ae6655b1e6b132f3b262a359acef42dfce8", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -127,12 +132,12 @@ class ROxfordHardI2IRetrieval(AbsTaskAny2AnyRetrieval): } """, descriptive_stats={ - "n_samples": {"test": 70}, + "n_samples": {"test": 5063}, "avg_character_length": { "test": { "average_document_length": 0.0, "average_query_length": 0.0, - "num_documents": 685, + "num_documents": 4993, "num_queries": 70, "average_relevant_docs_per_query": 35.7, } diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py index 8c2f6344fb..4cd698157b 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py @@ -1,23 +1,24 @@ from __future__ import annotations +from mteb.abstasks.Image.AbsTaskAny2AnyMultiChoice import MultiChoiceEvaluationMixin from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata -class RParisEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): +class RParisEasyI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="RParisEasyI2IRetrieval", - description="Retrieve photos of landmarks in Paris, France.", + description="Retrieve photos of landmarks in Paris, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-paris-easy", - 
"revision": "7d821ddebcb30ad343133e3a81e23347ac2a08a8", + "path": "JamieSJS/r-paris-easy-multi", + "revision": "db94b5afd0014ab8c978f20a0fbcc52da1612a08", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -35,12 +36,12 @@ class RParisEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): } """, descriptive_stats={ - "n_samples": {"test": 70}, + "n_samples": {"test": 6392}, "avg_character_length": { "test": { "average_document_length": 0.0, "average_query_length": 0.0, - "num_documents": 1470, + "num_documents": 6322, "num_queries": 70, "average_relevant_docs_per_query": 98.2, } @@ -50,20 +51,20 @@ class RParisEasyI2IRetrieval(AbsTaskAny2AnyRetrieval): skip_first_result = False -class RParisMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): +class RParisMediumI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="RParisMediumI2IRetrieval", - description="Retrieve photos of landmarks in Paris, France.", + description="Retrieve photos of landmarks in Paris, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-paris-medium", - "revision": "3d959815e102785efd628170281f1e65561b03d2", + "path": "JamieSJS/r-paris-medium-multi", + "revision": "372c79fc823e1cebc1d55f8e0039aa239285e177", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -81,14 +82,14 @@ class RParisMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): } """, descriptive_stats={ - "n_samples": {"test": 70}, + "n_samples": {"test": 6392}, "avg_character_length": { "test": { "average_document_length": 0.0, 
"average_query_length": 0.0, - "num_documents": 2651, + "num_documents": 6322, "num_queries": 70, - "average_relevant_docs_per_query": 147.9, + "average_relevant_docs_per_query": 246.1, } }, }, @@ -96,20 +97,20 @@ class RParisMediumI2IRetrieval(AbsTaskAny2AnyRetrieval): skip_first_result = False -class RParisHardI2IRetrieval(AbsTaskAny2AnyRetrieval): +class RParisHardI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="RParisHardI2IRetrieval", - description="Retrieve photos of landmarks in Paris, France.", + description="Retrieve photos of landmarks in Paris, UK.", reference="https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html", dataset={ - "path": "JamieSJS/r-paris-hard", - "revision": "d3e0adf4e942446c04427511ccce281c86861248", + "path": "JamieSJS/r-paris-hard-multi", + "revision": "4e5997e48fb2f2f8bf1c8973851dedeb17e09a83", }, type="Any2AnyRetrieval", category="i2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="cv_recall_at_1", + main_score="map_at_5", date=("2009-01-01", "2010-04-01"), domains=["Web"], task_subtypes=["Object recognition"], @@ -127,14 +128,14 @@ class RParisHardI2IRetrieval(AbsTaskAny2AnyRetrieval): } """, descriptive_stats={ - "n_samples": {"test": 70}, + "n_samples": {"test": 6392}, "avg_character_length": { "test": { "average_document_length": 0.0, "average_query_length": 0.0, "num_documents": 6322, "num_queries": 70, - "average_relevant_docs_per_query": 35.7, + "average_relevant_docs_per_query": 147.86, } }, },