From b78b5f41c17cedaff57f3d9e1403b93a4d2f1d64 Mon Sep 17 00:00:00 2001 From: gowitheflow-1998 Date: Mon, 7 Apr 2025 20:07:08 +0100 Subject: [PATCH 1/3] make memotion code robust --- .../eng/MemotionI2TRetrieval.py | 33 +++++++++---------- .../eng/MemotionT2IRetrieval.py | 29 ++++++++-------- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py index dfc42881df..35350da123 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py @@ -21,15 +21,18 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No def map_function(split_name): return lambda x, idx: { "id": f"corpus-{split_name}-{idx}", - "text": x["text_corrected"] if x["text_corrected"] else "", + "text": x["text_corrected"],# if x["text_corrected"] else "", "modality": "text", "image": None, } - # Apply the map function to each split and concatenate + split_datasets = {} + for split in dataset_splits: + split_datasets[split] = dataset[split].filter(lambda example: example['text_corrected'] != None) + shared_corpus = concatenate_datasets( [ - dataset[split].map( + split_datasets[split].map( map_function(split), with_indices=True, remove_columns=[ @@ -46,13 +49,11 @@ def map_function(split_name): for split in dataset_splits ] ) - # image corrupted & caption empty - shared_corpus = shared_corpus.select( - [i for i in range(len(shared_corpus)) if i not in [4578, 6781, 6784, 6786]] - ) + for split in splits: corpus[split] = shared_corpus - split_dataset = dataset[split] + split_dataset = split_datasets[split] + queries[split] = split_dataset.map( lambda x, idx: { "id": f"query-{split}-{idx}", @@ -72,18 +73,14 @@ def map_function(split_name): "text_corrected", ], ) - if split == "test": - queries[split] = queries[split].select( - [i for i in range(len(queries[split])) if i not in [489, 492, 494]] - ) + relevant_docs[split] = {} for index in range(len(split_dataset)): - if index not in [489, 492, 494]: - query_id = f"query-{split}-{index}" - doc_id = f"corpus-{split}-{index}" - if query_id not in relevant_docs[split]: - relevant_docs[split][query_id] = {} - relevant_docs[split][query_id][doc_id] = 1 + query_id = f"query-{split}-{index}" + doc_id = f"corpus-{split}-{index}" + if query_id not in relevant_docs[split]: + relevant_docs[split][query_id] = {} + relevant_docs[split][query_id][doc_id] = 1 return corpus, queries, relevant_docs diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py index dff7746b5a..b12c2ea20e 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py @@ -25,10 +25,14 @@ def map_function(split_name): "modality": "image", } + split_datasets = {} + for split in dataset_splits: + split_datasets[split] = dataset[split].filter(lambda example: example['text_corrected'] != None) + # Apply the map function to each split and concatenate shared_corpus = concatenate_datasets( [ - dataset[split].map( + split_datasets[split].map( map_function(split), with_indices=True, remove_columns=[ @@ -45,13 +49,10 @@ def map_function(split_name): for split in dataset_splits ] ) - # image corrupted - shared_corpus = shared_corpus.select( - [i for i in range(len(shared_corpus)) if i not in [4578, 6781, 6784, 6786]] - ) + for split in splits: corpus[split] = shared_corpus - split_dataset = dataset[split] + split_dataset = split_datasets[split] queries[split] = split_dataset.map( lambda x, idx: { "id": f"query-{split}-{idx}", @@ -71,18 +72,14 @@ def map_function(split_name): "text_corrected", ], ) - if split == "test": - queries[split] = queries[split].select( - [i for i in range(len(queries[split])) if i not in [489, 492, 494]] - ) + relevant_docs[split] = {} for index in range(len(split_dataset)): - if index not in [489, 492, 494]: - query_id = f"query-{split}-{index}" - doc_id = f"corpus-{split}-{index}" - if query_id not in relevant_docs[split]: - relevant_docs[split][query_id] = {} - relevant_docs[split][query_id][doc_id] = 1 + query_id = f"query-{split}-{index}" + doc_id = f"corpus-{split}-{index}" + if query_id not in relevant_docs[split]: + relevant_docs[split][query_id] = {} + relevant_docs[split][query_id][doc_id] = 1 return corpus, queries, relevant_docs From 4b71a3d06488fbdbebc2fe3921516a669983386a Mon Sep 17 00:00:00 2001 From: gowitheflow-1998 Date: Mon, 7 Apr 2025 20:07:46 +0100 Subject: [PATCH 2/3] lint --- .../Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py | 10 ++++++---- .../Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py index 35350da123..67fac03d90 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py @@ -21,14 +21,16 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No def map_function(split_name): return lambda x, idx: { "id": f"corpus-{split_name}-{idx}", - "text": x["text_corrected"],# if x["text_corrected"] else "", + "text": x["text_corrected"], # if x["text_corrected"] else "", "modality": "text", "image": None, } split_datasets = {} for split in dataset_splits: - split_datasets[split] = dataset[split].filter(lambda example: example['text_corrected'] != None) + split_datasets[split] = dataset[split].filter( + lambda example: example["text_corrected"] != None + ) shared_corpus = concatenate_datasets( [ @@ -49,11 +51,11 @@ def map_function(split_name): for split in dataset_splits ] ) - + for split in splits: corpus[split] = shared_corpus split_dataset = split_datasets[split] - + queries[split] = split_dataset.map( lambda x, idx: { "id": f"query-{split}-{idx}", diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py index b12c2ea20e..1a4a9353cc 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py @@ -27,7 +27,9 @@ def map_function(split_name): split_datasets = {} for split in dataset_splits: - split_datasets[split] = dataset[split].filter(lambda example: example['text_corrected'] != None) + split_datasets[split] = dataset[split].filter( + lambda example: example["text_corrected"] != None + ) # Apply the map function to each split and concatenate shared_corpus = concatenate_datasets( @@ -72,7 +74,7 @@ def map_function(split_name): "text_corrected", ], ) - + relevant_docs[split] = {} for index in range(len(split_dataset)): query_id = f"query-{split}-{index}" From 40986ca40681eca7e0521293ce239e64f7369ff6 Mon Sep 17 00:00:00 2001 From: gowitheflow-1998 Date: Mon, 7 Apr 2025 20:14:27 +0100 Subject: [PATCH 3/3] clean comment --- mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py | 2 +- mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py index 67fac03d90..d4ac0cd463 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py @@ -21,7 +21,7 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No def map_function(split_name): return lambda x, idx: { "id": f"corpus-{split_name}-{idx}", - "text": x["text_corrected"], # if x["text_corrected"] else "", + "text": x["text_corrected"], "modality": "text", "image": None, } diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py index 1a4a9353cc..59c27e35b6 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py @@ -31,7 +31,6 @@ def map_function(split_name): lambda example: example["text_corrected"] != None ) - # Apply the map function to each split and concatenate shared_corpus = concatenate_datasets( [ split_datasets[split].map(