diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py
index dfc42881df..d4ac0cd463 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py
@@ -21,15 +21,20 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No
     def map_function(split_name):
         return lambda x, idx: {
             "id": f"corpus-{split_name}-{idx}",
-            "text": x["text_corrected"] if x["text_corrected"] else "",
+            "text": x["text_corrected"],
             "modality": "text",
             "image": None,
         }
 
-    # Apply the map function to each split and concatenate
+    split_datasets = {}
+    for split in dataset_splits:
+        split_datasets[split] = dataset[split].filter(
+            lambda example: example["text_corrected"] is not None
+        )
+
     shared_corpus = concatenate_datasets(
         [
-            dataset[split].map(
+            split_datasets[split].map(
                 map_function(split),
                 with_indices=True,
                 remove_columns=[
@@ -46,13 +51,11 @@ def map_function(split_name):
             for split in dataset_splits
         ]
     )
-    # image corrupted & caption empty
-    shared_corpus = shared_corpus.select(
-        [i for i in range(len(shared_corpus)) if i not in [4578, 6781, 6784, 6786]]
-    )
+
     for split in splits:
         corpus[split] = shared_corpus
-        split_dataset = dataset[split]
+        split_dataset = split_datasets[split]
+
         queries[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"query-{split}-{idx}",
@@ -72,18 +75,14 @@ def map_function(split_name):
                 "text_corrected",
             ],
         )
-        if split == "test":
-            queries[split] = queries[split].select(
-                [i for i in range(len(queries[split])) if i not in [489, 492, 494]]
-            )
+
         relevant_docs[split] = {}
         for index in range(len(split_dataset)):
-            if index not in [489, 492, 494]:
-                query_id = f"query-{split}-{index}"
-                doc_id = f"corpus-{split}-{index}"
-                if query_id not in relevant_docs[split]:
-                    relevant_docs[split][query_id] = {}
-                relevant_docs[split][query_id][doc_id] = 1
+            query_id = f"query-{split}-{index}"
+            doc_id = f"corpus-{split}-{index}"
+            if query_id not in relevant_docs[split]:
+                relevant_docs[split][query_id] = {}
+            relevant_docs[split][query_id][doc_id] = 1
 
     return corpus, queries, relevant_docs
 
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py
index dff7746b5a..59c27e35b6 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py
@@ -25,10 +25,15 @@ def map_function(split_name):
             "modality": "image",
         }
 
-    # Apply the map function to each split and concatenate
+    split_datasets = {}
+    for split in dataset_splits:
+        split_datasets[split] = dataset[split].filter(
+            lambda example: example["text_corrected"] is not None
+        )
+
     shared_corpus = concatenate_datasets(
         [
-            dataset[split].map(
+            split_datasets[split].map(
                 map_function(split),
                 with_indices=True,
                 remove_columns=[
@@ -45,13 +50,10 @@ def map_function(split_name):
             for split in dataset_splits
         ]
     )
-    # image corrupted
-    shared_corpus = shared_corpus.select(
-        [i for i in range(len(shared_corpus)) if i not in [4578, 6781, 6784, 6786]]
-    )
+
     for split in splits:
         corpus[split] = shared_corpus
-        split_dataset = dataset[split]
+        split_dataset = split_datasets[split]
         queries[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"query-{split}-{idx}",
@@ -71,18 +73,14 @@ def map_function(split_name):
                 "text_corrected",
             ],
         )
-        if split == "test":
-            queries[split] = queries[split].select(
-                [i for i in range(len(queries[split])) if i not in [489, 492, 494]]
-            )
+
         relevant_docs[split] = {}
         for index in range(len(split_dataset)):
-            if index not in [489, 492, 494]:
-                query_id = f"query-{split}-{index}"
-                doc_id = f"corpus-{split}-{index}"
-                if query_id not in relevant_docs[split]:
-                    relevant_docs[split][query_id] = {}
-                relevant_docs[split][query_id][doc_id] = 1
+            query_id = f"query-{split}-{index}"
+            doc_id = f"corpus-{split}-{index}"
+            if query_id not in relevant_docs[split]:
+                relevant_docs[split][query_id] = {}
+            relevant_docs[split][query_id][doc_id] = 1
 
     return corpus, queries, relevant_docs
 