diff --git a/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py b/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py index 19a3d879c4..8d471ba0af 100644 --- a/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py +++ b/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py @@ -18,11 +18,11 @@ class SNLHierarchicalClusteringP2P(AbsTaskClusteringFast): metadata = TaskMetadata( name="SNLHierarchicalClusteringP2P", dataset={ - "path": "navjordj/SNL_summarization", - "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", + "path": "mteb/SNLHierarchicalClusteringP2P", + "revision": "693a321c42fb13ffe76bb9043f8d2aaa8f0a9499", }, description="Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses articles categories as clusters.", - reference="https://huggingface.co/datasets/navjordj/SNL_summarization", + reference="https://huggingface.co/datasets/mteb/SNLHierarchicalClusteringP2P", type="Clustering", category="p2p", modalities=["text"], @@ -48,12 +48,6 @@ class SNLHierarchicalClusteringP2P(AbsTaskClusteringFast): ) max_depth = 5 - def dataset_transform(self) -> None: - self.dataset = self.dataset.rename_columns( - {"article": "sentences", "category": "labels"} - ) - self.dataset = self.dataset.map(split_labels) - class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast): max_document_to_embed = 1300 @@ -62,11 +56,11 @@ class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast): metadata = TaskMetadata( name="SNLHierarchicalClusteringS2S", dataset={ - "path": "navjordj/SNL_summarization", - "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", + "path": "mteb/SNLHierarchicalClusteringS2S", + "revision": "b505e4ce65f255228e49dd07b6f8148731c5dc64", }, description="Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses articles categories as clusters.", - reference="https://huggingface.co/datasets/navjordj/SNL_summarization", + reference="https://huggingface.co/datasets/mteb/SNLHierarchicalClusteringS2S", type="Clustering", category="s2s", modalities=["text"], @@ -91,9 +85,3 @@ class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast): prompt="Identify categories in a Norwegian lexicon", ) max_depth = 5 - - def dataset_transform(self) -> None: - self.dataset = self.dataset.rename_columns( - {"ingress": "sentences", "category": "labels"} - ) - self.dataset = self.dataset.map(split_labels) diff --git a/mteb/tasks/Clustering/nob/snl_clustering.py b/mteb/tasks/Clustering/nob/snl_clustering.py index ae63ba1983..524bcb048f 100644 --- a/mteb/tasks/Clustering/nob/snl_clustering.py +++ b/mteb/tasks/Clustering/nob/snl_clustering.py @@ -1,12 +1,9 @@ from __future__ import annotations -import random from collections.abc import Iterable from itertools import islice from typing import TypeVar -import datasets - from mteb.abstasks.AbsTaskClustering import AbsTaskClustering from mteb.abstasks.TaskMetadata import TaskMetadata @@ -27,11 +24,11 @@ class SNLClustering(AbsTaskClustering): metadata = TaskMetadata( name="SNLClustering", dataset={ - "path": "navjordj/SNL_summarization", - "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", + "path": "mteb/SNLClustering", + "revision": "e1c801d5a6fe26c89d5e878181246f5b292e6549", }, description="Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses articles categories as clusters.", - reference="https://huggingface.co/datasets/navjordj/SNL_summarization", + reference="https://huggingface.co/datasets/mteb/SNLClustering", type="Clustering", category="p2p", modalities=["text"], @@ -54,49 +51,3 @@ class SNLClustering(AbsTaskClustering): } """, ) - - def dataset_transform(self): - splits = self.metadata_dict["eval_splits"] - - documents: list = [] - labels: list = [] - label_col = "category" - - ds = {} - for split in splits: - ds_split = self.dataset[split] - - _label = self.normalize_labels(ds_split[label_col]) - documents.extend(ds_split["ingress"]) - labels.extend(_label) - - documents.extend(ds_split["article"]) - labels.extend(_label) - - assert len(documents) == len(labels) - - rng = random.Random(42) # local only seed - pairs = list(zip(documents, labels)) - rng.shuffle(pairs) - documents, labels = (list(collection) for collection in zip(*pairs)) - - # reduce size of dataset to not have too large datasets in the clustering task - documents_batched = list(batched(documents, 512))[:4] - labels_batched = list(batched(labels, 512))[:4] - - ds[split] = datasets.Dataset.from_dict( - { - "sentences": documents_batched, - "labels": labels_batched, - } - ) - - self.dataset = datasets.DatasetDict(ds) - - @staticmethod - def normalize_labels(labels: list[str]) -> list[str]: - # example label: - # Store norske leksikon,Kunst og estetikk,Musikk,Klassisk musikk,Internasjonale dirigenter - # When using 2 levels there is 17 unique labels - # When using 3 levels there is 121 unique labels - return [",".join(tuple(label.split(",")[:3])) for label in labels] diff --git a/mteb/tasks/Retrieval/nob/snl_retrieval.py b/mteb/tasks/Retrieval/nob/snl_retrieval.py index 9d2016a7c5..2dfe630390 100644 --- a/mteb/tasks/Retrieval/nob/snl_retrieval.py +++ b/mteb/tasks/Retrieval/nob/snl_retrieval.py @@ -10,11 +10,11 @@ class SNLRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="SNLRetrieval", dataset={ - "path": "navjordj/SNL_summarization", + "path": "adrlau/navjordj-SNL_summarization_copy", # TODO: replace with mteb/SNLRetrieval after #2820 is resolved. "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", }, description="Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.", - reference="https://huggingface.co/datasets/navjordj/SNL_summarization", + reference="https://huggingface.co/datasets/mteb/SNLRetrieval", type="Retrieval", category="p2p", modalities=["text"], diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 55e33e994a..54210e6884 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -34,10 +34,6 @@ datasets_not_available = [ "AfriSentiLangClassification", - "SNLHierarchicalClusteringP2P", - "SNLClustering", - "SNLHierarchicalClusteringS2S", - "SNLRetrieval", ]