24 changes: 6 additions & 18 deletions mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py
@@ -18,11 +18,11 @@ class SNLHierarchicalClusteringP2P(AbsTaskClusteringFast):
     metadata = TaskMetadata(
         name="SNLHierarchicalClusteringP2P",
         dataset={
-            "path": "navjordj/SNL_summarization",
-            "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1",
+            "path": "mteb/SNLHierarchicalClusteringP2P",
+            "revision": "693a321c42fb13ffe76bb9043f8d2aaa8f0a9499",
         },
         description="Webscraped articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses article categories as clusters.",
-        reference="https://huggingface.co/datasets/navjordj/SNL_summarization",
+        reference="https://huggingface.co/datasets/mteb/SNLHierarchicalClusteringP2P",
         type="Clustering",
         category="p2p",
         modalities=["text"],
@@ -48,12 +48,6 @@ class SNLHierarchicalClusteringP2P(AbsTaskClusteringFast):
     )
     max_depth = 5
 
-    def dataset_transform(self) -> None:
-        self.dataset = self.dataset.rename_columns(
-            {"article": "sentences", "category": "labels"}
-        )
-        self.dataset = self.dataset.map(split_labels)
-
 
 class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast):
     max_document_to_embed = 1300
@@ -62,11 +56,11 @@ class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast):
     metadata = TaskMetadata(
         name="SNLHierarchicalClusteringS2S",
         dataset={
-            "path": "navjordj/SNL_summarization",
-            "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1",
+            "path": "mteb/SNLHierarchicalClusteringS2S",
+            "revision": "b505e4ce65f255228e49dd07b6f8148731c5dc64",
         },
         description="Webscraped articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses article categories as clusters.",
-        reference="https://huggingface.co/datasets/navjordj/SNL_summarization",
+        reference="https://huggingface.co/datasets/mteb/SNLHierarchicalClusteringS2S",
         type="Clustering",
         category="s2s",
         modalities=["text"],
@@ -91,9 +85,3 @@ class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast):
         prompt="Identify categories in a Norwegian lexicon",
     )
     max_depth = 5
-
-    def dataset_transform(self) -> None:
-        self.dataset = self.dataset.rename_columns(
-            {"ingress": "sentences", "category": "labels"}
-        )
-        self.dataset = self.dataset.map(split_labels)
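
Note: both removed `dataset_transform` hooks only renamed columns (`article`/`ingress` to `sentences`, `category` to `labels`) and mapped `split_labels`; that preprocessing is now baked into the hosted `mteb/` datasets. A minimal sanity check, assuming only the `datasets` library and the column names shown in the deleted code, that the hosted data already carries the transformed schema:

```python
# Sketch, not part of the PR: confirm the hosted dataset is pre-transformed,
# so no dataset_transform is needed at load time.
from datasets import load_dataset

ds = load_dataset(
    "mteb/SNLHierarchicalClusteringP2P",
    revision="693a321c42fb13ffe76bb9043f8d2aaa8f0a9499",
)
for split in ds:
    # The removed transform produced exactly these two columns.
    assert {"sentences", "labels"} <= set(ds[split].column_names)
```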
55 changes: 3 additions & 52 deletions mteb/tasks/Clustering/nob/snl_clustering.py
@@ -1,12 +1,9 @@
 from __future__ import annotations
 
-import random
 from collections.abc import Iterable
 from itertools import islice
 from typing import TypeVar
 
-import datasets
-
 from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
@@ -27,11 +24,11 @@ class SNLClustering(AbsTaskClustering):
     metadata = TaskMetadata(
         name="SNLClustering",
         dataset={
-            "path": "navjordj/SNL_summarization",
-            "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1",
+            "path": "mteb/SNLClustering",
+            "revision": "e1c801d5a6fe26c89d5e878181246f5b292e6549",
         },
         description="Webscraped articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses article categories as clusters.",
-        reference="https://huggingface.co/datasets/navjordj/SNL_summarization",
+        reference="https://huggingface.co/datasets/mteb/SNLClustering",
         type="Clustering",
         category="p2p",
         modalities=["text"],
@@ -54,49 +51,3 @@ class SNLClustering(AbsTaskClustering):
         }
         """,
     )
-
-    def dataset_transform(self):
-        splits = self.metadata_dict["eval_splits"]
-
-        documents: list = []
-        labels: list = []
-        label_col = "category"
-
-        ds = {}
-        for split in splits:
-            ds_split = self.dataset[split]
-
-            _label = self.normalize_labels(ds_split[label_col])
-            documents.extend(ds_split["ingress"])
-            labels.extend(_label)
-
-            documents.extend(ds_split["article"])
-            labels.extend(_label)
-
-            assert len(documents) == len(labels)
-
-            rng = random.Random(42)  # local only seed
-            pairs = list(zip(documents, labels))
-            rng.shuffle(pairs)
-            documents, labels = (list(collection) for collection in zip(*pairs))
-
-            # reduce size of dataset to not have too large datasets in the clustering task
-            documents_batched = list(batched(documents, 512))[:4]
-            labels_batched = list(batched(labels, 512))[:4]
-
-            ds[split] = datasets.Dataset.from_dict(
-                {
-                    "sentences": documents_batched,
-                    "labels": labels_batched,
-                }
-            )
-
-        self.dataset = datasets.DatasetDict(ds)
-
-    @staticmethod
-    def normalize_labels(labels: list[str]) -> list[str]:
-        # example label:
-        # Store norske leksikon,Kunst og estetikk,Musikk,Klassisk musikk,Internasjonale dirigenter
-        # When using 2 levels there is 17 unique labels
-        # When using 3 levels there is 121 unique labels
-        return [",".join(tuple(label.split(",")[:3])) for label in labels]
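
Note: the removed transform leaned on a module-level `batched` helper that this diff never shows; the `Iterable`/`islice`/`TypeVar` imports surviving the first hunk suggest it follows the standard itertools recipe. A sketch of that assumption, together with the three-level truncation the deleted `normalize_labels` performed:

```python
# Assumed reconstruction of the undiffed `batched` helper (itertools recipe),
# plus the label truncation from the deleted normalize_labels.
from collections.abc import Iterable, Iterator
from itertools import islice
from typing import TypeVar

T = TypeVar("T")


def batched(iterable: Iterable[T], n: int) -> Iterator[tuple[T, ...]]:
    # Yield successive n-sized tuples until the iterable is exhausted.
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


label = "Store norske leksikon,Kunst og estetikk,Musikk,Klassisk musikk,Internasjonale dirigenter"
print(",".join(label.split(",")[:3]))
# -> Store norske leksikon,Kunst og estetikk,Musikk  (121 unique labels at depth 3)
```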
4 changes: 2 additions & 2 deletions mteb/tasks/Retrieval/nob/snl_retrieval.py
@@ -10,11 +10,11 @@ class SNLRetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="SNLRetrieval",
         dataset={
-            "path": "navjordj/SNL_summarization",
+            "path": "adrlau/navjordj-SNL_summarization_copy",  # TODO: replace with mteb/SNLRetrieval after #2820 is resolved.
             "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1",
         },
         description="Webscraped articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.",
-        reference="https://huggingface.co/datasets/navjordj/SNL_summarization",
+        reference="https://huggingface.co/datasets/mteb/SNLRetrieval",
         type="Retrieval",
         category="p2p",
         modalities=["text"],
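
Note: the retrieval task keeps pointing at the raw source data through a temporary mirror until the tracked issue is resolved. Assuming, as mteb loaders conventionally do, that the `dataset` dict is forwarded to `datasets.load_dataset`, the pinned `revision` is what keeps the mirror reproducible:

```python
# Illustration only (an assumption about how mteb consumes the dataset dict):
# path and revision are forwarded to datasets.load_dataset, so the pinned
# commit shields the task from upstream edits to the mirror.
from datasets import load_dataset

ds = load_dataset(
    "adrlau/navjordj-SNL_summarization_copy",
    revision="3d3d27aa7af8941408cefc3991ada5d12a4273d1",
)
print({split: ds[split].num_rows for split in ds})
```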
4 changes: 0 additions & 4 deletions tests/test_tasks/test_all_abstasks.py
@@ -34,10 +34,6 @@

 datasets_not_available = [
     "AfriSentiLangClassification",
-    "SNLHierarchicalClusteringP2P",
-    "SNLClustering",
-    "SNLHierarchicalClusteringS2S",
-    "SNLRetrieval",
 ]


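Note: with the four SNL tasks dropped from `datasets_not_available`, their availability checks run again now that the data is hosted (or mirrored) under stable paths. A hypothetical sketch of how such a skip list is typically consumed; the real test body sits outside this diff:

```python
# Hypothetical consumption of the skip list; names and structure are assumed,
# only `datasets_not_available` itself appears in the diff above.
import pytest

datasets_not_available = ["AfriSentiLangClassification"]


@pytest.mark.parametrize("task_name", ["SNLRetrieval", "AfriSentiLangClassification"])
def test_dataset_available(task_name: str) -> None:
    if task_name in datasets_not_available:
        pytest.skip(f"{task_name} is known to be unavailable upstream")
    # ... actually try loading the task's dataset here ...
```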