Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions mteb/abstasks/AbsTaskSpectralClustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
from __future__ import annotations

import logging
from collections import Counter
from typing import Any

import networkx as nx
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to wrap this import in try...except. I think this should be moved inside __call__

import numpy as np
import sklearn
import sklearn.cluster
import tqdm
from datasets import Dataset
from sklearn import metrics

from mteb.encoder_interface import Encoder

from ..evaluation.evaluators import Evaluator
from ..evaluation.evaluators.utils import cos_sim
from ..load_results.task_results import ScoresDict
from .AbsTask import AbsTask
from .TaskMetadata import DescriptiveStatistics

logger = logging.getLogger(__name__)


class SpectralClusteringEvaluator(Evaluator):
    """Evaluate an Encoder on one clustering set via spectral clustering.

    The sentences are embedded, a fully connected similarity graph is built
    (cosine similarity by default, or the model's own ``similarity`` method
    when available), negative similarities are clipped to zero, and
    scikit-learn's ``SpectralClustering`` with a precomputed affinity matrix
    assigns cluster labels. The result is scored with the V-measure.

    Args:
        sentences: Texts of a single cluster set.
        labels: Ground-truth cluster label for each sentence.
        task_name: Task name forwarded to ``model.encode``.
        clustering_batch_size: Kept for interface compatibility; currently
            unused by the spectral path (the full affinity matrix is built).
        limit: Deprecated; truncates sentences/labels. Will be removed in 2.0.
    """

    def __init__(
        self,
        sentences,
        labels,
        task_name: str | None = None,
        clustering_batch_size: int = 500,
        limit: int | None = None,  # deprecated: scheduled for removal in 2.0
        **kwargs,
    ):
        super().__init__(**kwargs)
        if limit is not None:
            sentences = sentences[:limit]
            labels = labels[:limit]
        self.sentences = sentences
        self.labels = labels
        self.clustering_batch_size = clustering_batch_size
        self.task_name = task_name

    def __call__(
        self, model: Encoder, *, encode_kwargs: dict[str, Any] | None = None
    ):
        """Cluster the sentences with `model` embeddings and return {"v_measure": float}."""
        # Copy instead of mutating: a `dict = {}` default (or the caller's
        # dict) must never be modified in place — the original code leaked
        # `batch_size` back into the caller's kwargs across invocations.
        encode_kwargs = dict(encode_kwargs) if encode_kwargs else {}
        encode_kwargs.setdefault("batch_size", 32)

        corpus_embeddings = model.encode(
            self.sentences,
            task_name=self.task_name,
            **encode_kwargs,
        )

        ## Build an adjacency matrix whose edge weights are pairwise
        ## similarities (default: cosine similarity). Nodes are sentence
        ## *indices*, not the sentence text: keying nodes by text would
        ## collapse duplicate sentences into a single node, so the matrix
        ## would no longer align row-for-row with self.labels.
        logger.info("Building a graph model...")
        n = len(self.sentences)
        graph = nx.Graph()
        graph.add_nodes_from(range(n))
        similarity_fn = getattr(model, "similarity", None)
        for i in range(n - 1):
            if similarity_fn is not None:
                score_list = (
                    similarity_fn(corpus_embeddings[i], corpus_embeddings[i + 1 :])[0]
                    * 100
                )
            else:
                score_list = (
                    cos_sim(corpus_embeddings[i], corpus_embeddings[i + 1 :])[0] * 100
                )
            for j, score in enumerate(score_list, start=i + 1):
                graph.add_edge(i, j, weight=score)

        ## Convert to a dense array in sentence order and replace negative
        ## values with 0: SpectralClustering requires a non-negative
        ## affinity matrix.
        adjacency_cos_score_matrix = nx.to_numpy_array(graph, nodelist=range(n))
        adjacency_cos_score_matrix = np.where(
            adjacency_cos_score_matrix < 0, 0, adjacency_cos_score_matrix
        )

        ## Spectral clustering on the precomputed affinity matrix; the number
        ## of clusters is taken from the number of distinct gold labels.
        clustering = sklearn.cluster.SpectralClustering(
            n_clusters=len(set(self.labels)),
            affinity="precomputed",
            assign_labels="discretize",
        )
        clustering.fit(adjacency_cos_score_matrix)
        cluster_assignment = clustering.labels_

        logger.info("Evaluating...")
        v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment)

        return {"v_measure": v_measure}


class AbsTaskSpectralClustering(AbsTask):
    """Abstract task for clustering texts via spectral clustering.

    The dataset is expected to contain rows with:
        sentences: list[str] — the texts of one cluster set
        labels: list[str | int] — the ground-truth cluster label per text

    Each row (cluster set) is scored independently with
    ``SpectralClusteringEvaluator`` and the per-set V-measures are averaged.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _add_main_score(self, scores) -> None:
        # Expose the metric declared in the task metadata as "main_score".
        scores["main_score"] = scores[self.metadata.main_score]

    def _evaluate_subset(
        self,
        model: Encoder,
        dataset: Dataset,
        *,
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> ScoresDict:
        """Evaluate `model` on every cluster set and aggregate the V-measures.

        Args:
            model: Encoder used to embed the sentences.
            dataset: Dataset whose rows each hold "sentences" and "labels" lists.
            encode_kwargs: Extra keyword arguments forwarded to ``model.encode``.
                ``None`` (the default) means an empty dict; a ``{}`` default
                would be a shared mutable and is deliberately avoided.
            **kwargs: Forwarded to SpectralClusteringEvaluator.

        Returns:
            ScoresDict with the mean/std V-measure and the per-set values.
        """
        if encode_kwargs is None:
            encode_kwargs = {}
        v_measures = []
        for cluster_set in tqdm.tqdm(dataset, desc="Clustering"):
            evaluator = SpectralClusteringEvaluator(
                cluster_set["sentences"],  # type: ignore
                cluster_set["labels"],  # type: ignore
                task_name=self.metadata.name,
                **kwargs,
            )
            # Named `results` (not `metrics`) so the module-level
            # `from sklearn import metrics` import is not shadowed.
            results = evaluator(model, encode_kwargs=encode_kwargs)
            v_measures.append(results["v_measure"])

        v_mean = np.mean(v_measures)
        v_std = np.std(v_measures)
        scores = {"v_measure": v_mean, "v_measure_std": v_std, "v_measures": v_measures}
        self._add_main_score(scores)
        return scores

    def _calculate_metrics_from_split(
        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
    ) -> ClusteringDescriptiveStatistics:
        """Compute descriptive statistics for one split (optionally one subset)."""
        if hf_subset:
            sentences = self.dataset[hf_subset][split]["sentences"]
            labels = self.dataset[hf_subset][split]["labels"]
        elif compute_overall:
            sentences = []
            labels = []
            for hf_subset in self.metadata.eval_langs:
                sentences.extend(self.dataset[hf_subset][split]["sentences"])
                labels.extend(self.dataset[hf_subset][split]["labels"])
        else:
            sentences = self.dataset[split]["sentences"]
            labels = self.dataset[split]["labels"]

        # NOTE(review): `sentences` is a list of cluster sets (list[list[str]]),
        # so len(t) counts sentences per set, not characters per text — confirm
        # this matches the intended semantics of the *_text_length fields.
        text_len = [len(t) for t in sentences]
        all_sentences = []
        for s in sentences:
            all_sentences.extend(s)
        total_text_len = sum(text_len)
        total_labels = []
        for label in labels:
            # Labels may be stored flat or as one list per cluster set.
            if isinstance(label, list):
                total_labels.extend(label)
            else:
                total_labels.append(label)
        label_counter = Counter(total_labels)
        return ClusteringDescriptiveStatistics(
            num_samples=len(sentences),
            number_of_characters=total_text_len,
            min_text_length=min(text_len),
            average_text_length=total_text_len / len(sentences),
            max_text_length=max(text_len),
            unique_texts=len(set(all_sentences)),
            min_labels_per_text=min(label_counter.values()),
            average_labels_per_text=len(total_labels) / len(sentences),
            max_labels_per_text=max(label_counter.values()),
            unique_labels=len(label_counter),
            labels={
                str(label): {
                    "count": value,
                }
                for label, value in label_counter.items()
            },
        )


class ClusteringDescriptiveStatistics(DescriptiveStatistics):
    """Descriptive statistics for Clustering tasks.

    Attributes:
        num_samples: number of samples (cluster sets) in the dataset.
        number_of_characters: Total number of symbols in the dataset.

        min_text_length: Minimum length of text
        average_text_length: Average length of text
        max_text_length: Maximum length of text
        unique_texts: Number of unique texts

        min_labels_per_text: Minimum number of labels per text
        average_labels_per_text: Average number of labels per text
        max_labels_per_text: Maximum number of labels per text
        unique_labels: Number of unique labels
        labels: dict mapping each label (as str) to its frequency, e.g.
            {"sports": {"count": 42}}
    """

    num_samples: int
    number_of_characters: int

    min_text_length: int
    average_text_length: float
    max_text_length: int
    unique_texts: int

    min_labels_per_text: int
    average_labels_per_text: float
    max_labels_per_text: int

    unique_labels: int
    labels: dict[str, dict[str, int]]
1 change: 1 addition & 0 deletions mteb/abstasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .AbsTaskPairClassification import *
from .AbsTaskReranking import *
from .AbsTaskRetrieval import *
from .AbsTaskSpectralClustering import *
from .AbsTaskSpeedTask import *
from .AbsTaskSTS import *
from .AbsTaskSummarization import *
Expand Down
62 changes: 62 additions & 0 deletions mteb/tasks/Clustering/kor/KlueMrcDomainSpectralClustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskSpectralClustering import AbsTaskSpectralClustering
from mteb.abstasks.TaskMetadata import TaskMetadata


class KlueMrcDomainClustering(AbsTaskSpectralClustering):
    """Spectral-clustering task over KLUE-MRC passages grouped by domain.

    NOTE(review): the class name omits "Spectral" while the metadata name and
    the filename use "KlueMrcDomainSpectralClustering" — consider renaming for
    consistency with KlueYnatMrcCategorySpectralClustering.
    """

    metadata = TaskMetadata(
        name="KlueMrcDomainSpectralClustering",
        description="this dataset is a processed and redistributed version of the KLUE-MRC dataset. Domain: Game / Media / Automotive / Finance / Real Estate / Education",
        reference="https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_context_domain",
        type="Clustering",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["kor-Hang"],
        main_score="v_measure",
        dataset={
            "path": "on-and-on/clustering_klue_mrc_context_domain",
            "revision": "a814b5ef0b6814991785f2c31af8e38ef7bb3f0d",
        },
        date=("2016-01-01", "2020-12-31"),
        domains=["News", "Written"],
        task_subtypes=[],
        license="cc-by-sa-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@misc{park2021klue,
      title={KLUE: Korean Language Understanding Evaluation},
      author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho},
      year={2021},
      eprint={2105.09680},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
}""",
        prompt="Identify the topic or theme of the given texts",
    )

    def dataset_transform(self):
        """Reshape the raw dataset into the clustering format.

        Renames the text/label columns and wraps the whole eval split into a
        single cluster set (one list of sentences, one list of labels), as
        expected by AbsTaskSpectralClustering._evaluate_subset.
        """
        documents: list = []
        labels: list = []

        split = self.metadata.eval_splits[0]
        ds = {}

        self.dataset = self.dataset.rename_columns(
            {"text": "sentences", "label": "labels"}
        )

        # One row == one cluster set: append the full split as a single set.
        documents.append(self.dataset[split]["sentences"])
        labels.append(self.dataset[split]["labels"])

        ds[split] = datasets.Dataset.from_dict(
            {
                "sentences": documents,
                "labels": labels,
            }
        )
        self.dataset = datasets.DatasetDict(ds)
62 changes: 62 additions & 0 deletions mteb/tasks/Clustering/kor/KlueYnatMrcCategorySpectralClustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskSpectralClustering import AbsTaskSpectralClustering
from mteb.abstasks.TaskMetadata import TaskMetadata


class KlueYnatMrcCategorySpectralClustering(AbsTaskSpectralClustering):
    """Spectral-clustering task over KLUE-Ynat/KLUE-MRC titles grouped by news category."""

    metadata = TaskMetadata(
        name="KlueYnatMrcCategorySpectralClustering",
        description="this dataset is a processed and redistributed version of the KLUE-Ynat & KLUE-MRC dataset. News_category: IT/Science, Sports, Media/Culture, Ecomomy/Finance, Real Estate",
        reference="https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_ynat_title",
        type="Clustering",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["kor-Hang"],
        main_score="v_measure",
        dataset={
            "path": "on-and-on/clustering_klue_mrc_ynat_title",
            "revision": "5bbded98f39e3bf6e81e15aa79c6616008519e29",
        },
        date=("2016-01-01", "2020-12-31"),
        domains=["News", "Written"],
        task_subtypes=[],
        license="cc-by-sa-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@misc{park2021klue,
      title={KLUE: Korean Language Understanding Evaluation},
      author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho},
      year={2021},
      eprint={2105.09680},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
}""",
        prompt="Identify the topic or theme of the given texts",
    )

    def dataset_transform(self):
        """Reshape the raw dataset into the clustering format.

        Renames the text/label columns and wraps the whole eval split into a
        single cluster set (one list of sentences, one list of labels), as
        expected by AbsTaskSpectralClustering._evaluate_subset.
        """
        documents: list = []
        labels: list = []

        split = self.metadata.eval_splits[0]
        ds = {}

        self.dataset = self.dataset.rename_columns(
            {"text": "sentences", "label": "labels"}
        )

        # One row == one cluster set: append the full split as a single set.
        documents.append(self.dataset[split]["sentences"])
        labels.append(self.dataset[split]["labels"])

        ds[split] = datasets.Dataset.from_dict(
            {
                "sentences": documents,
                "labels": labels,
            }
        )
        self.dataset = datasets.DatasetDict(ds)