diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
index b425985041..fa6978e171 100644
--- a/mteb/abstasks/TaskMetadata.py
+++ b/mteb/abstasks/TaskMetadata.py
@@ -60,6 +60,7 @@
     "Tumor detection",
     "Duplicate Detection",
     "Gender Clustering",
+    "Voice Emotion Clustering",
 ]
 
 TASK_DOMAIN = Literal[
@@ -127,6 +128,7 @@
 TASK_CATEGORY = Literal[
+    "a2a",  # Audio-to-audio
     "s2s",  # Sentence-to-sentence
     "s2p",  # Sentence-to-paragraph
     "p2p",  # Paragraph-to-paragraph
diff --git a/mteb/evaluation/evaluators/Audio/ClusteringEvaluator.py b/mteb/evaluation/evaluators/Audio/ClusteringEvaluator.py
index 0d244fc970..1d7b1fad69 100644
--- a/mteb/evaluation/evaluators/Audio/ClusteringEvaluator.py
+++ b/mteb/evaluation/evaluators/Audio/ClusteringEvaluator.py
@@ -3,18 +3,20 @@
 import logging
 from typing import Any
 
+import numpy as np
 import sklearn
 import sklearn.cluster
 from datasets import Audio
 from scipy.optimize import linear_sum_assignment
 from sklearn import metrics
+import random
+from sklearn.decomposition import PCA
 
 from mteb.encoder_interface import Encoder
 from mteb.evaluation.evaluators.Evaluator import Evaluator
 
 logger = logging.getLogger(__name__)
 
-
 class AudioClusteringEvaluator(Evaluator):
@@ -23,16 +25,41 @@ def __init__(
         self,
         task_name: str | None = None,
         clustering_batch_size: int = 500,
         limit: int | None = None,
+        cluster_algo: str = "KMeans",
         **kwargs,
     ):
         super().__init__(**kwargs)
         if limit is not None:
             audio = audio[:limit]
             labels = labels[:limit]
+
+        random.seed(42)
+        combined = list(zip(audio, labels))
+        random.shuffle(combined)
+        audio, labels = map(list, zip(*combined))
+
         self.audio = audio
         self.labels = labels
         self.clustering_batch_size = clustering_batch_size
         self.task_name = task_name
+        self.cluster_algo = cluster_algo
+
+    def __clustering__(self):
+        if self.cluster_algo == "KMeans":
+            logger.info("Fitting Mini-Batch K-Means model...")
+            clustering_model = sklearn.cluster.MiniBatchKMeans(
+                n_clusters=len(set(self.labels)),
+                batch_size=self.clustering_batch_size,
+                n_init="auto",
+            )
+        elif self.cluster_algo == "DBSCAN":
+            # need to plot out the distribution of the embeddings to decide on parameters for DBSCAN
+            logger.info("Fitting DBSCAN model...")
+            clustering_model = sklearn.cluster.DBSCAN(eps=0.5, min_samples=5, metric="euclidean")
+        elif self.cluster_algo == "Agg":
+            logger.info("Fitting Agglomerative model...")
+            clustering_model = sklearn.cluster.AgglomerativeClustering(n_clusters=len(set(self.labels)), linkage="average", metric="cosine")
+        return clustering_model
 
     def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
         if "batch_size" not in encode_kwargs:
@@ -44,13 +71,13 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
         )
 
         logger.info("Fitting Mini-Batch K-Means model...")
-        clustering_model = sklearn.cluster.MiniBatchKMeans(
-            n_clusters=len(set(self.labels)),
-            batch_size=self.clustering_batch_size,
-            n_init="auto",
-        )
-        clustering_model.fit(audio_embeddings)
-        cluster_assignment = clustering_model.labels_
+
+        pca = PCA(n_components=min(200, *audio_embeddings.shape))
+        audio_embeddings = pca.fit_transform(audio_embeddings)
+
+        clustering_output = self.__clustering__()
+        clustering_output.fit(audio_embeddings)
+        cluster_assignment = clustering_output.labels_
 
         logger.info("Evaluating...")
         v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment)
@@ -61,6 +88,8 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
         matrix = metrics.confusion_matrix(self.labels, cluster_assignment)
+        silhouette = float(metrics.silhouette_score(audio_embeddings, cluster_assignment, metric="euclidean"))
+        logger.info(f"Clustering algorithm: {self.cluster_algo}")
         # get linear sum assignment
         row_ind, col_ind = linear_sum_assignment(matrix, maximize=True)
         total_correct = matrix[row_ind, col_ind].sum()
@@ -71,4 +100,5 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
             "nmi": nmi,
             "ari": ari,
             "cluster_accuracy": clustering_accuracy,
-        }
+            "silhouette": silhouette,
+        }
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index 05278fa664..c9979e78c7 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -72,6 +72,9 @@
     voyage_models,
     voyage_v,
     wav2vec_models,
+    wavlm_models,
+    whisper_models,
+    qwen_models,
 )
 
 logger = logging.getLogger(__name__)
@@ -138,6 +141,9 @@
     voyage_models,
     fa_models,
     wav2vec_models,
+    wavlm_models,
+    whisper_models,
+    qwen_models,
 ]
 
 MODEL_REGISTRY = {}
diff --git a/mteb/models/qwen_models.py b/mteb/models/qwen_models.py
new file mode 100644
index 0000000000..0355996f71
--- /dev/null
+++ b/mteb/models/qwen_models.py
@@ -0,0 +1,81 @@
+from functools import partial
+from mteb.models.wrapper import Wrapper
+from mteb.encoder_interface import PromptType, AudioEncoder
+import numpy as np
+import torch
+import librosa
+from transformers import AutoFeatureExtractor, Qwen2AudioForConditionalGeneration, AutoProcessor
+from mteb.model_meta import ModelMeta
+from datasets import Audio
+
+
+class Qwen2AudioWrapper(AudioEncoder):
+    def __init__(self, model_name: str, device: str | None = None, **kwargs):
+        super().__init__(device=device, **kwargs)
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = Qwen2AudioForConditionalGeneration.from_pretrained(model_name)
+
+        self.audio_encoder = self.model.audio_tower
+
+        if hasattr(self.model.config.audio_config, "d_model"):
+            self.embed_dim = self.model.config.audio_config.d_model
+        elif hasattr(self.model.config.audio_config, "hidden_size"):
+            self.embed_dim = self.model.config.audio_config.hidden_size
+        else:
+            self.embed_dim = None
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = self.model.to(self.device)
+        self.audio_encoder = self.audio_encoder.to(self.device)
+        print("Qwen2-Audio initialized. Hidden dim:", self.embed_dim)
+
+    def get_audio_embeddings(self, audio_files: list[Audio] | Audio, batch_size: int = 32, **kwargs) -> np.ndarray:
+        if not isinstance(audio_files, list):
+            audio_files = [audio_files]
+        all_embeds = []
+        for i in range(0, len(audio_files), batch_size):
+            batch = audio_files[i:i + batch_size]
+            audios = [file["array"] for file in batch]
+            sr = batch[0]["sampling_rate"]
+
+            prompt = " ".join(["<|AUDIO|>"] * len(batch))
+            inputs = self.processor(
+                text=prompt,
+                audios=audios,
+                sampling_rate=sr,
+                return_tensors="pt",
+                padding=True,
+            )
+
+            input_features = inputs.input_features.to(self.device)
+            with torch.no_grad():
+                outputs = self.audio_encoder(input_features=input_features)
+
+            embeds = outputs.last_hidden_state.mean(dim=1)
+            print(embeds.shape)
+            all_embeds.append(embeds.cpu().numpy())
+
+        return np.vstack(all_embeds)
+
+    def encode(self, audio_files: list[Audio], *, task_name: str, prompt_type: PromptType | None = None, **kwargs) -> np.ndarray:
+        return self.get_audio_embeddings(audio_files, **kwargs)
+
+
+qwen2_audio_meta = ModelMeta(
+    loader=partial(Qwen2AudioWrapper, model_name="Qwen/Qwen2-Audio-7B"),
+    name="Qwen/Qwen2-Audio-7B",
+    languages=["multilingual"],
+    open_weights=True,
+    revision=None,
+    release_date="2024-08-09",
+    max_tokens=float("inf"),
+    n_parameters=7_000_000_000,
+    memory_usage_mb=None,
+    embed_dim=1280,
+    license="Unknown",
+    reference="https://huggingface.co/Qwen/Qwen2-Audio-7B",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
diff --git a/mteb/models/wav2vec_models.py b/mteb/models/wav2vec_models.py
index 10ce3cc1a9..8a9e3246da 100644
--- a/mteb/models/wav2vec_models.py
+++ b/mteb/models/wav2vec_models.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 from functools import partial
-
+from mteb.models.wrapper import Wrapper
+from mteb.encoder_interface import PromptType, AudioEncoder
 import numpy as np
 import torch
 from datasets import Audio
@@ -10,7 +11,6 @@
 from mteb.encoder_interface import AudioEncoder, PromptType
 from mteb.model_meta import ModelMeta
 
-
 class Wav2vec2Wrapper(AudioEncoder):
@@ -48,10 +48,11 @@ def get_audio_embeddings(
             audio_data,
             sampling_rate=sampling_rates[0],
             padding=True,
-            return_tensors="pt",
+
+            return_tensors="pt"
         )
 
-        if self.device:
+        if hasattr(self, "device") and self.device:
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
         # Get embeddings
@@ -63,6 +64,7 @@ def get_audio_embeddings(
             )
 
         hidden_states = outputs.hidden_states[-1]
+
         batch_embeddings = hidden_states.mean(dim=1).cpu().numpy()
         all_embeddings.append(batch_embeddings)
@@ -88,6 +90,7 @@ def encode(
     ),
     name="facebook/wav2vec2-base",
    languages=["en"],
+    open_weights=True,
     revision="0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8",
     release_date="2020-10-26",
@@ -115,6 +118,7 @@ def encode(
     ),
     name="facebook/wav2vec2-base-960h",
     languages=["en"],
+    open_weights=True,
     revision="22aad52d435eb6dbaf354bdad9b0da84ce7d6156",
     release_date="2020-10-26",
@@ -130,6 +134,7 @@ def encode(
     public_training_code=None,
     public_training_data=None,
     training_datasets=None,
+    modalities=["audio"],
 )
@@ -142,6 +147,7 @@ def encode(
     ),
     name="facebook/wav2vec2-large",
     languages=["en"],
+    open_weights=True,
     revision="312b2410566b698c7a649068d413b2067848bd75",
     release_date="2020-10-26",
@@ -157,6 +163,7 @@ def encode(
     public_training_code=None,
     public_training_data=None,
     training_datasets=None,
+    modalities=["audio"],
 )
@@ -169,6 +176,7 @@ def encode(
     ),
     name="facebook/wav2vec2-large-xlsr-53",
     languages=["en"],
+    open_weights=True,
     revision="c3f9d884181a224a6ac87bf8885c84d1cff3384f",
     release_date="2020-10-26",
@@ -196,6 +204,7 @@ def encode(
     ),
     name="facebook/wav2vec2-lv-60-espeak-cv-ft",
     languages=["en"],
+    open_weights=True,
     revision="ae45363bf3413b374fecd9dc8bc1df0e24c3b7f4",
     release_date="2020-10-26",
@@ -213,3 +222,4 @@ def encode(
     training_datasets=None,
     modalities=["audio"],
 )
+
diff --git a/mteb/models/wavlm_models.py b/mteb/models/wavlm_models.py
new file mode 100644
index 0000000000..108b6fb582
--- /dev/null
+++ b/mteb/models/wavlm_models.py
@@ -0,0 +1,252 @@
+from functools import partial
+from mteb.models.wrapper import Wrapper
+from mteb.encoder_interface import PromptType, AudioEncoder
+import numpy as np
+import torch
+from transformers import WavLMModel, Wav2Vec2FeatureExtractor
+from mteb.model_meta import ModelMeta
+from datasets import Audio
+
+
+class WavlmWrapper(AudioEncoder):
+    def __init__(self,
+                 model_name: str,
+                 revision: str = "main",
+                 device: str | None = None,
+                 **kwargs
+                 ):
+        super().__init__(device=device, **kwargs)
+        self.model_name = model_name
+        self.model_revision = revision
+
+        self.model = WavLMModel.from_pretrained(
+            self.model_name,
+            revision=self.model_revision
+        )
+        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            self.model_name,
+            revision=self.model_revision
+        )
+        self.embed_dim = self.model.config.hidden_size
+
+        if device:
+            self.model = self.model.to(device)
+        print("WavLM initialized.")
+
+    def get_audio_embeddings(
+        self,
+        audio_files: list[Audio] | Audio,
+        batch_size: int = 32,
+        **kwargs
+    ) -> np.ndarray:
+
+        if not isinstance(audio_files, list):
+            audio_files = [audio_files]
+
+        all_embeddings = []
+
+        for i in range(0, len(audio_files), batch_size):
+            batch = audio_files[i:i + batch_size]
+
+            audio_data = [file["array"] for file in batch]
+            sampling_rates = [file["sampling_rate"] for file in batch]
+
+            # Preprocess batch
+            inputs = self.feature_extractor(
+                audio_data,
+                sampling_rate=sampling_rates[0],
+                padding=True,
+                return_tensors="pt"
+            )
+
+            if hasattr(self, "device") and self.device:
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            # Get embeddings
+            with torch.no_grad():
+                outputs = self.model(
+                    input_values=inputs["input_values"],
+                    output_hidden_states=True,
+                    return_dict=True
+                )
+
+            hidden_states = outputs.hidden_states[-1]
+            batch_embeddings = hidden_states.mean(dim=1).cpu().numpy()
+            all_embeddings.append(batch_embeddings)
+
+        return np.vstack(all_embeddings)
+
+    def encode(
+        self,
+        audio_files: list[Audio],
+        *,
+        task_name: str,
+        prompt_type: PromptType | None = None,
+        **kwargs
+    ) -> np.ndarray:
+
+        return self.get_audio_embeddings(audio_files, **kwargs)
+
+
+wavlm_base = ModelMeta(
+    loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base"),
+    name="microsoft/wavlm-base",
+    languages=["eng"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-07-19",
+    max_tokens=float("inf"),
+    n_parameters=94_700_000,
+    memory_usage_mb=361,
+    embed_dim=768,
+    license="MIT",
+    reference="https://huggingface.co/microsoft/wavlm-base",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+wavlm_base_sd = ModelMeta(
+    loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-sd"),
+    name="microsoft/wavlm-base-sd",
+    languages=["eng"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-07-19",
+    max_tokens=float("inf"),
+    n_parameters=94_700_000,
+    memory_usage_mb=361,
+    embed_dim=768,
+    license="MIT",
+    reference="https://huggingface.co/microsoft/wavlm-base-sd",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+# print(f"wavlm_base: {wavlm_base.calculate_memory_usage_mb()}")
+
+wavlm_base_plus = ModelMeta(
+    loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-plus"),
+    name="microsoft/wavlm-base-plus",
+    languages=["eng"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-07-19",
+    max_tokens=float("inf"),
+    n_parameters=94_700_000,
+    memory_usage_mb=361,
+    embed_dim=768,
+    license="MIT",
+    reference="https://huggingface.co/microsoft/wavlm-base-plus",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+# print(f"wavlm_base_plus: {wavlm_base_plus.calculate_memory_usage_mb()}")
+
+wavlm_base_plus_sv = ModelMeta(
+    loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-plus-sv"),
+    name="microsoft/wavlm-base-plus-sv",
+    languages=["eng"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-07-19",  # estimate
+    max_tokens=float("inf"),
+    n_parameters=94_700_000,
+    memory_usage_mb=361,
+    embed_dim=768,
+    license="MIT",
+    reference="https://huggingface.co/microsoft/wavlm-base-plus-sv",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+wavlm_base_plus_sd = ModelMeta(
+    loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-plus-sd"),
+    name="microsoft/wavlm-base-plus-sd",
+    languages=["eng"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-07-19",  # estimate
+    max_tokens=float("inf"),
+    n_parameters=94_700_000,
+    memory_usage_mb=361,
+    embed_dim=768,
+    license="MIT",
+    reference="https://huggingface.co/microsoft/wavlm-base-plus-sd",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+# print(f"wavlm_base_plus_sv: {wavlm_base_plus_sv.calculate_memory_usage_mb()}")
+
+wavlm_base_sv = ModelMeta(
+    loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-sv"),
+    name="microsoft/wavlm-base-sv",
+    languages=["eng"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-07-19",  # estimate
+    max_tokens=float("inf"),
+    n_parameters=94_700_000,
+    memory_usage_mb=361,
+    embed_dim=768,
+    license="MIT",
+    reference="https://huggingface.co/microsoft/wavlm-base-sv",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+# print(f"wavlm_base_sv: {wavlm_base_sv.calculate_memory_usage_mb()}")
+
+wavlm_large = ModelMeta(
+    loader=partial(WavlmWrapper, model_name="microsoft/wavlm-large"),
+    name="microsoft/wavlm-large",
+    languages=["eng"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-07-19",  # estimate
+    max_tokens=float("inf"),
+    n_parameters=316_620_000,
+    memory_usage_mb=1208,
+    embed_dim=1024,
+    license="MIT",
+    reference="https://huggingface.co/microsoft/wavlm-large",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+# print(f"wavlm_large: {wavlm_large.calculate_memory_usage_mb()}")
diff --git a/mteb/models/whisper_models.py b/mteb/models/whisper_models.py
new file mode 100644
index 0000000000..3baeefa327
--- /dev/null
+++ b/mteb/models/whisper_models.py
@@ -0,0 +1,188 @@
+from functools import partial
+from mteb.models.wrapper import Wrapper
+from mteb.encoder_interface import PromptType, AudioEncoder
+import numpy as np
+import torch
+from transformers import WhisperModel, WhisperProcessor
+from mteb.model_meta import ModelMeta
+from datasets import Audio
+
+
+class WhisperWrapper(AudioEncoder):
+    def __init__(self,
+                 model_name: str,
+                 revision: str = "main",
+                 device: str | None = None,
+                 **kwargs):
+        super().__init__(device=device, **kwargs)
+        self.model_name = model_name
+        self.model_revision = revision
+
+        self.model = WhisperModel.from_pretrained(self.model_name, revision=self.model_revision)
+        self.feature_extractor = WhisperProcessor.from_pretrained(self.model_name, revision=self.model_revision)
+        self.embed_dim = self.model.config.d_model
+
+        if device:
+            self.model = self.model.to(device)
+        print("Whisper model initialized.")
+
+    def get_audio_embeddings(self,
+                             audio_files: list[Audio] | Audio,
+                             batch_size: int = 32,
+                             **kwargs) -> np.ndarray:
+        if not isinstance(audio_files, list):
+            audio_files = [audio_files]
+
+        all_embeddings = []
+        for i in range(0, len(audio_files), batch_size):
+            batch = audio_files[i:i + batch_size]
+            audio_data = [file["array"] for file in batch]
+            sampling_rates = [file["sampling_rate"] for file in batch]
+
+            # converts raw waveform to log-Mel spectrograms
+            inputs = self.feature_extractor(
+                audio_data,
+                sampling_rate=sampling_rates[0],
+                return_tensors="pt",
+                padding="max_length",  # force padding to a fixed raw sample length
+                max_length=480000  # 30 seconds * 16000 Hz => 480000 samples -> 480000/160 = 3000 mel frames (whisper expects 3000 frames)
+            )
+
+            if hasattr(self, "device") and self.device:
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            with torch.no_grad():
+                encoder_outputs = self.model.encoder(inputs["input_features"])
+
+            embeddings = encoder_outputs.last_hidden_state
+            batch_embeddings = embeddings.mean(dim=1).cpu().numpy()
+            print(batch_embeddings.shape)
+            all_embeddings.append(batch_embeddings)
+
+        return np.vstack(all_embeddings)
+
+    def encode(self,
+               audio_files: list[Audio],
+               *,
+               task_name: str,
+               prompt_type: PromptType | None = None,
+               **kwargs) -> np.ndarray:
+        return self.get_audio_embeddings(audio_files, **kwargs)
+
+
+whisper_tiny = ModelMeta(
+    loader=partial(WhisperWrapper, model_name="openai/whisper-tiny"),
+    name="openai/whisper-tiny",
+    languages=["eng", "multilingual"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-09-27",
+    max_tokens=float("inf"),
+    n_parameters=39_000_000,
+    memory_usage_mb=144,
+    embed_dim=512,
+    license="MIT",
+    reference="https://huggingface.co/openai/whisper-tiny",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+whisper_base = ModelMeta(
+    loader=partial(WhisperWrapper, model_name="openai/whisper-base"),
+    name="openai/whisper-base",
+    languages=["eng", "multilingual"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-09-27",
+    max_tokens=float("inf"),
+    n_parameters=74_000_000,
+    memory_usage_mb=277,
+    embed_dim=512,
+    license="MIT",
+    reference="https://huggingface.co/openai/whisper-base",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+whisper_small = ModelMeta(
+    loader=partial(WhisperWrapper, model_name="openai/whisper-small"),
+    name="openai/whisper-small",
+    languages=["eng", "multilingual"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-09-27",
+    max_tokens=float("inf"),
+    n_parameters=244_000_000,
+    memory_usage_mb=922,
+    embed_dim=768,
+    license="MIT",
+    reference="https://huggingface.co/openai/whisper-small",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+whisper_medium = ModelMeta(
+    loader=partial(WhisperWrapper, model_name="openai/whisper-medium"),
+    name="openai/whisper-medium",
+    languages=["eng", "multilingual"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-09-27",
+    max_tokens=float("inf"),
+    n_parameters=769_000_000,
+    memory_usage_mb=2914,
+    embed_dim=1024,
+    license="MIT",
+    reference="https://huggingface.co/openai/whisper-medium",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+whisper_large_v3 = ModelMeta(
+    loader=partial(WhisperWrapper, model_name="openai/whisper-large-v3"),
+    name="openai/whisper-large-v3",
+    languages=["multilingual"],
+    open_weights=True,
+    revision="main",
+    release_date="2022-09-27",
+    max_tokens=float("inf"),
+    n_parameters=1_550_000_000,
+    memory_usage_mb=5887,
+    embed_dim=1280,
+    license="MIT",
+    reference="https://huggingface.co/openai/whisper-large-v3",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    modalities=["audio"],
+)
+
+# print(f"whisper_tiny: {whisper_tiny.calculate_memory_usage_mb()}")
+# print(f"whisper_base: {whisper_base.calculate_memory_usage_mb()}")
+# print(f"whisper_small: {whisper_small.calculate_memory_usage_mb()}")
+# print(f"whisper_medium: {whisper_medium.calculate_memory_usage_mb()}")
+# print(f"whisper_large_v3: {whisper_large_v3.calculate_memory_usage_mb()}")
diff --git a/mteb/tasks/Audio/Clustering/eng/VoiceEmotions.py b/mteb/tasks/Audio/Clustering/eng/VoiceEmotions.py
new file mode 100644
index 0000000000..5377672a04
--- /dev/null
+++ b/mteb/tasks/Audio/Clustering/eng/VoiceEmotions.py
@@ -0,0 +1,81 @@
+from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+import random
+import datasets
+import mteb
+from mteb import MTEB
+
+
+class CREMADEmotionClustering(AbsTaskAudioClustering):
+    label_column_name: str = "label"
+
+    metadata = TaskMetadata(
+        name="CREMADEmotionClustering",
+        description="Clustering audio recordings based on expressed emotions from the CREMA-D dataset.",
+        reference="https://huggingface.co/datasets/AbstractTTS/CREMA-D",
+        dataset={
+            "path": "AbstractTTS/CREMA-D",
+            "revision": "main",
+        },
+        type="AudioClustering",
+        category="a2a",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="nmi",
+        date=("2014-01-01", "2024-12-31"),
+        domains=["Spoken"],
+        task_subtypes=["Voice Emotion Clustering"],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        modalities=["audio"],
+    )
+
+    def dataset_transform(self):
+        EMOTION_MAP = {"anger": 0, "happy": 1, "neutral": 2, "sad": 3, "fear": 4, "disgust": 5}
+        splits = self.metadata.eval_splits
+        ds = {}
+        for split in splits:
+            ds_split = self.dataset[split]
+            audio = ds_split["audio"]
+            labels = ds_split["major_emotion"]
+            audio = [{"array": item["array"], "sampling_rate": item["sampling_rate"]} for item in audio]
+            labels = [EMOTION_MAP.get(str(label).lower().strip(), -1) for label in labels]
+            rng = random.Random(1111)
+            data_pairs = list(zip(audio, labels))
+            rng.shuffle(data_pairs)
+            audio, labels = zip(*data_pairs)
+
+            batch_size = 512
+            audio_batched = [audio[i:i + batch_size] for i in range(0, len(audio), batch_size)]
+            labels_batched = [labels[i:i + batch_size] for i in range(0, len(labels), batch_size)]
+
+            audio_batched = audio_batched[:4]
+            labels_batched = labels_batched[:4]
+
+            audio_batched = [item for batch in audio_batched for item in batch]
+            labels_batched = [item for batch in labels_batched for item in batch]
+
+            ds[split] = datasets.Dataset.from_dict({
+                "audio": audio_batched,
+                "label": labels_batched,
+            })
+
+        self.dataset = datasets.DatasetDict(ds)
+
+
+if __name__ == "__main__":
+    # model_name = "microsoft/wavlm-base"
+    model_name = "facebook/wav2vec2-base"
+    model = mteb.get_model(model_name)
+    print(f"Loaded model type: {type(model)}")
+    evaluation = mteb.MTEB(tasks=[CREMADEmotionClustering()])
+    cluster_algo = "KMeans"
+    results = evaluation.run(model, output_folder=f"results_Emotions/{cluster_algo}/{model_name}", overwrite_results=True, cluster_algo=cluster_algo, limit=224)
+    print(results)
diff --git a/mteb/tasks/Audio/Clustering/eng/VoiceGender.py b/mteb/tasks/Audio/Clustering/eng/VoiceGender.py
index b24cb3259a..ce65b0ba3a 100644
--- a/mteb/tasks/Audio/Clustering/eng/VoiceGender.py
+++ b/mteb/tasks/Audio/Clustering/eng/VoiceGender.py
@@ -2,6 +2,8 @@
 
 from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
 from mteb.abstasks.TaskMetadata import TaskMetadata
+import mteb
+from mteb import MTEB
 
 
 class VoiceGenderClustering(AbsTaskAudioClustering):
@@ -13,6 +15,7 @@ class VoiceGenderClustering(AbsTaskAudioClustering):
         dataset={
             "path": "mmn3690/voice-gender-clustering",
             "revision": "1b202ea7bcd0abd5283e628248803e1569257c80",
+
         },
         type="AudioClustering",
         category="a2a",
@@ -34,3 +37,17 @@ class VoiceGenderClustering(AbsTaskAudioClustering):
         year = "2018
         }""",
     )
+
+if __name__ == "__main__":
+    # model_name = "microsoft/wavlm-base"
+    model_name = "facebook/wav2vec2-base"
+    model = mteb.get_model(model_name)
+    print(f"Loaded model type: {type(model)}")
+    evaluation = mteb.MTEB(tasks=[VoiceGenderClustering()])
+    cluster_algo = "KMeans"
+    results = evaluation.run(model, output_folder=f"results_Gender/{cluster_algo}/{model_name}", overwrite_results=True, cluster_algo=cluster_algo)
+    print(results)
+
+    # from datasets import load_dataset
+    # dataset = load_dataset("mmn3690/voice-gender-clustering", split="train")
+    # print(dataset["label"])
diff --git a/run.py b/run.py
new file mode 100644
index 0000000000..048132d326
--- /dev/null
+++ b/run.py
@@ -0,0 +1,12 @@
+import mteb
+from mteb.tasks.Audio.Clustering.eng.VoiceGender import VoiceGenderClustering
+from mteb.tasks.Audio.Clustering.eng.VoiceEmotions import CREMADEmotionClustering
+
+# model_name = "microsoft/wavlm-base"
+model_name = "Qwen/Qwen2-Audio-7B"
+model = mteb.get_model(model_name)
+print(f"Loaded model type: {type(model)}")
+evaluation = mteb.MTEB(tasks=[CREMADEmotionClustering()])
+cluster_algo = "KMeans"
+results = evaluation.run(model, output_folder=f"results_Emotions/{cluster_algo}/{model_name}", overwrite_results=True, cluster_algo=cluster_algo, limit=224)
+print(results)