diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
index fa6978e171..b425985041 100644
--- a/mteb/abstasks/TaskMetadata.py
+++ b/mteb/abstasks/TaskMetadata.py
@@ -60,7 +60,6 @@
     "Tumor detection",
     "Duplicate Detection",
     "Gender Clustering",
-    "Voice Emotion Clustering",
 ]
 
 TASK_DOMAIN = Literal[
@@ -128,7 +127,6 @@
 
 TASK_CATEGORY = Literal[
-    "a2a",  # Audio-to-audio
     "s2s",  # Sentence-to-sentence
     "s2p",  # Sentence-to-paragraph
     "p2p",  # Paragraph-to-paragraph
diff --git a/mteb/evaluation/evaluators/Audio/ClusteringEvaluator.py b/mteb/evaluation/evaluators/Audio/ClusteringEvaluator.py
index 1d7b1fad69..0d244fc970 100644
--- a/mteb/evaluation/evaluators/Audio/ClusteringEvaluator.py
+++ b/mteb/evaluation/evaluators/Audio/ClusteringEvaluator.py
@@ -3,20 +3,18 @@
 import logging
 from typing import Any
 
-import numpy as np
 import sklearn
 import sklearn.cluster
 from datasets import Audio
 from scipy.optimize import linear_sum_assignment
 from sklearn import metrics
-import random
-from sklearn.decomposition import PCA
 
 from mteb.encoder_interface import Encoder
 from mteb.evaluation.evaluators.Evaluator import Evaluator
 
 logger = logging.getLogger(__name__)
 
+
 class AudioClusteringEvaluator(Evaluator):
     def __init__(
         self,
@@ -25,41 +23,16 @@ def __init__(
         task_name: str | None = None,
         clustering_batch_size: int = 500,
         limit: int | None = None,
-        cluster_algo: str = "KMeans",
         **kwargs,
     ):
         super().__init__(**kwargs)
         if limit is not None:
             audio = audio[:limit]
             labels = labels[:limit]
-
-        random.seed(42)
-        combined = list(zip(audio, labels))
-        random.shuffle(combined)
-        audio, labels = map(list, zip(*combined))
-
         self.audio = audio
         self.labels = labels
         self.clustering_batch_size = clustering_batch_size
         self.task_name = task_name
-        self.cluster_algo = cluster_algo
-
-    def __clustering__(self):
-        if self.cluster_algo == "Kmeans":
-            logger.info("Fitting Mini-Batch K-Means model...")
-            clustering_model = sklearn.cluster.MiniBatchKMeans(
-                n_clusters=len(set(self.labels)),
-                batch_size=self.clustering_batch_size,
-                n_init="auto",
-            )
-        elif self.cluster_algo == "DBSCAN":
-            # need to plot out the distribution of the embeddings to decide on parameters for DBSCAN
-            logger.info("Fitting DBSCAN model...")
-            clustering_model = sklearn.cluster.DBSCAN(eps=0.5, min_samples=5, metric="euclidean")
-        elif self.cluster_algo == "Agg":
-            logger.info("Fitting Agglomerative model...")
-            clustering_model = sklearn.cluster.AgglomerativeClustering(n_clusters=len(set(self.labels)),linkage='average', metric='cosine')
-        return clustering_model
 
     def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
         if "batch_size" not in encode_kwargs:
@@ -71,13 +44,13 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
         )
 
         logger.info("Fitting Mini-Batch K-Means model...")
-
-        pca = PCA(n_components=200)
-        audio_embeddings = pca.fit_transform(audio_embeddings)
-
-        clustering_output = self.__clustering__()
-        clustering_output.fit(audio_embeddings)
-        cluster_assignment = clustering_output.labels_
+        clustering_model = sklearn.cluster.MiniBatchKMeans(
+            n_clusters=len(set(self.labels)),
+            batch_size=self.clustering_batch_size,
+            n_init="auto",
+        )
+        clustering_model.fit(audio_embeddings)
+        cluster_assignment = clustering_model.labels_
 
         logger.info("Evaluating...")
         v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment)
@@ -88,8 +61,6 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
 
         matrix = metrics.confusion_matrix(self.labels, cluster_assignment)
-        silhouette = float(metrics.silhouette_score(audio_embeddings, cluster_assignment, metric='euclidean'))
-        print(self.cluster_algo)
 
         # get linear sum assignment
         row_ind, col_ind = linear_sum_assignment(matrix, maximize=True)
         total_correct = matrix[row_ind, col_ind].sum()
@@ -100,5 +71,4 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
             "nmi": nmi,
             "ari": ari,
             "cluster_accuracy": clustering_accuracy,
-            "silhouette": silhouette,
-        }
\ No newline at end of file
+        }
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index c9979e78c7..05278fa664 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -72,9 +72,6 @@
     voyage_models,
     voyage_v,
     wav2vec_models,
-    wavlm_models,
-    whisper_models,
-    qwen_models
 )
 
 logger = logging.getLogger(__name__)
@@ -141,9 +138,6 @@
     voyage_models,
     fa_models,
     wav2vec_models,
-    wavlm_models,
-    whisper_models,
-    qwen_models
 ]
 
 MODEL_REGISTRY = {}
diff --git a/mteb/models/qwen_models.py b/mteb/models/qwen_models.py
deleted file mode 100644
index 0355996f71..0000000000
--- a/mteb/models/qwen_models.py
+++ /dev/null
@@ -1,81 +0,0 @@
-from functools import partial
-from mteb.models.wrapper import Wrapper
-from mteb.encoder_interface import PromptType, AudioEncoder
-import numpy as np
-import torch
-import librosa
-from transformers import AutoFeatureExtractor, Qwen2AudioForConditionalGeneration, AutoProcessor
-from mteb.model_meta import ModelMeta
-from datasets import Audio
-
-class Qwen2AudioWrapper(AudioEncoder):
-    def __init__(self, model_name: str, device: str | None = None, **kwargs):
-        super().__init__(device=device, **kwargs)
-        self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B")
-        self.model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B")
-
-        self.audio_encoder = self.model.audio_tower
-
-        if hasattr(self.model.config.audio_config, "d_model"):
-            self.embed_dim = self.model.config.audio_config.d_model
-        elif hasattr(self.model.config.audio_config, "hidden_size"):
-            self.embed_dim = self.model.config.audio_config.hidden_size
-        else:
-            self.embed_dim = None
-        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = self.model.to(self.device)
-        self.audio_encoder = self.audio_encoder.to(self.device)
-        print("Qwen2-Audio initialized. Hiden dim:", self.embed_dim)
Hiden dim:", self.embed_dim) - - def get_audio_embeddings(self, audio_files: list[Audio] | Audio, batch_size: int = 32, **kwargs) -> np.ndarray: - if not isinstance(audio_files, list): - audio_files = [audio_files] - all_embeds = [] - for i in range(0, len(audio_files), batch_size): - batch = audio_files[i:i + batch_size] - audios = [file['array'] for file in batch] - sr = batch[0]['sampling_rate'] - - prompt = " ".join(["<|AUDIO|>"] * len(batch)) - inputs = self.processor(text=prompt, - audios=audios, - sampling_rate=sr, - return_tensors="pt", - padding=True - ) - - input_features = inputs.input_features.to(self.device) - with torch.no_grad(): - outputs = self.audio_encoder(input_features=input_features) - - embeds = outputs.last_hidden_state.mean(dim=1) - print(embeds.shape) - all_embeds.append(embeds.cpu().numpy()) - - return np.vstack(all_embeds) - - def encode(self, audio_files: list[Audio], *, task_name: str, prompt_type: PromptType | None = None, **kwargs) -> np.ndarray: - return self.get_audio_embeddings(audio_files, **kwargs) - - -qwen2_audio_meta = ModelMeta( - loader=partial(Qwen2AudioWrapper, model_name="Qwen/Qwen2-Audio-7B"), - name="Qwen/Qwen2-Audio-7B", - languages=["multilingual"], - open_weights=True, - revision=None, - release_date="2024-08-09", - max_tokens=float("inf"), - n_parameters=7_000_000_000, - memory_usage_mb=None, - embed_dim=1280, - license="Unknown", - reference="https://huggingface.co/Qwen/Qwen2-Audio-7B", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=True, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) diff --git a/mteb/models/wav2vec_models.py b/mteb/models/wav2vec_models.py index 8a9e3246da..10ce3cc1a9 100644 --- a/mteb/models/wav2vec_models.py +++ b/mteb/models/wav2vec_models.py @@ -1,8 +1,7 @@ from __future__ import annotations from functools import partial -from mteb.models.wrapper import Wrapper -from mteb.encoder_interface import PromptType, AudioEncoder + import numpy as np import torch from datasets import Audio @@ -11,6 +10,7 @@ from mteb.encoder_interface import AudioEncoder, PromptType from mteb.model_meta import ModelMeta + class Wav2vec2Wrapper(AudioEncoder): def __init__( self, @@ -48,11 +48,10 @@ def get_audio_embeddings( audio_data, sampling_rate=sampling_rates[0], padding=True, - - return_tensors="pt" + return_tensors="pt", ) - if hasattr(self, 'device') and self.device: + if self.device: inputs = {k: v.to(self.device) for k, v in inputs.items()} # Get embeddings @@ -64,7 +63,6 @@ def get_audio_embeddings( ) hidden_states = outputs.hidden_states[-1] - batch_embeddings = hidden_states.mean(dim=1).cpu().numpy() all_embeddings.append(batch_embeddings) @@ -90,7 +88,6 @@ def encode( ), name="facebook/wav2vec2-base", languages=["en"], - open_weights=True, revision="0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8", release_date="2020-10-26", @@ -118,7 +115,6 @@ def encode( ), name="facebook/wav2vec2-base-960h", languages=["en"], - open_weights=True, revision="22aad52d435eb6dbaf354bdad9b0da84ce7d6156", release_date="2020-10-26", @@ -134,7 +130,6 @@ def encode( public_training_code=None, public_training_data=None, training_datasets=None, - modalities=["audio"], ) @@ -147,7 +142,6 @@ def encode( ), name="facebook/wav2vec2-large", languages=["en"], - open_weights=True, revision="312b2410566b698c7a649068d413b2067848bd75", release_date="2020-10-26", @@ -163,7 +157,6 @@ def encode( public_training_code=None, public_training_data=None, training_datasets=None, - 
modalities=["audio"], ) @@ -176,7 +169,6 @@ def encode( ), name="facebook/wav2vec2-large-xlsr-53", languages=["en"], - open_weights=True, revision="c3f9d884181a224a6ac87bf8885c84d1cff3384f", release_date="2020-10-26", @@ -204,7 +196,6 @@ def encode( ), name="facebook/wav2vec2-lv-60-espeak-cv-ft", languages=["en"], - open_weights=True, revision="ae45363bf3413b374fecd9dc8bc1df0e24c3b7f4", release_date="2020-10-26", @@ -222,4 +213,3 @@ def encode( training_datasets=None, modalities=["audio"], ) - diff --git a/mteb/models/wavlm_models.py b/mteb/models/wavlm_models.py deleted file mode 100644 index 108b6fb582..0000000000 --- a/mteb/models/wavlm_models.py +++ /dev/null @@ -1,252 +0,0 @@ -from functools import partial -from mteb.models.wrapper import Wrapper -from mteb.encoder_interface import PromptType, AudioEncoder -import numpy as np -import torch -from transformers import WavLMModel, Wav2Vec2FeatureExtractor -from mteb.model_meta import ModelMeta -from datasets import Audio - -class WavlmWrapper(AudioEncoder): - def __init__(self, - model_name: str, - revision: str = "main", - device: str | None = None, - **kwargs - ): - super().__init__(device=device, **kwargs) - self.model_name = model_name - self.model_revision = revision - - self.model = WavLMModel.from_pretrained( - self.model_name, - revision=self.model_revision - ) - self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - self.model_name, - revision=self.model_revision - ) - self.embed_dim = self.model.config.hidden_size - - if device: - self.model = self.model.to(device) - print("WavLM initialized.") - - def get_audio_embeddings( - self, - audio_files: list[Audio] | Audio, - batch_size: int = 32, - **kwargs - ) -> np.ndarray: - - - if not isinstance(audio_files, list): - audio_files = [audio_files] - - all_embeddings = [] - - for i in range(0, len(audio_files), batch_size): - batch = audio_files[i:i + batch_size] - - audio_data = [file['array'] for file in batch] - sampling_rates = [file['sampling_rate'] for file in batch] - - # Preprocess batch - inputs = self.feature_extractor( - audio_data, - sampling_rate=sampling_rates[0], - padding=True, - return_tensors="pt" - ) - - if hasattr(self, 'device') and self.device: - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - # Get embeddings - with torch.no_grad(): - outputs = self.model( - input_values=inputs["input_values"], - output_hidden_states=True, - return_dict=True - ) - - hidden_states = outputs.hidden_states[-1] - batch_embeddings = hidden_states.mean(dim=1).cpu().numpy() - all_embeddings.append(batch_embeddings) - - return np.vstack(all_embeddings) - - def encode( - self, - audio_files: list[Audio], - *, - task_name: str, - prompt_type: PromptType | None = None, - **kwargs - ) -> np.ndarray: - - return self.get_audio_embeddings(audio_files, **kwargs) - - -wavlm_base = ModelMeta( - loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base"), - name="microsoft/wavlm-base", - languages=["eng"], - open_weights=True, - revision="main", - release_date="2022-07-19", - max_tokens=float("inf"), - n_parameters=94_700_000, - memory_usage_mb=361, - embed_dim=768, - license="MIT", - reference="https://huggingface.co/microsoft/wavlm-base", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -wavlm_base_sd = ModelMeta( - loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-sd"), - name="microsoft/wavlm-base-sd", - 
languages=["eng"], - open_weights=True, - revision="main", - release_date="2022-07-19", - max_tokens=float("inf"), - n_parameters=94_700_000, - memory_usage_mb=361, - embed_dim=768, - license="MIT", - reference="https://huggingface.co/microsoft/wavlm-base-sd", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) -# print(f"wavlm_base: {wavlm_base.calculate_memory_usage_mb()}") - -wavlm_base_plus = ModelMeta( - loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-plus"), - name="microsoft/wavlm-base-plus", - languages=["eng"], - open_weights=True, - revision="main", - release_date="2022-07-19", - max_tokens=float("inf"), - n_parameters=94_700_000, - memory_usage_mb=361, - embed_dim=768, - license="MIT", - reference="https://huggingface.co/microsoft/wavlm-base-plus", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -# print(f"wavlm_base_plus: {wavlm_base_plus.calculate_memory_usage_mb()}") - -wavlm_base_plus_sv = ModelMeta( - loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-plus-sv"), - name="microsoft/wavlm-base-plus-sv", - languages=["eng"], - open_weights=True, - revision="main", - release_date="2022-07-19", # estimate - max_tokens=float("inf"), - n_parameters=94_700_000, - memory_usage_mb=361, - embed_dim=768, - license="MIT", - reference="https://huggingface.co/microsoft/wavlm-base-plus-sv", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -wavlm_base_plus_sd = ModelMeta( - loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-plus-sd"), - name="microsoft/wavlm-base-plus-sd", - languages=["eng"], - open_weights=True, - revision="main", - release_date="2022-07-19", # estimate - max_tokens=float("inf"), - n_parameters=94_700_000, - memory_usage_mb=361, - embed_dim=768, - license="MIT", - reference="https://huggingface.co/microsoft/wavlm-base-plus-sd", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -# print(f"wavlm_base_plus_sv: {wavlm_base_plus_sv.calculate_memory_usage_mb()}") - -wavlm_base_sv = ModelMeta( - loader=partial(WavlmWrapper, model_name="microsoft/wavlm-base-sv"), - name="microsoft/wavlm-base-sv", - languages=["eng"], - open_weights=True, - revision="main", - release_date="2022-07-19", # estimate - max_tokens=float("inf"), - n_parameters=94_700_000, - memory_usage_mb=361, - embed_dim=768, - license="MIT", - reference="https://huggingface.co/microsoft/wavlm-base-sv", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -# print(f"wavlm_base_sv: {wavlm_base_sv.calculate_memory_usage_mb()}") - -wavlm_large = ModelMeta( - loader=partial(WavlmWrapper, model_name="microsoft/wavlm-large"), - name="microsoft/wavlm-large", - languages=["eng"], - open_weights=True, - revision="main", - release_date="2022-07-19", # estimate - max_tokens=float("inf"), - n_parameters=316_620_000, - memory_usage_mb=1208, - embed_dim=1024, - license="MIT", - 
reference="https://huggingface.co/microsoft/wavlm-large", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -# print(f"wavlm_large: {wavlm_large.calculate_memory_usage_mb()}") \ No newline at end of file diff --git a/mteb/models/whisper_models.py b/mteb/models/whisper_models.py deleted file mode 100644 index 3baeefa327..0000000000 --- a/mteb/models/whisper_models.py +++ /dev/null @@ -1,188 +0,0 @@ -from functools import partial -from mteb.models.wrapper import Wrapper -from mteb.encoder_interface import PromptType, AudioEncoder -import numpy as np -import torch -from transformers import WhisperModel, WhisperProcessor -from mteb.model_meta import ModelMeta -from datasets import Audio - -class WhisperWrapper(AudioEncoder): - def __init__(self, - model_name: str, - revision: str = "main", - device: str | None = None, - **kwargs): - super().__init__(device=device, **kwargs) - self.model_name = model_name - self.model_revision = revision - - self.model = WhisperModel.from_pretrained(self.model_name, revision=self.model_revision) - self.feature_extractor = WhisperProcessor.from_pretrained(self.model_name, revision=self.model_revision) - self.embed_dim = self.model.config.d_model - - if device: - self.model = self.model.to(device) - print("Whisper model initialized.") - - def get_audio_embeddings(self, - audio_files: list[Audio] | Audio, - batch_size: int = 32, - **kwargs) -> np.ndarray: - if not isinstance(audio_files, list): - audio_files = [audio_files] - - all_embeddings = [] - for i in range(0, len(audio_files), batch_size): - batch = audio_files[i:i + batch_size] - audio_data = [file['array'] for file in batch] - sampling_rates = [file['sampling_rate'] for file in batch] - - # converts raw waveform to log-Mel spectrograms - inputs = self.feature_extractor( - audio_data, - sampling_rate=sampling_rates[0], - return_tensors="pt", - padding="max_length", # force padding to a fixed raw sample length - max_length=480000 # 30 seconds * 16000 Hz => 480000 samples -> 480000/160 = 3000 mel frames (whisper expects 3000 frames) - ) - - if hasattr(self, 'device') and self.device: - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - with torch.no_grad(): - encoder_outputs = self.model.encoder(inputs.input_features) - - embeddings = encoder_outputs.last_hidden_state - batch_embeddings = embeddings.mean(dim=1).cpu().numpy() - print(batch_embeddings.shape) - all_embeddings.append(batch_embeddings) - - return np.vstack(all_embeddings) - - def encode(self, - audio_files: list[Audio], - *, - task_name: str, - prompt_type: PromptType | None = None, - **kwargs) -> np.ndarray: - return self.get_audio_embeddings(audio_files, **kwargs) - - - - -whisper_tiny = ModelMeta( - loader=partial(WhisperWrapper, model_name="openai/whisper-tiny"), - name="openai/whisper-tiny", - languages=["eng", "multilingual"], - open_weights=True, - revision="main", - release_date="2022-09-27", - max_tokens=float("inf"), - n_parameters=39_000_000, - memory_usage_mb=144, - embed_dim=512, - license="MIT", - reference="https://huggingface.co/openai/whisper-tiny", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -whisper_base = ModelMeta( - loader=partial(WhisperWrapper, model_name="openai/whisper-base"), - name="openai/whisper-base", - 
languages=["eng", "multilingual"], - open_weights=True, - revision="main", - release_date="2022-09-27", - max_tokens=float("inf"), - n_parameters=74_000_000, - memory_usage_mb=277, - embed_dim=512, - license="MIT", - reference="https://huggingface.co/openai/whisper-base", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -whisper_small = ModelMeta( - loader=partial(WhisperWrapper, model_name="openai/whisper-small"), - name="openai/whisper-small", - languages=["eng", "multilingual"], - open_weights=True, - revision="main", - release_date="2022-09-27", - max_tokens=float("inf"), - n_parameters=244_000_000, - memory_usage_mb=922, - embed_dim=768, - license="MIT", - reference="https://huggingface.co/openai/whisper-small", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -whisper_medium = ModelMeta( - loader=partial(WhisperWrapper, model_name="openai/whisper-medium"), - name="openai/whisper-medium", - languages=["eng", "multilingual"], - open_weights=True, - revision="main", - release_date="2022-09-27", - max_tokens=float("inf"), - n_parameters=769_000_000, - memory_usage_mb=2914, - embed_dim=1024, - license="MIT", - reference="https://huggingface.co/openai/whisper-medium", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -whisper_large_v3 = ModelMeta( - loader=partial(WhisperWrapper, model_name="openai/whisper-large-v3"), - name="openai/whisper-large-v3", - languages=["multilingual"], - open_weights=True, - revision="main", - release_date="2022-09-27", - max_tokens=float("inf"), - n_parameters=1550_000_000, - memory_usage_mb=5887, - embed_dim=1280, - license="MIT", - reference="https://huggingface.co/openai/whisper-large-v3", - similarity_fn_name="cosine", - framework=["PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, - modalities=["audio"] -) - -# print(f"whisper_tiny: {whisper_tiny.calculate_memory_usage_mb()}") -# print(f"whisper_base: {whisper_base.calculate_memory_usage_mb()}") -# print(f"whisper_small: {whisper_small.calculate_memory_usage_mb()}") -# print(f"whisper_medium: {whisper_medium.calculate_memory_usage_mb()}") -# print(f"whisper_large_v3: {whisper_large.calculate_memory_usage_mb()}") diff --git a/mteb/tasks/Audio/Clustering/eng/VoiceEmotions.py b/mteb/tasks/Audio/Clustering/eng/VoiceEmotions.py deleted file mode 100644 index 5377672a04..0000000000 --- a/mteb/tasks/Audio/Clustering/eng/VoiceEmotions.py +++ /dev/null @@ -1,81 +0,0 @@ -from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering -from mteb.abstasks.TaskMetadata import TaskMetadata -import random -import datasets -import mteb -from mteb import MTEB - - -class CREMADEmotionClustering(AbsTaskAudioClustering): - label_column_name: str = "label" - - metadata = TaskMetadata( - name="CREMADEmotionClustering", - description="Clustering audio recordings based on expressed emotions from the CREMA-D dataset.", - reference="https://huggingface.co/datasets/AbstractTTS/CREMA-D", - dataset={ - "path": "AbstractTTS/CREMA-D", - "revision": "main", - }, - type="AudioClustering", - category="a2a", - 
eval_splits=["train"], - eval_langs=["eng-Latn"], - main_score="nmi", - date=("2014-01-01", "2024-12-31"), - domains=["Spoken"], - task_subtypes=["Voice Emotion Clustering"], - license="not specified", - annotations_creators="derived", - dialect=[], - modalities=["audio"], - ) - - def dataset_transform(self): - EMOTION_MAP = {"anger": 0,"happy": 1,"neutral": 2,"sad": 3,"fear": 4,"disgust": 5} - splits = self.metadata.eval_splits - ds = {} - for split in splits: - ds_split = self.dataset[split] - audio = ds_split["audio"] - labels = ds_split["major_emotion"] - audio = [{"array": item["array"], "sampling_rate": item["sampling_rate"]} for item in audio] - labels = [EMOTION_MAP.get(str(label).lower().strip(), -1) for label in labels] - rng = random.Random(1111) - data_pairs = list(zip(audio, labels)) - rng.shuffle(data_pairs) - audio, labels = zip(*data_pairs) - - batch_size = 512 - audio_batched = [audio[i:i + batch_size] for i in range(0, len(audio), batch_size)] - labels_batched = [labels[i:i + batch_size] for i in range(0, len(labels), batch_size)] - - audio_batched = audio_batched[:4] - labels_batched = labels_batched[:4] - - audio_batched = [item for batch in audio_batched for item in batch] - labels_batched = [item for batch in labels_batched for item in batch] - - ds[split] = datasets.Dataset.from_dict({ - "audio": audio_batched, - "label": labels_batched, - }) - - self.dataset = datasets.DatasetDict(ds) - - - -if __name__ == "__main__": - model_name = "microsoft/wavlm-base" - model_name = "facebook/wav2vec2-base" - model = mteb.get_model(model_name) - print(f"Loaded model type: {type(model)}") - evaluation = mteb.MTEB(tasks=[CREMADEmotionClustering()]) - cluster_algo = "Kmeans" - results = evaluation.run(model, output_folder=f"results_Emotions/{cluster_algo}/{model_name}", overwrite_results=True, cluster_algo=cluster_algo, limit=224) - print(results) - - - - - diff --git a/mteb/tasks/Audio/Clustering/eng/VoiceGender.py b/mteb/tasks/Audio/Clustering/eng/VoiceGender.py index ce65b0ba3a..b24cb3259a 100644 --- a/mteb/tasks/Audio/Clustering/eng/VoiceGender.py +++ b/mteb/tasks/Audio/Clustering/eng/VoiceGender.py @@ -2,8 +2,6 @@ from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering from mteb.abstasks.TaskMetadata import TaskMetadata -import mteb -from mteb import MTEB class VoiceGenderClustering(AbsTaskAudioClustering): @@ -15,7 +13,6 @@ class VoiceGenderClustering(AbsTaskAudioClustering): dataset={ "path": "mmn3690/voice-gender-clustering", "revision": "1b202ea7bcd0abd5283e628248803e1569257c80", - }, type="AudioClustering", category="a2a", @@ -37,17 +34,3 @@ class VoiceGenderClustering(AbsTaskAudioClustering): year = "2018 }""", ) - -if __name__ == "__main__": - #model_name = "microsoft/wavlm-base" - model_name = "facebook/wav2vec2-base" - model = mteb.get_model(model_name) - print(f"Loaded model type: {type(model)}") - evaluation = mteb.MTEB(tasks=[VoiceGenderClustering()]) - cluster_algo = "Kmeans" - results = evaluation.run(model, output_folder=f"results_Gender/{cluster_algo}/{model_name}", overwrite_results=True, cluster_algo=cluster_algo) - print(results) - - # from datasets import load_dataset - # dataset = load_dataset("mmn3690/voice-gender-clustering", split="train") - # print(dataset["label"]) diff --git a/run.py b/run.py deleted file mode 100644 index 048132d326..0000000000 --- a/run.py +++ /dev/null @@ -1,12 +0,0 @@ -import mteb -from mteb.tasks.Audio.Clustering.eng.VoiceGender import VoiceGenderClustering -from 
-
-# model_name = "microsoft/wavlm-base"
-model_name = "Qwen/Qwen2-Audio-7B"
-model = mteb.get_model(model_name)
-print(f"Loaded model type: {type(model)}")
-evaluation = mteb.MTEB(tasks=[CREMADEmotionClustering()])
-cluster_algo = "Kmeans"
-results = evaluation.run(model, output_folder=f"results_Emotions/{cluster_algo}/{model_name}", overwrite_results=True, cluster_algo=cluster_algo, limit=224)
-print(results)
\ No newline at end of file
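
Usage sketch (not part of the patch): with run.py deleted and the cluster_algo/limit options dropped from the evaluator, the remaining audio clustering task can be run with the standard MTEB entry points. This is a minimal sketch assuming mteb.get_model and MTEB.run keep the signatures used in the deleted scripts; the output folder name is illustrative.

    import mteb
    from mteb.tasks.Audio.Clustering.eng.VoiceGender import VoiceGenderClustering

    # Load an audio encoder that is still registered after this change.
    model = mteb.get_model("facebook/wav2vec2-base")

    # The evaluator now always uses Mini-Batch K-Means, so no cluster_algo kwarg is passed.
    evaluation = mteb.MTEB(tasks=[VoiceGenderClustering()])
    results = evaluation.run(model, output_folder="results/facebook__wav2vec2-base")
    print(results)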